├── Figure
│   ├── Normalize_data.png
│   ├── Check Stationary Time Series.png
│   ├── Find the lags of AR and etc models.png
│   └── Autoregressive and Automated Methods.png
├── Code
│   ├── Least_Squares.py
│   ├── AR_Model.py
│   ├── Linear_Regression.py
│   ├── Xgboost_Regression.py
│   ├── Tree_Decision_Regression.py
│   ├── ARIMA_Model.py
│   ├── Random_Forest_Regression.py
│   ├── ARX_Model.py
│   ├── Sequences_Data.py
│   ├── Normalize_Regression.py
│   ├── Normalize_Data.py
│   ├── Auto_Correlation.py
│   ├── Test_Stationary.py
│   ├── Plot_Models.py
│   └── Main.py
└── README.md

/Figure/Normalize_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RezaSaadatyar/Time-Series-Analysis-in-Python/HEAD/Figure/Normalize_data.png
--------------------------------------------------------------------------------
/Figure/Check Stationary Time Series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RezaSaadatyar/Time-Series-Analysis-in-Python/HEAD/Figure/Check Stationary Time Series.png
--------------------------------------------------------------------------------
/Figure/Find the lags of AR and etc models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RezaSaadatyar/Time-Series-Analysis-in-Python/HEAD/Figure/Find the lags of AR and etc models.png
--------------------------------------------------------------------------------
/Figure/Autoregressive and Automated Methods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RezaSaadatyar/Time-Series-Analysis-in-Python/HEAD/Figure/Autoregressive and Automated Methods.png
--------------------------------------------------------------------------------
/Code/Least_Squares.py:
--------------------------------------------------------------------------------
import numpy as np
from Plot_Models import plot_models


def least_squares(data, data_lags, train_size, axs, num_sample):
    # Ordinary least squares via the pseudo-inverse: alpha = (X^T X)^{-1} X^T y
    pinv = np.linalg.pinv(data_lags[:train_size])
    alpha = pinv.dot(data[:train_size])
    if alpha.ndim > 1:
        alpha = alpha.flatten()
    y_train_pred = np.sum(alpha * data_lags[:train_size], axis=1)
    y_test_pred = np.sum(alpha * data_lags[train_size:], axis=1)
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample, type_model='LS')
--------------------------------------------------------------------------------
/Code/AR_Model.py:
--------------------------------------------------------------------------------
import pandas as pd
from statsmodels.tsa.ar_model import AutoReg
from Plot_Models import plot_models


def ar_model(data, train_size, axs, n_lags, num_sample):
    mod = AutoReg(data[:train_size], lags=n_lags).fit()
    y_train_pred = pd.Series(mod.fittedvalues)  # In-sample (training) fit
    y_test_pred = pd.Series(mod.model.predict(mod.params, start=train_size, end=len(data) - 1))  # Forecast the future: end - start samples
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample=num_sample, type_model='AR')
--------------------------------------------------------------------------------
/Code/Linear_Regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn import linear_model
from Plot_Models import plot_models


def linear_regression(data, data_lags, train_size, axs, num_sample):
    mod = linear_model.LinearRegression()
    mod.fit(data_lags[:train_size], data[:train_size])
    y_train_pred = pd.Series(mod.predict(data_lags[:train_size]))
    y_test_pred = pd.Series(mod.predict(data_lags[train_size:]))
    y_test_pred.index = np.arange(data_lags[train_size:].index[0], data_lags[train_size:].index[-1] + 1, 1, dtype='int')  # Align predictions with the original index
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample=num_sample, type_model='LR')
--------------------------------------------------------------------------------
/Code/Xgboost_Regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xgboost as xgb
from Plot_Models import plot_models


def xgboost_regression(data, data_lags, train_size, axs, n_estimators, num_sample):
    mod = xgb.XGBRegressor(n_estimators=n_estimators)
    mod.fit(data_lags[:train_size], data[:train_size])
    y_train_pred = pd.Series(mod.predict(data_lags[:train_size]))
    y_test_pred = pd.Series(mod.predict(data_lags[train_size:]))
    y_test_pred.index = np.arange(data_lags[train_size:].index[0], data_lags[train_size:].index[-1] + 1, 1, dtype='int')
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample=num_sample, type_model='Xgboost')
--------------------------------------------------------------------------------
/Code/Tree_Decision_Regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn import tree
from Plot_Models import plot_models


def tree_decision_regression(data, data_lags, train_size, axs, max_depth, num_sample):
    mod = tree.DecisionTreeRegressor(max_depth=max_depth, random_state=0)
    mod.fit(data_lags[:train_size], data[:train_size])
    y_train_pred = pd.Series(mod.predict(data_lags[:train_size]))
    y_test_pred = pd.Series(mod.predict(data_lags[train_size:]))
    y_test_pred.index = np.arange(data_lags[train_size:].index[0], data_lags[train_size:].index[-1] + 1, 1, dtype='int')
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample=num_sample, type_model='DT')
--------------------------------------------------------------------------------
/Code/ARIMA_Model.py:
--------------------------------------------------------------------------------
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from Plot_Models import plot_models


def arima_model(data, train_size, axs, order, seasonal_order, num_sample):
    # Alternative automated order selection with pmdarima:
    # mod = pm.auto_arima(X[:train_size], start_p=5, start_q=1, seasonal=True, m=10, d=1, n_fits=50, information_criterion="bic", trace=True, stepwise=True, method='lbfgs')
    mod = SARIMAX(data[:train_size], order=order, seasonal_order=seasonal_order)
    mod = mod.fit(disp=False)
    y_train_pred = pd.Series(mod.fittedvalues)
    y_test_pred = mod.predict(start=train_size, end=len(data) - 1, dynamic=True, typ='levels')  # Predict N steps into the future
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample=num_sample, type_model='ARIMA')
--------------------------------------------------------------------------------
/Code/Random_Forest_Regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn import ensemble
from Plot_Models import plot_models


def random_forest_regression(data, data_lags, train_size, axs, n_estimators, max_features, num_sample):
    mod = ensemble.RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, random_state=0)
    mod.fit(data_lags[:train_size], data[:train_size])
    y_train_pred = pd.Series(mod.predict(data_lags[:train_size]))
    y_test_pred = pd.Series(mod.predict(data_lags[train_size:]))
    y_test_pred.index = np.arange(data_lags[train_size:].index[0], data_lags[train_size:].index[-1] + 1, 1, dtype='int')
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample=num_sample, type_model='RF')
--------------------------------------------------------------------------------
/Code/ARX_Model.py:
--------------------------------------------------------------------------------
import copy
import numpy as np
import pandas as pd
from Plot_Models import plot_models


def arx(data, data_lags, train_size, axs, mu, num_sample):
    nLags = data_lags.shape[1]
    data_lags1 = copy.copy(data_lags)
    alpha = np.zeros(nLags + 1)
    data_lags1.insert(0, "0", data, True)  # Prepend the current value as the exogenous input column
    data_lags1.columns = list(range(nLags + 1))
    y_train_pred = []
    for i in range(train_size):  # LMS-style update with step size mu: alpha <- alpha + mu * x * e
        e = data.iloc[i] - alpha.dot(data_lags1.iloc[i])
        alpha = alpha + mu * data_lags1.iloc[i] * e
        y_train_pred.append(alpha.dot(data_lags1.iloc[i]))
    y_train_pred = pd.DataFrame(y_train_pred)
    y_test_pred = np.sum(alpha * data_lags1[train_size:], axis=1)
    plot_models(data, y_train_pred, y_test_pred, axs, [], train_size, num_sample, type_model='ARX')
--------------------------------------------------------------------------------
/Code/Sequences_Data.py:
--------------------------------------------------------------------------------
import numpy as np


def sequences_data(dataset, seq_size=1):
    """
    Create a supervised dataset where x holds the values at a given time (t, t-1, ...) and
    y holds the value at the next time step (t+1).

    Args:
        dataset: 1-D array of observations.
        seq_size: Number of previous time steps to use as input variables to predict the
            next time period. Larger sequences (looking further back) may improve forecasting.
    Returns: Variables for training and testing an LSTM.
    """
    x = []
    y = []
    dataset = dataset.reshape(-1, 1)
    for k in range(len(dataset) - seq_size):
        window = dataset[k:(k + seq_size), 0]
        x.append(window)
        y.append(dataset[k + seq_size, 0])
    x = np.reshape(np.array(x), (np.array(x).shape[0], 1, np.array(x).shape[1]))  # Reshape input to [samples, time steps, features]
    return x, np.array(y)
--------------------------------------------------------------------------------
/Code/Normalize_Regression.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt


def normalize_regression(data, type_normalize, display_figure):
    data1 = data.values
    normalize_model = None  # Returned so the scaling can be inverted later; stays None for 'normalize'
    if data.ndim == 1:
        data1 = data1.reshape(-1, 1)

    if type_normalize == 'MinMaxScaler':
        normalize_model = preprocessing.MinMaxScaler(feature_range=(0, 1))
        normalize_model.fit(data1)
        normalized_data = normalize_model.transform(data1)  # (Data - min) / (max - min)
    elif type_normalize == 'normalize':
        normalized_data = preprocessing.normalize(data1, norm='l1', axis=0)  # norm: 'l1' or 'l2'
    if data.ndim == 1:
        normalized_data = pd.Series(normalized_data.ravel())
        label = 'Raw Signal'
    else:
        normalized_data = pd.DataFrame(normalized_data, columns=data.columns)
        label = data.columns
    if display_figure == 'on':
        plt.rcParams.update({'font.size': 11})
        plt.subplot(211)
        plt.plot(data, label=label), plt.legend(fontsize=14, ncol=2, frameon=False, loc='best', labelcolor='linecolor', handlelength=0)
        plt.subplot(212)
        plt.plot(normalized_data)
        plt.tight_layout(), plt.style.use('ggplot'), plt.show()
    return normalized_data, normalize_model
--------------------------------------------------------------------------------
/Code/Normalize_Data.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt


def normalize_data(Data, Type_Normalize, Display_Figure):
    data = Data.values
    normalize = None  # Returned so the scaling can be inverted later; stays None for 'normalize'
    if Data.ndim == 1:
        data = data.reshape(-1, 1)

    if Type_Normalize == 'MinMaxScaler':
        normalize = preprocessing.MinMaxScaler(feature_range=(0, 1))
        normalize.fit(data)
        normalized_data = normalize.transform(data)  # (Data - min) / (max - min)
    elif Type_Normalize == 'normalize':
        normalized_data = preprocessing.normalize(data, norm='l1', axis=0)  # norm: 'l1' or 'l2'
    if Data.ndim == 1:
        normalized_data = pd.Series(normalized_data.ravel())
    if Display_Figure == 'on':
        plt.rcParams.update({'font.size': 11})
        if Data.ndim == 1:
            plt.subplot(211)
            plt.plot(Data, label='Raw Data'), plt.legend()
            plt.subplot(212)
            plt.plot(normalized_data, label='Normalized Data')
        else:
            plt.subplot(121)
            plt.plot(data[:, 0], data[:, 1], '.', label='Raw Data'), plt.legend()
            plt.subplot(122)
            plt.plot(normalized_data[:, 0], normalized_data[:, 1], '.', label='Normalized Data')
        plt.legend(), plt.tight_layout(), plt.style.use('ggplot'), plt.show()
    return normalized_data, normalize
--------------------------------------------------------------------------------
/Code/Auto_Correlation.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf, pacf


def auto_correlation(data, nLags):
    """
    Args:
        data: array, usually 1 * N.
    Returns: plots of the raw data, its histogram, the ACF and the PACF (use the PACF plot to find the AR order p).

    Auto-correlation: the ACF can be used to identify trends in the data and the influence of previously observed
    values on a current observation. Sharp peaks indicate a strong correlation in the time series, whereas shorter
    peaks indicate little correlation.
    Lag: we can calculate the correlation of current time-series observations with observations of previous time
    steps, called lags; after lag q, the auto-correlation is no longer significant. In other words, instead of
    calculating the correlation between two different series, we calculate the correlation of the series with an
    x-unit lagged version of itself. This is also known as lagged correlation or serial correlation. The value of
    auto-correlation varies between +1 and -1. If the auto-correlation of a series is a very small value, that
    does not mean there is no correlation.
    """
    plt.figure(figsize=(12, 8))
    plt.rcParams.update({'font.size': 11})
    # plt.rcParams.update({'font.weight': 'bold'})
    ax1 = plt.subplot(211)
    ax1.plot(data), ax1.set_title('Data')

    ax2 = plt.subplot(234)
    _, bins, _ = ax2.hist(data, bins='auto', density=True, alpha=0.8)
    ax2.plot(bins, stats.norm.pdf(bins, np.mean(data), np.std(data)), linewidth=3), ax2.set_ylabel('Probability'), ax2.set_title('Histogram of data')

    ax3 = plt.subplot(235)
    acf_value, acf_interval, _, _ = acf(data, nlags=nLags, qstat=True, alpha=0.05, fft=False)
    time = np.arange(start=0, stop=acf_value.shape[0])
    _, _, baseline = plt.stem(time, acf_value, linefmt='b-.', markerfmt='bo', basefmt='r-')
    plt.setp(baseline, color='r', linewidth=0.5)
    plt.fill_between(x=time[1:], y1=acf_interval[1:, 0] - acf_value[1:], y2=acf_interval[1:, 1] - acf_value[1:], alpha=0.25, linewidth=0, color='red')

    ax4 = plt.subplot(236)
    pacf_value, pacf_interval = pacf(data, nlags=nLags, alpha=0.05)
    _, _, bas = plt.stem(time, pacf_value, linefmt='b-.', markerfmt='bo', basefmt='r-')
    plt.setp(bas, color='r', linewidth=0.50)
    plt.fill_between(x=time[1:], y1=pacf_interval[1:, 0] - pacf_value[1:], y2=pacf_interval[1:, 1] - pacf_value[1:], alpha=0.25, linewidth=0.5, color='red')
    ax3.set_ylabel('Correlation value'), ax3.set_xlabel('#Lag'), ax3.set_title('Autocorrelation'), ax4.set_xlabel('#Lag')
    ax4.set_title('Partial Autocorrelation'), plt.tight_layout(), plt.style.use('ggplot'), plt.savefig("squares.png"), plt.show()
--------------------------------------------------------------------------------
/Code/Test_Stationary.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.tsa import stattools
import matplotlib.pyplot as plt


def test_stationary(data, window):
    """
    Args:
        data: ndarray, often 1 * N.
        window: size of the moving window. If an integer, the fixed number of observations used for each window;
            if an offset, the time period of each window, so each window's size varies with the observations that
            fall inside the time period.
    Returns: the data converted to stationary data if it is non-stationary.

    Checking for a stationary time series:
    1) Rolling statistics: plot the moving average/variance and see whether it varies with time.
    2) Augmented Dickey-Fuller test: result[0] is the test statistic and result[1] the p-value. If the test
       statistic < critical value and p-value < 0.05, the time series is stationary.
    Stationary means the mean, variance and covariance are constant over periods, and the auto-covariance does
    not depend on time.

    Converting non-stationary data to a stationary dataset:
        Log transform: np.log(Data)
        Differencing by a simple moving average: MA = Data.rolling(window=window).mean()
                                                 Data = Data - MA
                                                 Data.dropna(inplace=True)
    """
    # ================================ Step 2: Check Stationary Time Series ========================================
    data1 = data
    sns.set(style='white')
    result = stattools.adfuller(data)  # Perform the Dickey-Fuller test
    if result[0] < result[4]["5%"]:
        fig, ax1 = plt.subplots(1, 1, sharey='row', figsize=(10, 6))
        plt.rcParams.update({'font.size': 11})
        ax1.set_title('Rolling Mean & Standard Deviation; ' + 'p-value:' + str(round(result[1], 3)) + '; Data is Stationary')
    else:
        fig, (ax1, ax2) = plt.subplots(2, 1, sharey='row', figsize=(10, 6))
        plt.rcParams.update({'font.size': 11})
        ax1.set_title('Rolling Mean & Standard Deviation; ' + 'p-value:' + str(round(result[1], 3)) + '; Data is Non-Stationary')
        data = data - data.rolling(window=window).mean()  # Alternative: X.diff(periods=1)
        data.dropna(inplace=True)
        data.index = (np.linspace(0, len(data), num=len(data), endpoint=False, dtype='int'))
        data = pd.Series(data)
        result = stattools.adfuller(data)  # Re-run the Dickey-Fuller test on the differenced data
        ax2.plot(data)
        ax2.plot(data.rolling(window=window).mean())  # Determine rolling statistics
        ax2.plot(data.rolling(window=window).std())
        ax2.set_title('Rolling Mean & Standard Deviation; ' + 'p-value:' + str(round(result[1], 3)) + '; Data is Stationary')
    output_result = pd.Series(result[0:4], index=['Test Statistic', 'p-value', '#lags used', 'number of observations used'])
    for key, value in result[4].items():
        output_result['critical value (%s)' % key] = value
    print(output_result)
    ax1.plot(data1, label='Data')
    ax1.plot(data1.rolling(window=window).mean(), label='Rolling Mean')  # Determine rolling statistics
    ax1.plot(data1.rolling(window=window).std(), label='Rolling Std')
    ax1.legend(loc='best'), plt.tight_layout(), plt.show()
    return data
--------------------------------------------------------------------------------
/Code/Plot_Models.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
# from math import sqrt


def plot_models(data, y_train_pred, y_test_pred, axs, n_lag, train_size, num_sample, type_model):
    plt.rcParams.update({'font.size': 18})

    if not isinstance(data, pd.DataFrame):  # Check the data type
        data = pd.DataFrame(data)

    # i, ii bound the zoomed window of num_sample samples around the train/test split
    if train_size - num_sample <= 0:
        i = 0
        iii = 0.16
        axs[0].set_xticks([])
    else:
        iii = 0.07
        i = train_size - num_sample
        axs[0].tick_params(axis='x', labelsize=16)

    if train_size + num_sample > len(data):
        ii = len(data) - 1
    else:
        ii = train_size + num_sample

    if type_model == 'Actual_Data':
        if data.shape[1] > 1:
            axs[0].plot(data, label=data.columns, linewidth=2.5)
        else:
            axs[0].plot(data, 'k', label="Original data", linewidth=2.5)

        axs[0].axvline(x=train_size, linewidth=4, color='r', ls='--'), axs[1].axvline(x=train_size, linewidth=4, color='r', ls='--')
        axs[2].axvline(x=train_size, linewidth=4, color='r', ls='--'), axs[3].axvline(x=train_size, linewidth=4, color='r', ls='--')
        axs[0].axvspan(i, ii, color='yellow', alpha=0.75), axs[1].plot(data.iloc[i:train_size + ii, 0], 'k', linewidth=4)
        axs[2].plot(data.iloc[i:train_size + ii, 0], 'k', linewidth=3.5), axs[3].plot(data.iloc[i:train_size + ii, 0], 'k', linewidth=3.5)

        axs[0].set_xlim(data.index[0], data.index[-1]), axs[1].set_xlim(data.index[i], data.index[ii]), axs[2].set_xlim(data.index[i], data.index[ii])
        axs[3].set_xlim(data.index[i], data.index[ii])

        axs[0].text(train_size / 2, np.max(np.max(data)) / 1.1, "Training set", ha="center", va="center", rotation=0, size=17,
                    bbox=dict(boxstyle="round,pad=0.6", fc="w", ec="b", lw=2.5))
        axs[0].text(train_size + ((len(data) - train_size) / 2), np.max(np.max(data)) / 1.1, "Test set", ha="center", va="center", rotation=0, size=17,
                    bbox=dict(boxstyle="round,pad=0.6", fc="w", ec="g", lw=2.5))
        axs[1].set_title('Linear regression models', loc='right', y=1 + iii, pad=-20, color='deeppink', fontsize=19, fontstyle='italic')
        axs[2].set_title('Machine learning models', loc='right', y=1 + 0.16, pad=-20, color='deeppink', fontsize=19, fontstyle='italic')
        axs[3].set_title('Deep learning models', loc='right', y=1 + 0.16, pad=-20, color='deeppink', fontsize=19, fontstyle='italic')
        axs[1].set_xticks([]), axs[2].set_xticks([]), axs[0].tick_params(axis='y', labelsize=16), axs[1].tick_params(axis='y', labelsize=16)
        axs[2].tick_params(axis='y', labelsize=16), axs[3].tick_params(axis='y', labelsize=16), axs[3].tick_params(axis='x', labelsize=16)
        axs[0].legend(fontsize=16, ncol=2, frameon=False, loc='best', labelcolor='linecolor', handlelength=0)

    elif type_model in ('LS', 'AR', 'ARX', 'ARIMA', 'AR+LS'):  # Linear/autoregressive models -> second panel
        sns.set(style='white')
        r2_tr = metrics.r2_score(data[train_size - len(y_train_pred):train_size], y_train_pred)
        r2_te = metrics.r2_score(data[-len(y_test_pred):], y_test_pred)
        # rmse_tr = sqrt(metrics.mean_squared_error(data[train_size - len(y_train_pred):train_size], y_train_pred))
        # rmse_test = sqrt(metrics.mean_squared_error(data[-len(y_test_pred):], y_test_pred))
        axs[1].plot(pd.concat([y_train_pred[i:], y_test_pred[:train_size + ii]]), label=type_model + "=" + "$R_{tr,te}^{2}$:" +
                    str(round(r2_tr, 2)) + "; " + str(round(r2_te, 2)), linestyle='--', linewidth=4, alpha=1)
        axs[1].legend(fontsize=16, ncol=2, loc='best', borderaxespad=0, frameon=False, labelcolor='linecolor', handlelength=0)

    elif type_model in ('LR', 'RF', 'DT', 'Xgboost'):  # Machine learning models -> third panel
        sns.set(style='white')
        r2_tr = metrics.r2_score(data[train_size - len(y_train_pred):train_size], y_train_pred)
        r2_te = metrics.r2_score(data[-len(y_test_pred):], y_test_pred)
        axs[2].plot(pd.concat([y_train_pred[i:], y_test_pred[:train_size + ii]]), label=type_model + "=" + "$R_{tr, te}^{2}$:" +
                    str(round(r2_tr, 2)) + "; " + str(round(r2_te, 2)), linestyle='--', linewidth=4, alpha=1)
        axs[2].legend(fontsize=16, ncol=2, loc='best', borderaxespad=0, frameon=False, labelcolor='linecolor', handlelength=0)

    elif type_model == 'LSTM':  # Deep learning models -> fourth panel
        data_train = data[:train_size]
        data_test = data[train_size:]
        r2_tr = metrics.r2_score(data_train[-len(y_train_pred):], y_train_pred)
        r2_te = metrics.r2_score(data_test[-len(y_test_pred):], y_test_pred)

        axs[3].plot(pd.concat([y_train_pred[i:], y_test_pred[:train_size + ii]]), label=type_model + "=" + r'$R_{tr, te}^{2}$:' +
                    str(round(r2_tr, 2)) + "; " + str(round(r2_te, 2)), linestyle='--', linewidth=4, alpha=1)
        axs[3].legend(fontsize=16, ncol=2, loc='best', borderaxespad=0, frameon=False, labelcolor='linecolor', handlelength=0)  # ncol must be an int, not the string '2'
--------------------------------------------------------------------------------
/Code/Main.py:
--------------------------------------------------------------------------------
# ==========================================================================
# ============================ Time series =================================
# ====================== Presented by: Reza Saadatyar ======================
# =================== E-mail: Reza.Saadatyar92@gmail.com ===================
# ============================ 2022-2023 ===================================
# The program runs automatically when you run Code/Main.py; you do not need to run any of the other files.
# ============================================= Import Libraries ========================================
import os
import numpy as np
import pandas as pd
import seaborn as sns
from ARX_Model import arx
import statsmodels.api as sm
from AR_Model import ar_model
import matplotlib.pyplot as plt
from ARIMA_Model import arima_model
from Plot_Models import plot_models
from Least_Squares import least_squares
from Normalize_Regression import normalize_regression
from Sequences_Data import sequences_data
from Test_Stationary import test_stationary
from Auto_Correlation import auto_correlation
from Linear_Regression import linear_regression
from Xgboost_Regression import xgboost_regression
from keras import models, layers
from Random_Forest_Regression import random_forest_regression
from Tree_Decision_Regression import tree_decision_regression
# ======================================== Step 1: Load Data ==================================================
os.system('cls')
data = sm.datasets.sunspots.load_pandas()  # df = pd.read_csv('monthly_milk_production.csv'), df.info(), X = df["Value"].values
data = data.data["SUNACTIVITY"]
# print('Shape of data \t', data.shape)
# print('Original Dataset:\n', data.head())
# print('Values:\n', data)
# ================================ Step 2.1: Normalize Data (0-1) ==============================================
# data, normalize_model = normalize_regression(data, type_normalize='MinMaxScaler', display_figure='on')  # type_normalize: 'MinMaxScaler', 'normalize'
# ================================ Step 2.2: Check Stationary Time Series ======================================
# data = test_stationary(data, window=20)
# ==================================== Step 3: Find the lags of AR and etc models ==============================
# auto_correlation(data, nLags=10)
# =========================== Step 4: Split Dataset into Train and Test ========================================
nLags = 3
num_sample = 300
mu = 0.000001

Data_Lags = pd.DataFrame(np.zeros((len(data), nLags)))
for i in range(0, nLags):
    Data_Lags[i] = data.shift(i + 1)
Data_Lags = Data_Lags[nLags:]
data = data[nLags:]
Data_Lags.index = np.arange(0, len(Data_Lags), 1, dtype=int)
data.index = np.arange(0, len(data), 1, dtype=int)
train_size = int(len(data) * 0.8)
# ================================= Step 5: Autoregressive and Automated Methods ===============================
sns.set(style='white')
fig, axs = plt.subplots(nrows=4, ncols=1, sharey='row', figsize=(16, 10))
plot_models(data, [], [], axs, nLags, train_size, num_sample=num_sample, type_model='Actual_Data')
# ------------------------------------------- Least Squares ---------------------------------------------------
least_squares(data, Data_Lags, train_size, axs, num_sample=num_sample)
# -------------------------------------------- Auto-Regressive (AR) model -------------------------------------
ar_model(data, train_size, axs, n_lags=nLags, num_sample=num_sample)
# ------------------------------------------------ ARX --------------------------------------------------------
arx(data, Data_Lags, train_size, axs, mu=mu, num_sample=num_sample)
# ----------------------------- Auto-Regressive Integrated Moving Averages (ARIMA) ----------------------------
arima_model(data, train_size, axs, order=(5, 1, (1, 1, 1, 1)), seasonal_order=(0, 0, 2, 12), num_sample=num_sample)
# ======================================= Step 5: Machine Learning Models ======================================
# ------------------------------------------- Linear Regression Model -----------------------------------------
linear_regression(data, Data_Lags, train_size, axs, num_sample=num_sample)
# ------------------------------------------ RandomForestRegressor Model --------------------------------------
random_forest_regression(data, Data_Lags, train_size, axs, n_estimators=100, max_features=nLags, num_sample=num_sample)
# -------------------------------------------- Decision Tree Model --------------------------------------------
tree_decision_regression(data, Data_Lags, train_size, axs, max_depth=2, num_sample=num_sample)
# ---------------------------------------------- xgboost ------------------------------------------------------
xgboost_regression(data, Data_Lags, train_size, axs, n_estimators=1000, num_sample=num_sample)
# ----------------------------------------------- LSTM model --------------------------------------------------
train_x, train_y = sequences_data(np.array(data[:train_size]), nLags)  # Convert to time-series dimensions: [samples, nLags, n_features]
test_x, test_y = sequences_data(np.array(data[train_size:]), nLags)
mod = models.Sequential()  # Build the model
# mod.add(layers.ConvLSTM2D(filters=64, kernel_size=(1, 1), activation='relu', input_shape=(None, nLags)))  # ConvLSTM2D
# mod.add(layers.Flatten())
mod.add(layers.LSTM(units=100, activation='tanh', input_shape=(None, nLags)))
mod.add(layers.Dropout(rate=0.2))
# mod.add(layers.LSTM(units=100, activation='tanh'))  # Stacked LSTM
# mod.add(layers.Bidirectional(layers.LSTM(units=100, activation='tanh'), input_shape=(None, 1)))  # Bidirectional LSTM: forward and backward
mod.add(layers.Dense(32))
mod.add(layers.Dense(1))  # A Dense layer with 1 node is added in order to predict the next value
mod.compile(optimizer='adam', loss='mse')
mod.fit(train_x, train_y, validation_data=(test_x, test_y), verbose=2, epochs=100)
y_train_pred = pd.Series(mod.predict(train_x).ravel())
y_test_pred = pd.Series(mod.predict(test_x).ravel())
y_train_pred.index = np.arange(nLags, len(y_train_pred) + nLags, 1, dtype=int)
y_test_pred.index = np.arange(train_size + nLags, len(data), 1, dtype=int)
plot_models(data, y_train_pred, y_test_pred, axs, nLags, train_size, num_sample=num_sample, type_model='LSTM')
# data_train = normalize.inverse_transform((np.array(data_train)).reshape(-1, 1))
mod.summary(), plt.tight_layout(), plt.subplots_adjust(wspace=0, hspace=0.2), plt.show()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
**Time Series Analysis and Forecasting**

***This repository covers:***
- 1. A brief overview of time series
- 2. Preparing the data
   - Normalize data (0-1)
   - Check stationary time series (p < 0.05)
- 3. Find the lags
- 4. Split dataset into train and test
- 5. Types of regression algorithms
   - Training the model
   - Prediction and performance check

:arrow_forward: The program runs automatically when you run **Code/Main.py**; you do not need to run any of the other files. Depending on your goal, you can execute all steps independently or interdependently within Main.py. You can also copy each section, together with its related files, into your own code, or disable a section with a **#**. The only thing you need to run the program is your input, i.e., **data (data = your data)**.

----
:one: The term time series refers to a series of observations that depend on time. Time is an essential feature in natural processes such as air temperature, the pulse of the heart, or stock price changes. Analyzing time series and forecasting time series are two different things.

**Time series analysis:** Through time series analysis we can extract useful information from time series data: trends, cyclic and seasonal deviations, correlations, etc. Time series analysis is the first step in preparing and analyzing time series datasets for forecasting.

**Time series forecasting** involves developing models and using them to make predictions; it tries to find the most likely time-series values in the future.

---
:two: Data pre-processing is the step where we clean the data of outliers and missing values and create additional features from the raw data to feed the model.<br/>
- ***Missing values*** can be filled by interpolating between the two closest non-missing values or by using Python functions such as interpolate() to fill NaN values in a DataFrame or Series.
- ***Normalization*** can be useful, and even required by some machine learning algorithms, when your time series data has input values and features with differing measurements and dimensions. Normalization is necessary for machine learning algorithms that use distance estimates, such as *k-nearest neighbors*, and for algorithms such as *linear regression and neural networks* that calibrate weights on the input values.
- In ***standardizing*** a data set, the distribution of observed values is rescaled to have a mean of 0 and a standard deviation of 1. Standardization assumes that your observations fit a Gaussian distribution with a well-behaved mean and standard deviation. Algorithms like support vector machines and linear and logistic regression show improved performance on Gaussian data.
- ***Check stationary time series:*** the mean and variance are constant over periods, and the auto-covariance does not depend on time. Plot the moving average/variance (rolling window statistics) and see if they vary with time. Augmented Dickey-Fuller test: when the test statistic is lower than the critical value shown and the p-value is below 0.05, the time series is stationary. A minimal sketch of these checks follows.
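A minimal sketch of these preparation steps (assuming a pandas Series named `data`; the repository's Normalize_Data.py and Test_Stationary.py implement the full versions with plotting):
```
# A minimal sketch (assumes a pandas Series `data`).
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller

def quick_prepare(data, window=20):
    data = data.interpolate()  # Fill missing values between the closest non-missing observations
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(data.values.reshape(-1, 1))
    data = pd.Series(scaled.ravel())  # Normalized to (Data - min) / (max - min)
    if adfuller(data)[1] >= 0.05:  # p-value of the Augmented Dickey-Fuller test
        data = (data - data.rolling(window=window).mean()).dropna()  # Difference against a moving average
    return data
```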
----

:three: **Lag features**<br/>
Lag features are time-shifted values of the actual demand. For example, the lag 1 feature stores the demand of the previous hour/sample relative to the current time stamp; similarly, we can add lag 2, lag 3, and so on. A combination of lag features is selected during the modeling phase based on the evaluation of the model results. The operation of adding lag features is called the sliding window method, or window features; a minimal sketch follows.
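The sketch below builds such lag features with pandas shift, mirroring Step 4 of Main.py (assuming a pandas Series `data`):
```
# Sliding-window lag features, as in Step 4 of Main.py.
import numpy as np
import pandas as pd

nLags = 3
Data_Lags = pd.DataFrame(np.zeros((len(data), nLags)))
for i in range(0, nLags):
    Data_Lags[i] = data.shift(i + 1)  # Column i holds the value observed i+1 steps earlier
Data_Lags, data = Data_Lags[nLags:], data[nLags:]  # Drop rows that lack a full lag history
```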
***Autocorrelation*** describes the correlation between the output (that is, the target variable that we need to predict) and a specific lagged variable (that is, a group of values at a prior time stamp used as input). An ***autocorrelation plot*** is also often used to check randomness in a time series: if the time series is random, autocorrelation values should be near zero for all time lags; if it is non-random, one or more of the autocorrelations will be significantly non-zero. The purpose of the autocorrelation plot is to show whether the data points in a time series are positively correlated, negatively correlated, or independent of one another. A plot of the autocorrelation of a time series by lag is also called the ***autocorrelation function (ACF)***.<br/>
**ACF** is the autocorrelation function, which provides information about the amount of autocorrelation in a series with its lagged values. In other words, it describes how well present values are related to past values. A time series consists of several components, including seasonality, trend, cycle, and residuals; the ACF takes all these factors into account while finding correlations, so it is the full autocorrelation plot.

**PACF** is the partial autocorrelation function. Unlike the ACF, the PACF finds correlations between the residuals (the values that remain after removing the other effects) and the subsequent lag, which we keep as a feature in our models. Thus, to avoid *overfitting* in time series models, it is necessary to find the optimum features, or order of the autoregression process, using the PACF plot. The best order is the lag value after which the PACF plot crosses the upper confidence band for the first time; these p lags act as the number of features used to forecast the time series. In the figure below, lags up to six have a reasonable correlation before the plot first cuts the upper confidence interval, so by combining the first six lags we can model the given autoregression process.

![image](https://user-images.githubusercontent.com/96347878/188177323-4f2fab92-ef86-4bc1-9906-f62e00e4d8c3.png)

---
:four: **An explanation of data set splits**
- ***Train data set:*** A train data set represents the amount of data that machine learning models are fitted with.
- ***Validation data set:*** Validation data sets provide an unbiased evaluation of model fit on the train data set while tuning model hyperparameters.
- ***Test data set:*** A test data set is used to identify whether a model is underfitting (the model performs poorly on the train data set) or overfitting (the model performs well on the train data set but fails to perform well on the test data set). This is determined by looking at the prediction error on both the train and test data sets. The test data set is only used after the train and validation data sets have been used to train and validate the model; a split sketch follows.<br/>
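A minimal sketch of a chronological split (assuming a pandas Series `data`); shuffling a time series before splitting would leak future information into training:
```
# Chronological 80/20 split, as used throughout this repository; never shuffle a time series.
train_size = int(len(data) * 0.8)
train, test = data[:train_size], data[train_size:]
```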
![image](https://user-images.githubusercontent.com/96347878/187898924-6b434403-bac1-41d8-ac6f-4acd9053e511.png)
---
:five: **Autoregressive and Automated Methods for Time Series Forecasting**

:black_medium_square: **Linear Regression Models:**<br/>
- ***Linear Correlation:*** For two related variables, the correlation measures the association between the two variables. In contrast, ***linear regression*** is used for the prediction of the value of one variable from another.
- ***Linear Regression (LR):*** We use linear regression when we want to predict the value of one variable from the value(s) of one or more other variables.<br/>
*LR model:* $y_{t} = a_{0} + a_{1}x_{t} + e_{t}$
- ***Least Squares Regression (LS):*** By minimizing the sum of the offsets or residuals from the fitted curve, the least squares method identifies the best fit for a set of data points. Least squares regression is used for predicting the behavior of dependent variables (a minimal sketch appears after the list of constraints below).<br/>
*LS model:* $Coeff = (X^{T}X)^{-1}X^{T}y$
- ***Moving Average (MA) Model:*** A series can also be forecast based solely on the past error values ($e_{t}$); such models are called short-memory models.<br/>
*MA(q) model:* $y_{t} = a_{0} + e_{t} + b_{1}e_{t-1} + b_{2}e_{t-2} + ... + b_{q}e_{t-q}$
- ***Autoregressive (AR) Model:*** The AR(p) notation refers to an autoregressive model that uses p lagged observations of the series to predict the future.<br/>
*AR(p) model:* $y_{t} = a_{0} + a_{1}y_{t-1} + a_{2}y_{t-2} + ... + a_{p}y_{t-p} + e_{t}$
- ***Autoregressive Exogenous (ARX) Model:*** The ARX model is a type of autoregressive model that, unlike the AR model, includes an exogenous input term.<br/>
*ARX(p, q) model:* $y_{t} + a_{1}y_{t-1} + a_{2}y_{t-2} + ... + a_{p}y_{t-p} = b_{1}x_{t} + b_{2}x_{t-1} + ... + b_{q}x_{t-q+1} + e_{t}$
- ***Auto-Regressive Integrated Moving Averages (ARIMA) Model:*** In statistics and time series analysis, an ARIMA model is an extension of ARMA (autoregressive moving average). ARMA consists of two main components, the autoregressive and the moving average; ARIMA adds an integration (differencing) step to the autoregressive moving average. ARIMA helps reduce the number of parameters needed for a good estimation of the model.<br/>
*ARIMA(p,d,q) model:* $y_{t} = C + a_{1}y_{t-1} + a_{2}y_{t-2} + ... + a_{p}y_{t-p} + e_{t} + b_{1}e_{t-1} + b_{2}e_{t-2} + ... + b_{q}e_{t-q}$<br/>
:black_medium_small_square: p: The order of the AR model (i.e., the number of lag observations).<br/>
:black_medium_small_square: d: The degree of differencing.<br/>
:black_medium_small_square: q: The order of the MA model (i.e., the number of lagged forecast errors); this is essentially the size of the "window" over your time series data.

|Models Name| Model Equation |
|--|--|
|*ARIMA (0, 1, 1) = IMA (1, 1) with constant*| $y_{t} = C + y_{t-1} + e_{t} + b_{1}e_{t-1}$|
|*ARIMA (0, 1, 1) = IMA (1, 1)*| $y_{t} = y_{t-1} + e_{t} + b_{1}e_{t-1}$|
|*ARIMA (0, 1, 2) with constant*| $y_{t} = C + y_{t-1} + e_{t} - b_{1}e_{t-1} - b_{2}e_{t-2}$|
|*ARIMA (1, 1, 1) with constant*| $y_{t} = C + (1+a_{1})y_{t-1} - a_{1}y_{t-2} + e_{t} - b_{1}e_{t-1}$|
|*ARIMA (1, 1, 1)*| $y_{t} = (1+a_{1})y_{t-1} - a_{1}y_{t-2} + e_{t} - b_{1}e_{t-1}$|
|*ARIMA (0, 2, 2) with constant*| $y_{t} = C + 2y_{t-1} - y_{t-2} + e_{t} - b_{1}e_{t-1} - b_{2}e_{t-2}$|

Linear methods like AR, ARX, and ARIMA are popular classical techniques for time series forecasting, but these traditional approaches also have some constraints:<br/>
:black_small_square: They focus on linear relationships and cannot capture complex nonlinear ones.<br/>
:black_small_square: They use fixed lag observations and cannot perform feature pre-processing.<br/>
:black_small_square: Missing data and noise are not supported.<br/>
:black_small_square: They work with univariate time series only, while common real-world problems have multiple input variables.<br/>
:black_small_square: They make one-step predictions, while many real-world problems require forecasts over a long time horizon.<br/>
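To make the least-squares formula above concrete, here is a minimal sketch in the spirit of Least_Squares.py (hypothetical array `X` of lagged inputs and vector `y` of targets):
```
# Least-squares coefficients via the normal equations: Coeff = (X^T X)^{-1} X^T y.
import numpy as np

def fit_least_squares(X, y):
    return np.linalg.pinv(X) @ y  # The pseudo-inverse computes (X^T X)^{-1} X^T in one stable step

# X: rows of lagged values [y_{t-1}, ..., y_{t-p}]; y: the target values y_t.
```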
:black_medium_square: **Machine Learning for Time Series Forecasting: [Further information](https://github.com/RezaSaadatyar/Machine-Learning-in-Python)**
- ***Xgboost Regression***
- ***Linear Regression***
- ***Decision Trees (DT) Regression***
- ***Random Forest (RF) Regression***

***The learning process is based on the following steps (a fit/predict sketch follows the list):***<br/>
:black_small_square: Algorithms are fed data. (In this step you can provide additional information to the model, for example, by performing feature extraction).
:black_small_square: Train a model using this data.
:black_small_square: Test and deploy the model.
:black_small_square: Utilize the deployed model to automate predictive tasks.
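A minimal sketch of this fit/predict workflow on lag features (hypothetical `Data_Lags`, `data`, and `train_size` as built in Step 4 below):
```
# Fit/predict workflow on lag features, in the spirit of Random_Forest_Regression.py.
from sklearn.ensemble import RandomForestRegressor

mod = RandomForestRegressor(n_estimators=100, random_state=0)
mod.fit(Data_Lags[:train_size], data[:train_size])   # Train on the lagged inputs
y_pred = mod.predict(Data_Lags[train_size:])         # Forecast the held-out test segment
```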
:black_medium_square: **Deep Learning for Time Series Forecasting: [Further information](https://github.com/RezaSaadatyar/Deep-Learning-in-python)**
- ***Long short-term memory (LSTM):***<br/>
LSTM is an artificial recurrent neural network (RNN) architecture used in the field of deep learning. Unlike standard feedforward neural networks, LSTM has feedback connections. LSTMs are sensitive to the scale of the input data, specifically when the sigmoid (default) or tanh activation functions are used. It is good practice to rescale the data to the range of 0 to 1, also called normalizing; we can easily normalize the dataset using the MinMaxScaler preprocessing class from the scikit-learn library.<br/>

***There are several types of LSTM, including:***<br/>
:black_small_square: *LSTM Autoencoder*<br/>
:black_small_square: *Vanilla LSTM:* A Vanilla LSTM is an LSTM model that has a single hidden layer of LSTM units, and an output layer used to make a prediction.
:black_small_square: *Stacked LSTM:* Multiple hidden LSTM layers can be stacked one on top of another in what is referred to as a stacked LSTM model.
:black_small_square: *Bidirectional LSTM:* On some sequence prediction problems, it can be beneficial to allow the LSTM model to learn the input sequence both forward and backward and concatenate both interpretations.<br/>

***LSTM life-cycle in Keras (a sketch follows the list):***<br/>
:black_small_square: Define network.
:black_small_square: Compile network.<br/>
:black_small_square: Fit network.
:black_small_square: Evaluate network.
:black_small_square: Make predictions.
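A minimal sketch of that life-cycle (hypothetical `train_x`, `train_y`, `test_x`, `test_y` as produced by Sequences_Data.py, with nLags = 3 as in Main.py):
```
# Keras LSTM life-cycle: define -> compile -> fit -> evaluate -> predict.
from keras import models, layers

mod = models.Sequential()                                    # 1) Define network
mod.add(layers.LSTM(units=100, activation='tanh', input_shape=(None, 3)))
mod.add(layers.Dense(1))
mod.compile(optimizer='adam', loss='mse')                    # 2) Compile network
mod.fit(train_x, train_y, epochs=100, verbose=0)             # 3) Fit network
loss = mod.evaluate(test_x, test_y, verbose=0)               # 4) Evaluate network
y_pred = mod.predict(test_x)                                 # 5) Make predictions
```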
----
**Install the required packages (if required)**<br/>
:black_small_square: pip install numpy<br/>
:black_small_square: pip install scipy
:black_small_square: pip install pandas
:black_small_square: pip install seaborn
:black_small_square: pip install matplotlib
:black_small_square: pip install scikit-learn
:black_small_square: pip install statsmodels<br/>
:black_small_square: pip install xgboost<br/>
:black_small_square: pip install keras

---
**The five steps to effective time series forecasting are as follows:**
```
# ============================================= Import Libraries ========================================
import os
import numpy as np
import pandas as pd
import seaborn as sns
from ARX_Model import arx
import statsmodels.api as sm
from AR_Model import ar_model
import matplotlib.pyplot as plt
from ARIMA_Model import arima_model
from Plot_Models import plot_models
from Least_Squares import least_squares
from Normalize_Data import normalize_data
from Sequences_Data import sequences_data
from Test_Stationary import test_stationary
from Auto_Correlation import auto_correlation
from Linear_Regression import linear_regression
from Xgboost_Regression import xgboost_regression
from keras import models, layers
from Random_Forest_Regression import random_forest_regression
from Tree_Decision_Regression import tree_decision_regression
```
***Step 1:***
```
# ======================================== Step 1: Load Data ==================================================
os.system('cls')
data = sm.datasets.sunspots.load_pandas()  # df = pd.read_csv('monthly_milk_production.csv'), df.info(), X = df["Value"].values
data = data.data["SUNACTIVITY"]
# print('Shape of data \t', data.shape)
# print('Original Dataset:\n', data.head())
# print('Values:\n', data)
```
***Step 2:***
```
# ================================ Step 2.1: Normalize Data (0-1) ==============================================
data, normalize = normalize_data(data, Type_Normalize='MinMaxScaler', Display_Figure='on')  # Type_Normalize: 'MinMaxScaler', 'normalize'
```
![Normalize_data](https://user-images.githubusercontent.com/96347878/188679304-9ad61a53-9f8d-44a4-94b3-b5cbe2e6c2af.png)

```
# ================================ Step 2.2: Check Stationary Time Series ======================================
data = test_stationary(data, window=20)
```
![Check Stationary Time Series](https://user-images.githubusercontent.com/96347878/188680819-9b1ac97e-086d-42b6-89af-e835492c0086.png)

***Step 3***:
```
# ==================================== Step 3: Find the lags of AR and etc models ==============================
auto_correlation(data, nLags=10)
```
![Find the lags of AR and etc models](https://user-images.githubusercontent.com/96347878/188682420-b9bf0369-9c31-4b19-a29d-3fdf2cfd113d.png)

***Step 4:***
```
# =========================== Step 4: Split Dataset into Train and Test =======================================
nLags = 3
Data_Lags = pd.DataFrame(np.zeros((len(data), nLags)))
for i in range(0, nLags):
    Data_Lags[i] = data.shift(i + 1)
Data_Lags = Data_Lags[nLags:]
data = data[nLags:]
Data_Lags.index = np.arange(0, len(Data_Lags), 1, dtype=int)
data.index = np.arange(0, len(data), 1, dtype=int)
train_size = int(len(data) * 0.8)
```
***Step 5:***
```
# ================================= Step 5: Autoregressive and Automated Methods ==============================
sns.set(style='white')
fig, axs = plt.subplots(nrows=4, ncols=1, sharey='row', figsize=(16, 10))
plot_models(data, [], [], axs, nLags, train_size, num_sample=50, type_model='Actual_Data')
# ------------------------------------------- Least Squares ---------------------------------------------------
least_squares(data, Data_Lags, train_size, axs, num_sample=50)
# -------------------------------------------- Auto-Regressive (AR) model -------------------------------------
ar_model(data, train_size, axs, n_lags=nLags, num_sample=50)
# ------------------------------------------------ ARX --------------------------------------------------------
arx(data, Data_Lags, train_size, axs, mu=0.9, num_sample=50)
# ----------------------------- Auto-Regressive Integrated Moving Averages (ARIMA) ----------------------------
arima_model(data, train_size, axs, order=(5, 1, (1, 1, 1, 1)), seasonal_order=(0, 0, 2, 12), num_sample=50)
# ======================================= Step 5: Machine Learning Models =====================================
# ------------------------------------------- Linear Regression Model -----------------------------------------
linear_regression(data, Data_Lags, train_size, axs, num_sample=50)
# ------------------------------------------ RandomForestRegressor Model --------------------------------------
random_forest_regression(data, Data_Lags, train_size, axs, n_estimators=100, max_features=nLags, num_sample=50)
# -------------------------------------------- Decision Tree Model --------------------------------------------
tree_decision_regression(data, Data_Lags, train_size, axs, max_depth=2, num_sample=50)
# ---------------------------------------------- xgboost ------------------------------------------------------
xgboost_regression(data, Data_Lags, train_size, axs, n_estimators=1000, num_sample=50)
# ----------------------------------------------- LSTM model --------------------------------------------------
train_x, train_y = sequences_data(np.array(data[:train_size]), nLags)  # Convert to time-series dimensions: [samples, nLags, n_features]
test_x, test_y = sequences_data(np.array(data[train_size:]), nLags)
mod = models.Sequential()  # Build the model
# mod.add(layers.ConvLSTM2D(filters=64, kernel_size=(1, 1), activation='relu', input_shape=(None, nLags)))  # ConvLSTM2D
# mod.add(layers.Flatten())
mod.add(layers.LSTM(units=100, activation='tanh', input_shape=(None, nLags)))
mod.add(layers.Dropout(rate=0.2))
# mod.add(layers.LSTM(units=100, activation='tanh'))  # Stacked LSTM
# mod.add(layers.Bidirectional(layers.LSTM(units=100, activation='tanh'), input_shape=(None, 1)))  # Bidirectional LSTM: forward and backward
mod.add(layers.Dense(32))
mod.add(layers.Dense(1))  # A Dense layer with 1 node to predict the next value
mod.compile(optimizer='adam', loss='mse')
mod.fit(train_x, train_y, validation_data=(test_x, test_y), verbose=2, epochs=100)
y_train_pred = pd.Series(mod.predict(train_x).ravel())
y_test_pred = pd.Series(mod.predict(test_x).ravel())
y_train_pred.index = np.arange(nLags, len(y_train_pred) + nLags, 1, dtype=int)
y_test_pred.index = np.arange(train_size + nLags, len(data), 1, dtype=int)
plot_models(data, y_train_pred, y_test_pred, axs, nLags, train_size, num_sample=50, type_model='LSTM')
# data_train = normalize.inverse_transform((np.array(data_train)).reshape(-1, 1))
mod.summary(), plt.tight_layout(), plt.xticks(fontsize=15), plt.yticks(fontsize=15), plt.show()
```
![Autoregressive and Automated Methods](https://user-images.githubusercontent.com/96347878/188715358-252f5f1c-ee91-4003-ae82-0403d41fc001.png)
| Method | Training $R^{2}$ | Test $R^{2}$ |
| ------ | ----- | ----- |
| LS | 0.83 | 0.87 |
| AR | 0.83 | 0.21 |
| ARX | 0.95 | ***0.99*** |
| ARIMA | 0.84 | 0.25 |
| LR | 0.83 | 0.87 |
| DT | 0.67 | 0.56 |
| RF | 0.97 | 0.84 |
| XGBoost | 1 | 0.83 |
| LSTM | 0.86 | ***0.89*** |

Reza.Saadatyar@outlook.com
--------------------------------------------------------------------------------