├── LICENSE ├── README.md ├── arima.py └── rnn.py /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Štěpán Trčka 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time series forecasting scripts 2 | 3 | These scripts use ARIMA and LSTM RNN methods for time series forecasting. The series itself must be in CSV format (at least timestamp and value columns are required). 4 | 5 | To run these scripts you need to have Python 3 and a bunch of its libraries installed: 6 | 7 | * Numpy (http://www.numpy.org/) 8 | * Pandas (https://pandas.pydata.org/) 9 | * Matplotlib (https://matplotlib.org/) 10 | * Statsmodels (http://www.statsmodels.org/) 11 | * Keras (https://keras.io/) 12 | * Scikit-learn (http://scikit-learn.org/) 13 | 14 | To install them, use the pip3 installation program. 15 | 16 | FYI, I used an old version of these scripts in my bachelor thesis. 17 | -------------------------------------------------------------------------------- /arima.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pandas 3 | import datetime 4 | import sys 5 | import time 6 | import matplotlib.pyplot as ma 7 | import statsmodels.tsa.seasonal as st 8 | import statsmodels.tsa.arima_model as arima 9 | import statsmodels.tsa.stattools as tools 10 | 11 | """ 12 | Load time series from CSV file, parse date times and 13 | select column with values. 14 | """ 15 | def ts_load(filename, value_name, date_name, date_parser): 16 | csv = pandas.read_csv(filename) 17 | csv.index = date_parser(csv[date_name]) 18 | for x in csv.columns.values.tolist(): 19 | if x != value_name: 20 | del csv[x] 21 | return csv 22 | 23 | """ 24 | Deep copy of time series. 25 | """ 26 | def ts_copy(ts): 27 | return ts.copy(deep=True) 28 | 29 | """ 30 | Check whether given time series is stationary. 
def ts_check_stationarity(ts, critic_value=0.5):
    """
    Check whether given time series is stationary.

    Runs the augmented Dickey-Fuller test through ``tools.adfuller`` and
    treats the series as stationary when the first element of the result
    (the test statistic) is negative and the second one (the p-value) is
    below ``critic_value``.

    Parameters:
        ts: time series handed to ``tools.adfuller`` unchanged.
        critic_value: exclusive upper bound for the p-value.

    Returns:
        True when both conditions hold; False otherwise, including when the
        test itself raises.
    """
    try:
        # Dickey-Fuller algorithm
        result = tools.adfuller(ts)
    except Exception:
        # The test may raise when there are NA values in TS; such a series is
        # reported as non-stationary.  Catching Exception instead of using a
        # bare except keeps Ctrl-C and SystemExit working.
        return False
    return bool(result[0] < 0.0 and result[1] < critic_value)


def ts_fit_arima(ts, order):
    """
    Fit ARIMA model on given time series.

    Parameters:
        ts: observations passed to ``arima.ARIMA``.
        order: the ``order`` argument of ``arima.ARIMA``; the callers in this
            file pass a (p, d, q) tuple.

    Returns:
        Whatever ``arima.ARIMA(...).fit(disp=0)`` returns.
    """
    return arima.ARIMA(ts, order=order).fit(disp=0)


def ts_find_best_arima_model(ts, arima_orders):
    """
    Find best ARIMA model for given time series using Akaike information criterion.

    Every order in ``arima_orders`` is fitted with ``ts_fit_arima`` and the
    order with the lowest ``aic`` wins; on a tie the later candidate is kept.
    A candidate whose fit raises ``ValueError`` or
    ``numpy.linalg.LinAlgError`` is reported on stdout and skipped, so one
    unusable order no longer aborts the whole search.

    Parameters:
        ts: time series to fit.
        arima_orders: iterable of candidate orders.

    Returns:
        The best order, or None when ``arima_orders`` is empty or no
        candidate could be fitted.
    """
    # AIC is a float, so start from infinity rather than an integer limit.
    best_score = float("inf")
    best_order = None

    for order in arima_orders:
        try:
            score = ts_fit_arima(ts, order).aic
        except (ValueError, numpy.linalg.LinAlgError) as error:
            print("Skipping order ", order, ": ", error)
            continue
        if score <= best_score:
            best_score = score
            best_order = order

    return best_order


def ts_forecast_arima(arima_model, samples=1):
    """
    Forecast new values using ARIMA model.

    Parameters:
        arima_model: fitted model, e.g. the result of ``ts_fit_arima``.
        samples: number of steps to forecast.

    Returns:
        The result of ``arima_model.forecast(steps=samples)``, unchanged.
    """
    return arima_model.forecast(steps=samples)


def estimate_integrate_param(ts, max_order=2):
    """
    Estimate integrate (I) parameter by try-fail-success algorithm.

    The series is differenced until ``ts_check_stationarity`` accepts it or
    ``max_order`` differences have been applied.  The NaN introduced by each
    differencing step is filled by interpolation in both directions.

    Parameters:
        ts: pandas object supporting ``shift`` and ``interpolate``; it is
            not modified.
        max_order: largest value that may be returned (default 2).

    Returns:
        Number of differencing steps applied, between 0 and ``max_order``.
    """
    integrate_param = 0
    # No defensive copy is needed: every step builds a new object and only
    # rebinds the local name.
    differenced = ts

    # The bound is tested first so no stationarity test is run once the
    # limit has been reached.
    while integrate_param < max_order and not ts_check_stationarity(differenced):
        integrate_param += 1
        differenced = (differenced - differenced.shift()).interpolate(limit_direction="both")

    return integrate_param
def ts_plot_acf_pacf(ts, nlags=40):
    """
    Plot graphs for ACF and PACF functions.

    Two bar charts are shown one after another (each ``ma.show()`` call is
    separate): first the values of ``tools.acf``, then those of
    ``tools.pacf``.  Both charts get a zero line and a pair of horizontal
    lines at +/- 1.96 / sqrt(len(ts)).

    Parameters:
        ts: time series passed to ``tools.acf`` and ``tools.pacf``.
        nlags: number of lags requested from both functions.
    """
    def plot_bar(values, horizontal_line=None):
        ma.bar(range(0, len(values)), values, width=0.5)
        ma.axhline(0)
        if horizontal_line is not None:
            ma.axhline(horizontal_line, linestyle="-")
            ma.axhline(-horizontal_line, linestyle="-")

    bound = 1.96 / numpy.sqrt(len(ts))

    plot_bar(tools.acf(ts, nlags=nlags), bound)
    ma.show()
    plot_bar(tools.pacf(ts, nlags=nlags), bound)
    ma.show()


def ts_split_train_test(ts, ts_split_train_test=0.8):
    """
    Split time series into two series - train and test.

    Parameters:
        ts: sliceable sequence of observations.
        ts_split_train_test: fraction of ``ts`` that goes into the train
            part (the name is kept for backward compatibility).

    Returns:
        ``(train, test)`` where train holds the first
        ``int(len(ts) * ratio)`` items and test holds all the remaining
        ones; no observation is dropped between the two parts.
    """
    train_end = int(len(ts) * ts_split_train_test)
    return ts[:train_end], ts[train_end:]


def run_arima(ts, order, M, N, train_test_ratio):
    """
    Apply ARIMA on given time series with given order.
    @M = number of past train values
    @N = number of values to predict in one iteration

    The test part of the series is forecast in chunks of ``N`` values.  For
    every chunk a new model is fitted on (at most) the ``M`` values that
    precede it.  ``forecast[0]`` is used as the predicted values and
    ``forecast[2]`` as their confidence intervals.  When fitting or
    forecasting a chunk fails, the last successful prediction is repeated
    for the whole chunk; if there is none yet, the last observed value is
    used with NaN interval bounds.

    Real values, predictions, confidence bounds, the elapsed time and the
    mean squared error are printed, and real vs. predicted values are
    plotted.

    Parameters:
        ts: object whose ``.values`` yields rows with the observation at
            index 0 (``main`` passes the result of ``ts_load``).
        order: ARIMA order forwarded to ``ts_fit_arima``.
        M: size of the training window.
        N: chunk size, must be at least 1.
        train_test_ratio: fraction of the series used as the train part.

    Returns:
        The mean squared error, or None when the test part is empty.

    Raises:
        ValueError: if ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be at least 1")

    # Ignore timestamps
    values = [row[0] for row in ts.values]

    # Split time series sequence
    train, test = ts_split_train_test(values, train_test_ratio)
    train_end = len(train)  # index of the first test value
    predictions = []
    confidence = []

    # Performance measure
    start_time = time.time()

    # Forecast
    for i in range(train_end, len(values), N):
        print("Forecasting ", i)
        try:
            arima_model = ts_fit_arima(values[max(i - M, 0):i], order)
            forecast = ts_forecast_arima(arima_model, N)
            # Collect the whole chunk before storing it, so a failure half
            # way through cannot leave a partial chunk behind.
            chunk_predictions = [forecast[0][j] for j in range(N)]
            chunk_confidence = [forecast[2][j] for j in range(N)]
        except Exception:
            print("Error during forecast ", i)
            # Push back last successful predictions
            if predictions:
                fallback_value = predictions[-1]
                fallback_interval = confidence[-1]
            else:
                # Nothing has been predicted yet: fall back to the last
                # observed value and mark the interval as unknown.
                nan = float("nan")
                fallback_value = values[i - 1] if i > 0 else nan
                fallback_interval = (nan, nan)
            chunk_predictions = [fallback_value] * N
            chunk_confidence = [fallback_interval] * N
        predictions.extend(chunk_predictions)
        confidence.extend(chunk_confidence)

    print("TIME ELAPSED ", time.time() - start_time)

    # The last chunk may reach past the end of the series.
    del predictions[len(test):]
    del confidence[len(test):]

    print("Real value,predicted value,conf. interval lower,conf. interval upper")
    score = 0
    for real, predicted, interval in zip(test, predictions, confidence):
        print(real, predicted, interval[0], interval[1])
        score += pow(real - predicted, 2)

    mse = score / len(test) if test else None
    if mse is None:
        print("MSE  undefined (no test values)")
    else:
        print("MSE ", mse)

    # Plot the same values the predictions were scored against.
    ma.plot(test, color="blue")
    ma.plot(predictions, color="red")
    ma.show()

    return mse


def main():
    """
    Command line entry point.

    ``sys.argv[1]`` selects the mode (``acf_pacf``, ``best_order`` or
    ``predictions``); ``sys.argv[2:5]`` are the CSV path, the value column
    and the timestamp column.  ``predictions`` additionally reads the
    train/test ratio, the p, d, q order, the number of past values (a
    negative number means the whole series) and the number of values
    predicted per iteration from ``sys.argv[5:11]``.

    Without arguments the usage text is printed and the program exits with
    status 0.  An unknown mode or too few arguments prints the usage text
    and exits with status 1.
    """
    program_name = sys.argv[0]

    def usage(exit_code):
        print("Usage:\n")
        print("For ACF, PACF plot:\npython3.6 %s acf_pacf " % (program_name) +
              "ts_path value_column timestamp_column\n")
        print("For best order estimation:\npython3.6 %s best_order " % (program_name) +
              "ts_path value_column timestamp_column\n")
        print("For predictions:\npython3.6 %s predictions " % (program_name) +
              "ts_path value_column timestamp_column " +
              "train_test_ratio p d q m n" +
              "\n")
        sys.exit(exit_code)

    def acf_pacf():
        integrate_param = estimate_integrate_param(ts)
        print("POSSIBLE INTEGRATE PARAMETER ", integrate_param)
        ts_plot_acf_pacf(ts)

    def best_order():
        print("INSERT P D Q PARAMETERS OR LEAVE EMPTY LINE FOR BREAK")
        possible_models = []
        for line in sys.stdin:
            params = line.split()
            if not params:
                break
            possible_models.append((int(params[0]), int(params[1]), int(params[2])))

        order = ts_find_best_arima_model(ts, possible_models)
        print("BEST ORDER ", order)

    def predictions():
        p, d, q = int(sys.argv[6]), int(sys.argv[7]), int(sys.argv[8])  # ARIMA order
        m, n = int(sys.argv[9]), int(sys.argv[10])  # past and prediction values
        m = len(ts) if m < 0 else m
        run_arima(ts, (p, d, q), m, n, float(sys.argv[5]))

    # mode -> (minimal length of sys.argv, handler)
    commands = {
        "acf_pacf": (5, acf_pacf),
        "best_order": (5, best_order),
        "predictions": (11, predictions),
    }

    if len(sys.argv) == 1:
        usage(0)

    method_type = sys.argv[1]
    if method_type not in commands or len(sys.argv) < commands[method_type][0]:
        usage(1)

    # The handlers above read ``ts`` from this scope when they are called.
    ts = ts_load(sys.argv[2], sys.argv[3], sys.argv[4], pandas.to_datetime)
    commands[method_type][1]()
208 | elif method_type == "best_order": 209 | best_order() 210 | elif method_type == "predictions": 211 | predictions() 212 | 213 | if __name__ == "__main__": 214 | main() 215 | -------------------------------------------------------------------------------- /rnn.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pandas 3 | import math 4 | import time 5 | import sys 6 | import datetime 7 | import matplotlib.pyplot as ma 8 | import keras.models as km 9 | import keras.layers as kl 10 | import sklearn.preprocessing as sp 11 | 12 | numpy.random.seed(42) 13 | 14 | """ 15 | Load time series from CSV file, parse date times and 16 | select column with values. 17 | """ 18 | def ts_load(filename, value_name, date_name, date_parser): 19 | csv = pandas.read_csv(filename) 20 | csv.index = date_parser(csv[date_name]) 21 | for x in csv.columns.values.tolist(): 22 | if x != value_name: 23 | del csv[x] 24 | return csv 25 | 26 | """ 27 | LSTM cells are sensitive to large-scaled values, 28 | normalize them to get better resuts. 29 | """ 30 | def ts_normalize(ts): 31 | scaler = sp.MinMaxScaler(feature_range=(0,1)) 32 | return scaler.fit_transform(ts.values), scaler 33 | 34 | """ 35 | Inverse operation to ts_normalize. 36 | """ 37 | def ts_undo_normalization(ts, scaler): 38 | return scaler.inverse_transform(ts) 39 | 40 | """ 41 | Split time series into two series - train and test. 42 | """ 43 | def ts_split_train_test(ts, ts_split_train_test=0.8): 44 | ts_len = len(ts) 45 | train_end = (int)(ts_len*ts_split_train_test) 46 | train, test = ts[:train_end], ts[train_end+1:] 47 | return train, test 48 | 49 | """ 50 | Create LSTM RNN. 
def network_create(num_lstm, loss="mse", optimizer="sgd", num_lags=1):
    """
    Create LSTM RNN.

    The network is a ``km.Sequential`` model with one LSTM layer (sigmoid
    activation) followed by a ``Dense`` layer with a single output.

    Parameters:
        num_lstm: first argument of ``kl.LSTM``.
        loss: loss passed to ``compile``.
        optimizer: optimizer passed to ``compile``.
        num_lags: size of the last axis of the input.  It has to match the
            ``num_lags`` used in ``dataset_create``, because
            ``dataset_reshape_for_network`` turns every sample into one
            timestep with ``num_lags`` values.

    Returns:
        The compiled network.
    """
    # Layer based network
    network = km.Sequential()
    # Hidden layer is made from LSTM nodes
    network.add(kl.LSTM(num_lstm, activation="sigmoid", input_shape=(1, num_lags)))
    # Output layer with one output
    network.add(kl.Dense(1))
    network.compile(loss=loss, optimizer=optimizer)
    return network


def network_fit(network, train_data, target_data, num_training_iterations, batch_size=1):
    """
    Train LSTM RNN.

    Parameters:
        network: model created by ``network_create``.
        train_data: inputs passed to ``network.fit``.
        target_data: targets passed to ``network.fit``.
        num_training_iterations: value of the ``epochs`` argument.
        batch_size: value of the ``batch_size`` argument (default 1).

    Returns:
        The result of ``network.fit`` (called with ``verbose=0``).
    """
    return network.fit(train_data, target_data,
                       epochs=num_training_iterations,
                       batch_size=batch_size, verbose=0)


def dataset_reshape_for_network(dataset):
    """
    Reshape time series dataset for LSTM RNN
    into [batch size; timesteps; input dimensionality] format.

    Parameters:
        dataset: two-dimensional array ``(rows, columns)``.

    Returns:
        The same data with shape ``(rows, 1, columns)``.
    """
    return dataset.reshape((dataset.shape[0], 1, dataset.shape[1]))


def dataset_create(ts, num_lags=1):
    """
    Create dataset for LSTM RNN training.
    Basically this creates two arrays, first with training values
    and second with lagged target values.

    Sample ``i`` consists of ``ts[i:i + num_lags, 0]`` and its target is
    ``ts[i + num_lags, 0]``.  Every position that has a target is used,
    including the last one.

    Parameters:
        ts: two-dimensional array; only column 0 is read.
        num_lags: number of consecutive values in one sample.

    Returns:
        ``(x, y)`` with shapes ``(samples, num_lags)`` and ``(samples,)``.
        When ``ts`` is too short for a single sample the arrays are empty
        but keep these shapes, so ``x`` can still be reshaped for the
        network.

    Raises:
        ValueError: if ``num_lags`` is smaller than 1.
    """
    if num_lags < 1:
        raise ValueError("num_lags must be at least 1")

    num_samples = len(ts) - num_lags
    x = [ts[i:(i + num_lags), 0] for i in range(num_samples)]
    y = [ts[i + num_lags, 0] for i in range(num_samples)]
    # reshape keeps the second axis even when there are no samples
    return numpy.array(x).reshape(-1, num_lags), numpy.array(y)


def network_predict_new_values(network, data):
    """
    Predict new values with LSTM RNN.

    Parameters:
        network: trained model.
        data: inputs passed to ``network.predict``.

    Returns:
        The result of ``network.predict(data)``.
    """
    return network.predict(data)
def rnn(ts_name, num_lstm, iterations,
        train_test_ratio, value_column_name, timestamp_column_name):
    """
    Load time series from CSV file,
    create LSTM RNN with custom number of cells, train it on data
    and try to predict new values.

    The series is normalized, split into train and test parts and turned
    into (value, next value) pairs by ``dataset_create``.  The network is
    trained on the train pairs and then predicts the next value for every
    test input.  Each prediction is compared with the target of the same
    pair (the value that follows the input), not with the input itself.

    Real values, predictions, the training time and the mean squared error
    are printed, and real vs. predicted values are plotted.

    Parameters:
        ts_name: path of the CSV file.
        num_lstm: number of LSTM cells passed to ``network_create``.
        iterations: number of training iterations passed to ``network_fit``.
        train_test_ratio: fraction of the series used for training.
        value_column_name: CSV column with the values.
        timestamp_column_name: CSV column with the timestamps.
    """
    ts = ts_load(ts_name,
                 value_column_name,
                 timestamp_column_name,
                 pandas.to_datetime)

    # Sigmoids are sensitive to large scaled values, normalize them to <0,1>
    ts, scaler = ts_normalize(ts)
    ts_train, ts_test = ts_split_train_test(ts, train_test_ratio)

    # Create dataset from TS
    train_dataset_x, train_dataset_y = dataset_create(ts_train)
    test_dataset_x, test_dataset_y = dataset_create(ts_test)

    # The input data for our network needs to be
    # provided in [batch size; timesteps; input dimensionality] format
    train_dataset_x = dataset_reshape_for_network(train_dataset_x)
    test_dataset_x = dataset_reshape_for_network(test_dataset_x)

    # Create and fit LSTM network
    start_time = time.time()
    network = network_create(num_lstm)
    network_fit(network, train_dataset_x, train_dataset_y, iterations)
    print("TIME ELAPSED ", time.time() - start_time)

    predicted_scaled = network_predict_new_values(network, test_dataset_x)
    predicted_scaled_back = ts_undo_normalization(predicted_scaled, scaler)
    # Prediction i belongs to target i, i.e. to the value that follows test
    # input i.  The targets are made two-dimensional for the scaler.
    expected_scaled_back = ts_undo_normalization(test_dataset_y.reshape(-1, 1), scaler)

    # Present results
    test_result = [row[0] for row in expected_scaled_back]
    predicted_result = [row[0] for row in predicted_scaled_back]

    print("Real value,predicted value")
    score = 0
    for test_value, predicted_value in zip(test_result, predicted_result):
        print("%f,%f" % (test_value, predicted_value))
        score += pow(test_value - predicted_value, 2)

    if test_result:
        print("MSE ", score / len(test_result))
    else:
        print("MSE  undefined (no test values)")

    ma.plot(test_result, color="blue")
    ma.plot(predicted_result, color="red")
    ma.show()


def main():
    """
    Command line entry point.

    Expects exactly six arguments: CSV path, number of LSTM cells, number
    of training iterations, train/test ratio, value column name and
    timestamp column name.  With any other number of arguments the usage
    text is printed and the program exits.
    """
    if len(sys.argv) != 7:
        print("Usage:")
        print("python3.6 %s ts_path num_lstm train_iterations train_test_ratio value_column_name timestamp_column_name" % (sys.argv[0]))
        sys.exit()

    rnn(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), float(sys.argv[4]), sys.argv[5], sys.argv[6])


if __name__ == "__main__":
    main()