├── LICENSE
├── README.md
├── arima.py
└── rnn.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, Štěpán Trčka
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Time series forecasting scripts
 2 | 
 3 | These scripts use ARIMA and LSTM RNN methods for time series forecasting. The series must be stored in a CSV file that contains at least a timestamp column and a value column.
 4 | 
 5 | To run these scripts you need Python 3 and the following libraries installed:
 6 | 
 7 | * Numpy (http://www.numpy.org/)
 8 | * Pandas (https://pandas.pydata.org/)
 9 | * Matplotlib (https://matplotlib.org/)
10 | * Statsmodels (http://www.statsmodels.org/)
11 | * Keras (https://keras.io/)
12 | * Scikit-learn (http://scikit-learn.org/)
13 | 
14 | To install them, use the `pip3` package installer.
15 | 
16 | FYI, I used an older version of these scripts in my bachelor thesis.
17 | 


--------------------------------------------------------------------------------
/arima.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import pandas
  3 | import datetime
  4 | import sys
  5 | import time
  6 | import matplotlib.pyplot as ma
  7 | import statsmodels.tsa.seasonal as st
  8 | import statsmodels.tsa.arima_model as arima
  9 | import statsmodels.tsa.stattools as tools
 10 | 
 11 | """
 12 | Load time series from CSV file, parse date times and
 13 | select column with values.
 14 | """
 15 | def ts_load(filename, value_name, date_name, date_parser):
 16 |     csv = pandas.read_csv(filename)
 17 |     csv.index = date_parser(csv[date_name])
 18 |     for x in csv.columns.values.tolist():
 19 | 	    if x != value_name:
 20 | 		    del csv[x]
 21 |     return csv
 22 | 
 23 | """
 24 | Deep copy of time series.
 25 | """
 26 | def ts_copy(ts):
 27 | 	return ts.copy(deep=True)
 28 | 
 29 | """
 30 | Check whether given time series is stationary.
 31 | """
 32 | def ts_check_stationarity(ts, critic_value=0.5):
 33 |     try:
 34 |         # Dickey-Fuller algorithm
 35 |         result = tools.adfuller(ts)
 36 |         return result[0] < 0.0 and result[1] < critic_value
 37 |     except:
 38 |         # Program may raise an exception when there are NA values in TS 
 39 |         return False
 40 | 
 41 | """
 42 | Fit ARIMA model on given time series.
 43 | """
 44 | def ts_fit_arima(ts, order):
 45 | 	return arima.ARIMA(ts, order=order).fit(disp=0)
 46 | 
 47 | """
 48 | Find best ARIMA model for given time series using Akaike information criterion.
 49 | """
 50 | def ts_find_best_arima_model(ts, arima_orders):
 51 |     best_score = sys.maxsize
 52 |     best_order = None
 53 |     
 54 |     for order in arima_orders:
 55 |         model_fit = ts_fit_arima(ts, order)
 56 |         score = model_fit.aic
 57 |         if score <= best_score:
 58 |             best_score = score
 59 |             best_order = order
 60 | 
 61 |     return best_order
 62 | 
 63 | """
 64 | Forecast new values using ARIMA model.
 65 | """
 66 | def ts_forecast_arima(arima_model, samples=1):
 67 |     return arima_model.forecast(steps=samples)
 68 | 
 69 | """
 70 | Estimate integrate (I) parameter by try-fail-success algorithm.
 71 | """
 72 | def estimate_integrate_param(ts):
 73 | 	integrate_param = 0
 74 | 	ts2 = ts_copy(ts)
 75 | 	
 76 | 	while not ts_check_stationarity(ts2) and integrate_param < 2:
 77 | 		integrate_param += 1
 78 | 		ts2 = (ts2 - ts2.shift()).interpolate(limit_direction="both")
 79 | 	
 80 | 	return integrate_param
 81 | 
 82 | """
 83 | Plot graphs for ACF and PACF functions.
 84 | """
 85 | def ts_plot_acf_pacf(ts, nlags=40):
 86 | 	
 87 | 	def plot_bar(ts, horizontal_line=None):
 88 | 		ma.bar(range(0, len(ts)), ts, width=0.5)
 89 | 		ma.axhline(0)
 90 | 		if horizontal_line != None:
 91 | 			ma.axhline(horizontal_line, linestyle="-")
 92 | 			ma.axhline(-horizontal_line, linestyle="-")
 93 | 
 94 | 	acf = tools.acf(ts, nlags=nlags)
 95 | 	plot_bar(acf, 1.96 / numpy.sqrt(len(ts)))
 96 | 	ma.show()
 97 | 	pacf = tools.pacf(ts, nlags=nlags)
 98 | 	plot_bar(pacf, 1.96 / numpy.sqrt(len(ts)))
 99 | 	ma.show()
100 | 
101 | """
102 | Split time series into two series - train and test.
103 | """
def ts_split_train_test(ts, ts_split_train_test=0.8):
    """Cut ``ts`` into a leading train part and a trailing test part.

    The element sitting exactly at the split position ends up in neither
    part: the train part stops right before it and the test part starts
    right after it.

    Args:
        ts: Sliceable sequence.
        ts_split_train_test: Fraction of ``len(ts)``, truncated to an
            integer, that gives the split position.

    Returns:
        ``(train, test)`` tuple of slices of ``ts``.
    """
    boundary = int(len(ts) * ts_split_train_test)
    head = ts[:boundary]
    tail = ts[boundary + 1:]
    return head, tail
109 | 
110 | """
111 | Apply ARIMA on given time series with given order.
112 | @M = number of past train values
113 | @N = number of values to predict in one iteration
114 | """
def run_arima(ts, order, M, N, train_test_ratio):
    """Run a walk-forward ARIMA forecast over the test part of ``ts``.

    A model is fitted on at most the ``M`` most recent values, ``N`` values
    are forecast, the window moves ``N`` steps ahead and the process repeats
    until the end of the series. Real values, predictions and confidence
    intervals are printed, followed by the mean squared error; real values
    and predictions are then plotted.

    Args:
        ts: Table whose ``.values`` rows carry the series value in their
            first position (timestamps are ignored).
        order: ARIMA order handed to ``ts_fit_arima``.
        M: Maximum number of past values each model is fitted on.
        N: Number of values forecast per fitted model.
        train_test_ratio: Fraction of the series used as the train part.

    Raises:
        Exception: Whatever the very first fit/forecast raised, because
            there is no earlier prediction to fall back on.
    """
    # Ignore timestamps
    values = [row[0] for row in ts.values]

    # Split time series sequence
    train, test = ts_split_train_test(values, train_test_ratio)
    predictions = []
    confidence = []
    # ts_split_train_test leaves out the value right behind the train part.
    # That value is still used for fitting, so the first forecast lines up
    # with test[0].
    train_end = len(train) + 1

    # Performance measure
    start_time = time.time()

    # Forecast
    for i in range(train_end, len(values), N):
        print("Forecasting ", i)
        try:
            window_start = max(i - M, 0)
            arima_model = ts_fit_arima(values[window_start:i], order)
            forecast = ts_forecast_arima(arima_model, N)
            # Gather the whole step first so a failure half-way through
            # cannot leave the two result lists with different lengths.
            step_predictions = [forecast[0][j] for j in range(N)]
            step_confidence = [forecast[2][j] for j in range(N)]
        except Exception:
            print("Error during forecast ", i)
            if not predictions:
                # Nothing has been predicted yet, so nothing can be repeated.
                raise
            # Push back last successful predictions
            step_predictions = [predictions[-1]] * N
            step_confidence = [confidence[-1]] * N
        predictions.extend(step_predictions)
        confidence.extend(step_confidence)

    print("TIME ELAPSED ", time.time() - start_time)

    score = 0
    compared = 0

    print("Real value,predicted value,conf. interval lower,conf. interval upper")
    for real, predicted, interval in zip(test, predictions, confidence):
        print(real, predicted, interval[0], interval[1])
        score += pow(real - predicted, 2)
        compared += 1

    # Avoid a ZeroDivisionError when the test part is empty.
    print("MSE ", score / compared if compared else float("nan"))

    # Plot the same values the predictions were scored against.
    ma.plot(test, color="blue")
    ma.plot(predictions, color="red")
    ma.show()
162 | 
def main():
    """Command-line entry point: validate arguments and run one method.

    Supported methods are ``acf_pacf``, ``best_order`` and ``predictions``.
    When the script is started without arguments the usage text is printed
    and the process exits with status 0. An unknown method or too few
    arguments for the chosen method prints the same text and exits with
    status 1, before any file is read.
    """
    program_name = sys.argv[0]

    def print_usage():
        print("Usage:\n")
        print("For ACF, PACF plot:\npython3.6 %s acf_pacf <ts_path> " % (program_name) +
              "<value_column_name> <timestamp_column_name>\n")
        print("For best order estimation:\npython3.6 %s best_order " % (program_name) +
              "<ts_path> <value_column_name> <timestamp_column_name>\n")
        print("For predictions:\npython3.6 %s predictions <ts_path> " % (program_name) +
              "<value_column_name> <timestamp_column_name> <train_test_ratio> " +
              "<arima_order(P D Q)> <number_of_train_samples(or -1 for all)> " +
              "<number_of_values_to_predict>\n")

    # Smallest len(sys.argv) with which each method can read all its arguments.
    required_argc = {"acf_pacf": 5, "best_order": 5, "predictions": 11}

    argc = len(sys.argv)
    method_type = sys.argv[1] if argc > 1 else None
    if method_type not in required_argc or argc < required_argc[method_type]:
        print_usage()
        sys.exit(0 if argc == 1 else 1)

    ts_path, value_column, timestamp_column = sys.argv[2:5]
    ts = ts_load(ts_path, value_column, timestamp_column, lambda x: pandas.to_datetime(x))

    def acf_pacf():
        integrate_param = estimate_integrate_param(ts)
        print("POSSIBLE INTEGRATE PARAMETER ", integrate_param)
        ts_plot_acf_pacf(ts)

    def best_order():
        print("INSERT P D Q PARAMETERS OR LEAVE EMPTY LINE FOR BREAK")
        possible_models = []
        for line in sys.stdin:
            params = line.split()
            if len(params) == 0:
                break
            possible_models.append((int(params[0]), int(params[1]), int(params[2])))

        order = ts_find_best_arima_model(ts, possible_models)
        print("BEST ORDER ", order)

    def predictions():
        train_test_ratio = float(sys.argv[5])
        p, d, q = int(sys.argv[6]), int(sys.argv[7]), int(sys.argv[8])  # ARIMA order
        m, n = int(sys.argv[9]), int(sys.argv[10])  # past and prediction values
        if m < 0:
            # A negative count means "fit on everything available".
            m = len(ts)
        run_arima(ts, (p, d, q), m, n, train_test_ratio)

    handlers = {
        "acf_pacf": acf_pacf,
        "best_order": best_order,
        "predictions": predictions,
    }
    handlers[method_type]()


if __name__ == "__main__":
    main()
215 | 


--------------------------------------------------------------------------------
/rnn.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import pandas
  3 | import math
  4 | import time
  5 | import sys
  6 | import datetime
  7 | import matplotlib.pyplot as ma
  8 | import keras.models as km
  9 | import keras.layers as kl
 10 | import sklearn.preprocessing as sp
 11 | 
# Seed NumPy's global random number generator with a fixed value so that
# NumPy-based randomness is repeatable between runs.
# NOTE(review): the Keras backend may draw from its own generator that this
# call does not seed - confirm before relying on reproducible training.
numpy.random.seed(42)
 13 | 
 14 | """
 15 | Load time series from CSV file, parse date times and
 16 | select column with values.
 17 | """
 18 | def ts_load(filename, value_name, date_name, date_parser):
 19 |     csv = pandas.read_csv(filename)
 20 |     csv.index = date_parser(csv[date_name])
 21 |     for x in csv.columns.values.tolist():
 22 | 	    if x != value_name:
 23 | 		    del csv[x]
 24 |     return csv
 25 | 
 26 | """
 27 | LSTM cells are sensitive to large-scaled values,
 28 | normalize them to get better results.
 29 | """
 30 | def ts_normalize(ts):
 31 | 	scaler = sp.MinMaxScaler(feature_range=(0,1))
 32 | 	return scaler.fit_transform(ts.values), scaler
 33 | 
 34 | """
 35 | Inverse operation to ts_normalize.
 36 | """
 37 | def ts_undo_normalization(ts, scaler):
 38 | 	return scaler.inverse_transform(ts)
 39 | 
 40 | """
 41 | Split time series into two series - train and test.
 42 | """
 43 | def ts_split_train_test(ts, ts_split_train_test=0.8):
 44 |     ts_len = len(ts)
 45 |     train_end = (int)(ts_len*ts_split_train_test)
 46 |     train, test = ts[:train_end], ts[train_end+1:]
 47 |     return train, test
 48 | 
 49 | """
 50 | Create LSTM RNN.
 51 | """
 52 | def network_create(num_lstm, loss="mse", optimizer="sgd"):
 53 | 	# Layer based network
 54 | 	network = km.Sequential()
 55 | 	# Hidden layer is made from LSTM nodes
 56 | 	network.add(kl.LSTM(num_lstm, activation="sigmoid", input_shape=(1,1)))
 57 | 	# Output layer with one output
 58 | 	network.add(kl.Dense(1))
 59 | 	network.compile(loss=loss, optimizer=optimizer)
 60 | 	return network
 61 | 
 62 | """
 63 | Train LSTM RNN.
 64 | """
 65 | def network_fit(network, train_data, target_data, num_training_iterations):
 66 | 	return network.fit(train_data, target_data, epochs=num_training_iterations, batch_size=1, verbose=0)
 67 | 
 68 | """
 69 | Reshape time series dataset for LSTM RNN 
 70 | into [batch size; timesteps; input dimensionality] format.
 71 | """
 72 | def dataset_reshape_for_network(dataset):
 73 | 	return dataset.reshape((dataset.shape[0], 1, dataset.shape[1]))
 74 | 
 75 | """
 76 | Create dataset for LSTM RNN training.
 77 | Basically this creates two lists, first with training values
 78 | and second with lagged target values.
 79 | """
 80 | def dataset_create(ts, num_lags=1):
 81 | 	x = []
 82 | 	y = []
 83 | 	for i in range(len(ts)-num_lags-1):
 84 | 		x.append(ts[i:(i+num_lags), 0])
 85 | 		y.append(ts[i+num_lags, 0])
 86 | 	return numpy.array(x), numpy.array(y)
 87 | 
 88 | """
 89 | Predict new values with LSTM RNN.
 90 | """
 91 | def network_predict_new_values(network, data):
 92 | 	return network.predict(data)
 93 | 
 94 | """
 95 | Load time series from CSV file, 
 96 | create LSTM RNN with custom number of cells, train it on data
 97 | and try to predict new values.
 98 | """
 99 | def rnn(ts_name, num_lstm, iterations,
100 |     train_test_ratio, value_column_name, timestamp_column_name):
101 |     	
102 |     ts = ts_load(ts_name,
103 | 		value_column_name,
104 | 		timestamp_column_name,
105 | 		lambda x : pandas.to_datetime(x))
106 |     predicted_values = []
107 | 
108 |     # Sigmoids are sensitive to large scaled values, normalize them to <0,1>
109 |     ts, scaler = ts_normalize(ts)
110 |     ts_train, ts_test = ts_split_train_test(ts, train_test_ratio)
111 | 
112 |     # Create dataset from TS
113 |     train_dataset_x, train_dataset_y = dataset_create(ts_train)
114 |     test_dataset_x, test_dataset_y = dataset_create(ts_test)
115 | 
116 |     # The input data for our network needs to be
117 |     # provided in [batch size; timesteps; input dimensionality] format
118 |     train_dataset_x = dataset_reshape_for_network(train_dataset_x)
119 |     test_dataset_x = dataset_reshape_for_network(test_dataset_x)
120 | 
121 |     # Create and fit LSTM network
122 |     start_time = time.time()
123 |     network = network_create(num_lstm)
124 |     network_fit(network, train_dataset_x, train_dataset_y, iterations)
125 |     print("TIME ELAPSED ", time.time() - start_time)
126 |     
127 |     predicted_unscaled = network_predict_new_values(network, test_dataset_x)
128 |     predicted_scaled_back = ts_undo_normalization(predicted_unscaled, scaler)
129 |     test_scaled_back = ts_undo_normalization(ts_test, scaler)
130 |     
131 |     # Present results
132 |     test_result = []
133 |     predicted_result = []
134 |     score = 0
135 |     iterations = 0
136 |     
137 |     print("Real value;predicted value")
138 |     for x in zip(test_scaled_back, predicted_scaled_back):
139 |         test_value = x[0][0]
140 |         predicted_value = x[1][0]
141 |         print("%f,%f" % (test_value, predicted_value))
142 |         test_result.append(test_value)
143 |         predicted_result.append(predicted_value)
144 |         score += pow(test_value - predicted_value, 2)
145 |         iterations += 1
146 |     
147 |     print("MSE ", score / iterations)
148 |     
149 |     ma.plot(test_result, color="blue")
150 |     ma.plot(predicted_result, color="red")
151 |     ma.show()
152 | 
def main():
    """Command-line entry point: check the argument count and call ``rnn``.

    Exactly six arguments are expected. With any other count the usage text
    is printed and the process exits - with status 0 when no argument was
    given at all, with status 1 otherwise.
    """
    if len(sys.argv) != 7:
        print("Usage:")
        print("python3.6 %s ts_path num_lstm train_iterations train_test_ratio value_column_name timestamp_column_name" % (sys.argv[0]))
        # sys.exit instead of the interactive-shell helper exit().
        sys.exit(0 if len(sys.argv) == 1 else 1)

    (ts_path, num_lstm, train_iterations, train_test_ratio,
     value_column_name, timestamp_column_name) = sys.argv[1:7]
    rnn(ts_path, int(num_lstm), int(train_iterations), float(train_test_ratio),
        value_column_name, timestamp_column_name)


if __name__ == "__main__":
    main()
163 | 


--------------------------------------------------------------------------------