├── Exploratory_analysis.py ├── Gpower_Arima_Main.py ├── Gpower_Xgb_Main.py ├── README.md ├── lstm.py ├── lstm_Main.py ├── lstm_multivariate_Main.py ├── myArima.py ├── myXgb.py └── util.py /Exploratory_analysis.py: -------------------------------------------------------------------------------- 1 | # basic + dates 2 | import numpy as np 3 | import pandas as pd 4 | from util import * 5 | import matplotlib.pyplot as plt 6 | import seaborn as sn 7 | 8 | 9 | 10 | 11 | parse_dates = [['Date', 'Time']] 12 | filename = "household_power_consumption.txt" 13 | encode_cols = ['Month', 'DayofWeek', 'Hour'] 14 | bucket_size="60T" 15 | 16 | # (1) data is not datetime ordered. It is random 17 | N_rows = 60000 18 | df = preprocess(N_rows, parse_dates, filename) 19 | 20 | G_power=df["Global_active_power"] 21 | #G_power_sort=G_power.sort_values('index') 22 | 23 | df = pd.DataFrame(bucket_avg(G_power,bucket_size)) 24 | df.dropna(inplace=True) 25 | df.Global_active_power.plot(style='b.') 26 | plt.ylabel('Global Active Power') 27 | plt.xlabel('Time') 28 | plt.title("Last 60000 rows") 29 | plt.savefig( 'EDA_2010.png', dpi=300) 30 | plt.show() 31 | 32 | 33 | ### (2)So we focus on the 2010-11 area which is around the last 19000 data rows 34 | N_rows = 19000 35 | df = preprocess(N_rows, parse_dates, filename) 36 | G_power=df["Global_active_power"] 37 | print(G_power.shape) # 21661 rows 38 | df_G = pd.DataFrame(bucket_avg(G_power,bucket_size)) 39 | df_G.dropna(inplace=True) 40 | 41 | # time series plot 42 | ts_label='Global_active_power_in_Nov_2010' 43 | timeseries_plot(df_G.Global_active_power,'g', ts_label) 44 | 45 | # component plot 46 | from pylab import rcParams 47 | import statsmodels.api as sm 48 | rcParams['figure.figsize'] = 11, 9 49 | decomposition = sm.tsa.seasonal_decompose(df_G.Global_active_power, model='additive') 50 | fig = decomposition.plot() 51 | plt.tight_layout() 52 | plt.savefig( 'ts_decomposition_plot.png', dpi=300) 53 | plt.show() 54 | 55 | 56 | # heatmap 57 | df_G['Time of Day'] = df_G.index.time 58 | df_G['Date'] = df_G.index.date 59 | print(df_G.head()) 60 | dfG_pivot = df_G.pivot_table(index="Date", columns="Time of Day",values='Global_active_power',fill_value=0) 61 | dfG_pivot.head() 62 | plt.figure(figsize=(12, 8)) 63 | sn.heatmap(dfG_pivot,cmap='Blues') 64 | plt.tight_layout() 65 | plt.savefig( 'date_time_power_heatmap.png', dpi=300) 66 | plt.show() 67 | 68 | # correlation plot 69 | names = df.columns 70 | correlations = df.corr() 71 | # plot correlation matrix 72 | fig = plt.figure() 73 | ax = fig.add_subplot(111) 74 | cax = ax.matshow(correlations, vmin=-1, vmax=1,cmap='Accent') 75 | fig.colorbar(cax) 76 | ticks = np.arange(0,7,1) 77 | ax.set_xticks(ticks) 78 | ax.set_yticks(ticks) 79 | ax.set_xticklabels(names, rotation = 90) 80 | ax.set_yticklabels(names) 81 | #plt.figure(figsize=(8,8)) 82 | plt.tight_layout() 83 | plt.savefig('correlation.png', dpi=300) 84 | plt.show() 85 | -------------------------------------------------------------------------------- /Gpower_Arima_Main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from util import timeseries_plot, bucket_avg, preprocess, config_plot 3 | from myArima import * 4 | 5 | config_plot() 6 | 7 | 8 | # we focus on the last 10 days data in Nov 2010 9 | N_rows = 15000 10 | parse_dates = [['Date', 'Time']] 11 | filename = "household_power_consumption.txt" 12 | 13 | df = preprocess(N_rows, parse_dates, filename) 14 | 15 | G_power = pd.to_numeric(df["Global_active_power"]) 
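# Note: preprocess() (see util.py) keeps only the last N_rows of the raw file,
# merges the 'Date' and 'Time' columns into a DatetimeIndex, replaces '?' entries
# with NaN, drops those rows, and casts the remaining columns to float.
# bucket_avg() (also in util.py) is a resample-and-mean; it is roughly equivalent
# to the following sketch for a pandas Series `ts` with a DatetimeIndex:
#     ts.resample("30T").mean()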
16 | # time series plot of one-minute sampling rate data 17 | timeseries_plot(G_power, 'g', 'Global_active_power') 18 | 19 | # we take a 30-minute bucket average of our time series data to reduce noise. 20 | bucket_size = "30T" 21 | G_power_avg = bucket_avg(G_power, bucket_size) 22 | # plot of the 30-minute average. 23 | ts_label = 'G_power_avg' 24 | timeseries_plot(G_power_avg, 'g', ts_label) 25 | 26 | 27 | # "Grid search" of seasonal ARIMA model. 28 | # the seasonal periodicity is 24 hours, i.e. S = 24*60/30 = 48 samples 29 | arima_para = {} 30 | arima_para['p'] = range(2) 31 | arima_para['d'] = range(2) 32 | arima_para['q'] = range(2) 33 | # the seasonal periodicity is 24 hours 34 | seasonal_para = round(24 * 60 / (float(bucket_size[:-1]))) 35 | arima = Arima_Class(arima_para, seasonal_para) 36 | 37 | arima.fit(G_power_avg) 38 | 39 | # Prediction on observed data starting on pred_start 40 | # observed and prediction starting dates in plots 41 | plot_start = '2010-11-24 00:00:00' 42 | pred_start = '2010-11-25 14:00:00' 43 | 44 | # One-step ahead forecasts 45 | dynamic = False 46 | arima.pred(G_power_avg, plot_start, pred_start, dynamic, ts_label) 47 | 48 | # Dynamic forecasts 49 | dynamic = True 50 | arima.pred(G_power_avg, plot_start, pred_start, dynamic, ts_label) 51 | 52 | # Forecasts to unseen future data 53 | n_steps = 100 # next 100 * 30 min = 50 hours 54 | arima.forcast(G_power_avg, n_steps, ts_label) 55 | -------------------------------------------------------------------------------- /Gpower_Xgb_Main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from util import * 4 | from myXgb import * 5 | import matplotlib.pyplot as plt 6 | import xgboost as xgb 7 | from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV 8 | from xgboost.sklearn import XGBRegressor # wrapper 9 | import scipy.stats as st 10 | 11 | config_plot() 12 | 13 | 14 | ############################################################################## 15 | # we only focus on the last 18000 points for datetime information 16 | # Run xgboost on all features 17 | # get data 18 | N_rows = 18000 19 | parse_dates = [['Date', 'Time']] 20 | filename = "household_power_consumption.txt" 21 | encode_cols = ['Month', 'DayofWeek', 'Hour'] 22 | 23 | df = preprocess(N_rows, parse_dates, filename) 24 | # keep all features 25 | df = date_transform(df, encode_cols) 26 | 27 | # base parameters 28 | xgb_params = { 29 | 'booster': 'gbtree', 30 | 'objective': 'reg:linear', # regression task 31 | 'subsample': 0.80, # 80% of data to grow trees and prevent overfitting 32 | 'colsample_bytree': 0.85, # 85% of features used 33 | 'eta': 0.1, 34 | 'max_depth': 10, 35 | 'seed': 42} # for reproducible results 36 | 37 | val_ratio = 0.3 38 | ntree = 300 39 | early_stop = 50 40 | 41 | print('-----Xgboost Using All Numeric Features-----', 42 | '\n---initial model feature importance---') 43 | fig_allFeatures = xgb_importance( 44 | df, val_ratio, xgb_params, ntree, early_stop, 'All Features') 45 | plt.show() 46 | 47 | ############################################################################# 48 | # xgboost using only datetime information 49 | bucket_size = "5T" 50 | df = preprocess(N_rows, parse_dates, filename) 51 | G_power = df["Global_active_power"] 52 | 53 | df = pd.DataFrame(bucket_avg(G_power, bucket_size)) 54 | df.dropna(inplace=True) 55 | df.index[-1] # last time step: 2010-11-26 21:00:00 56 | 57 | test_start_date = '2010-11-25 20:00:00'
58 | unseen_start_date = '2010-11-26 21:10:00' 59 | steps = 200 60 | # get split data 61 | df_unseen, df_test, df = xgb_data_split( 62 | df, bucket_size, unseen_start_date, steps, test_start_date, encode_cols) 63 | print('\n-----Xgboost on only datetime information---------\n') 64 | 65 | dim = {'train and validation data ': df.shape, 66 | 'test data ': df_test.shape, 67 | 'forecasting data ': df_unseen.shape} 68 | print(pd.DataFrame(list(dim.items()), columns=['Data', 'dimension'])) 69 | 70 | # train model 71 | Y = df.iloc[:, 0] 72 | X = df.iloc[:, 1:] 73 | X_train, X_val, y_train, y_val = train_test_split(X, Y, 74 | test_size=val_ratio, 75 | random_state=42) 76 | 77 | X_test = xgb.DMatrix(df_test.iloc[:, 1:]) 78 | Y_test = df_test.iloc[:, 0] 79 | X_unseen = xgb.DMatrix(df_unseen) 80 | 81 | dtrain = xgb.DMatrix(X_train, y_train) 82 | dval = xgb.DMatrix(X_val, y_val) 83 | watchlist = [(dtrain, 'train'), (dval, 'validate')] 84 | 85 | # Randomized hyperparameter search using the scikit-learn wrapper 86 | params_sk = { 87 | 'objective': 'reg:linear', 88 | 'subsample': 0.8, 89 | 'colsample_bytree': 0.85, 90 | 'seed': 42} 91 | 92 | skrg = XGBRegressor(**params_sk) 93 | 94 | skrg.fit(X_train, y_train) 95 | 96 | params_grid = {"n_estimators": st.randint(100, 500), 97 | # "colsample_bytree": st.beta(10, 1), 98 | # "subsample": st.beta(10, 1), 99 | # "gamma": st.uniform(0, 10), 100 | # 'reg_alpha': st.expon(0, 50), 101 | # "min_child_weight": st.expon(0, 50), 102 | # "learning_rate": st.uniform(0.06, 0.12), 103 | 'max_depth': st.randint(6, 30) 104 | } 105 | search_sk = RandomizedSearchCV( 106 | skrg, params_grid, cv=5, random_state=1, n_iter=20) # 5-fold cross validation 107 | search_sk.fit(X, Y) 108 | 109 | # best parameters 110 | print("best parameters:", search_sk.best_params_); print( 111 | "best score:", search_sk.best_score_) 112 | # retrain with the new parameters 113 | params_new = {**params_sk, **search_sk.best_params_} 114 | 115 | model_final = xgb.train(params_new, dtrain, evals=watchlist, 116 | early_stopping_rounds=early_stop, verbose_eval=True) 117 | 118 | print('-----Xgboost Using Datetime Features Only------', 119 | '\n---Grid Search model feature importance---') 120 | importance = model_final.get_fscore() 121 | importance_sorted = sorted(importance.items(), key=operator.itemgetter(1)) 122 | fig1 = feature_importance_plot(importance_sorted, 'feature importance') 123 | plt.show() 124 | 125 | ############################################################################# 126 | # Forecasting 127 | # predictions on the test data 128 | Y_hat = model_final.predict(X_test) 129 | Y_hat = pd.DataFrame(Y_hat, index=Y_test.index, columns=["predicted"]) 130 | 131 | # predictions on unseen future data 132 | unseen_y = model_final.predict(X_unseen) 133 | forecasts = pd.DataFrame( 134 | unseen_y, index=df_unseen.index, columns=["forecasts"]) 135 | 136 | # plot forecast results using the grid search final model 137 | plot_start = '2010-11-24 00:00:00' 138 | print('-----Xgboost Using Datetime Features Only------', 139 | '\n---Forecasting from Grid Search---') 140 | forecasts_plot2 = xgb_forecasts_plot( 141 | plot_start, Y, Y_test, Y_hat, forecasts, 'Grid Search') 142 | 143 | # forecast results using the initial model 144 | xgb_model = xgb.train(xgb_params, dtrain, ntree, evals=watchlist, 145 | early_stopping_rounds=early_stop, verbose_eval=False) 146 | Y_hat = xgb_model.predict(X_test) 147 | Y_hat = pd.DataFrame(Y_hat, index=Y_test.index, columns=["test_predicted"]) 148 | unseen_y = xgb_model.predict(X_unseen) 149 | forecasts = pd.DataFrame( 150 | unseen_y, index=df_unseen.index, columns=["forecasts"])
151 | plot_start = '2010-11-24 00:00:00' 152 | print('-----Xgboost Using Datetime Features Only------', 153 | '\n---Forecasting from the initial model---') 154 | forecasts_plot1 = xgb_forecasts_plot( 155 | plot_start, Y, Y_test, Y_hat, forecasts, 'Initial Model') 156 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time Series Prediction for Individual Household Power 2 | Dataset: https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption 3 | 4 | The data was collected at a one-minute sampling rate over the period between Dec 2006 5 | and Nov 2010 (47 months). Six independent variables (electrical quantities and sub-metering values) and a numerical dependent variable, Global active power, are available, for a total of 2,075,259 observations. Our goal is to predict Global active power into the future. 6 | 7 | Here, missing values are dropped for simplicity. Furthermore, we find that not all observations are ordered by date and time, so we analyze the data with an explicit timestamp as the index. In the preprocessing step, we perform a bucket average of the raw data to reduce the noise from the one-minute sampling rate. For simplicity, we only focus on the last 18000 rows of the raw dataset (the most recent data, from Nov 2010). 8 | 9 | ### A list of Python files: 10 | + *Gpower_Arima_Main.py* : The **executable** Python program of a univariate ARIMA model. 11 | + myArima.py : implements a class with some callable methods used for the ARIMA model. 12 | + *Gpower_Xgb_Main.py* : The **executable** Python program of a tree-based model (xgboost). 13 | + myXgb.py : implements some functions used for the xgboost model. 14 | + *lstm_Main.py* : The **executable** Python program of an LSTM model. 15 | + lstm.py : implements a class of a time series model using an LSTMCell. Credit goes to https://github.com/hzy46/TensorFlow-Time-Series-Examples/blob/master/train_lstm.py 16 | + util.py : implements various functions for data preprocessing. 17 | + Exploratory_analysis.py : exploratory analysis and plots of the data. 18 | ```diff 19 | + Environment : Python 3.6, TensorFlow1.4. 20 | ``` 21 | ### Here, I used 3 different approaches to model the pattern of power consumption. 22 | - **Univariate time series ARIMA**. (A 30-min average was applied to the data to reduce noise.) 23 | ![onestep](https://user-images.githubusercontent.com/25689659/34470019-001ea4e0-eef7-11e7-822a-5a5132e8ca75.png) 24 | ![dynamic](https://user-images.githubusercontent.com/25689659/34470018-0011600a-eef7-11e7-89df-79372c49a791.png) 25 | ![forecast](https://user-images.githubusercontent.com/25689659/34470017-0004e848-eef7-11e7-9148-abfb62f95dcc.png) 26 | - **Regression tree-based xgboost**. (A 5-min average was applied.) 27 | ![xgbManual](https://user-images.githubusercontent.com/25689659/34470022-00463b90-eef7-11e7-8a3c-d80df291f7d6.png) 28 | - **Recurrent neural network univariate LSTM (long short-term memory) model**. (A 15-min average was applied to reduce noise.)
29 | ![predict_result](https://user-images.githubusercontent.com/25689659/34470791-a5047402-ef07-11e7-9111-ff1da558b6e1.png) 30 | 31 | ### Possible approaches for future work: 32 | #### (i) Dynamic Regression Time Series Model 33 | Given the strong correlations between Sub metering 1, Sub metering 2, Sub metering 3 and our target variable, 34 | these variables could be included in a dynamic regression model or a regression time series model. 35 | 36 | #### (ii) Dynamic Xgboost Model 37 | Include timestep-shifted Global active power columns as features. The target variable is the current Global active power. 38 | The recent history of Global active power up to that timestamp (say, the previous 100 timesteps) would be included 39 | as extra features. 40 | 41 | #### (iii) Multivariate LSTM 42 | Feed the per-timestamp features Sub metering 1, Sub metering 2, Sub metering 3, date, time and our target variable into the RNNCell for a multivariate time-series LSTM model. 43 | ![multivariate](https://user-images.githubusercontent.com/25689659/35536009-86ac3612-0513-11e8-9ccd-4311dff198ee.png) 44 | -------------------------------------------------------------------------------- /lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | # TensorFlow1.4 20 | import tensorflow as tf 21 | 22 | from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators 23 | from tensorflow.contrib.timeseries.python.timeseries import model as ts_model 24 | from tensorflow.contrib.timeseries.python.timeseries import NumpyReader 25 | 26 | 27 | ########################## 28 | 29 | class _LSTMModel(ts_model.SequentialTimeSeriesModel): 30 | """A time series model-building example using an RNNCell.""" 31 | 32 | def __init__(self, num_units, num_features, dtype=tf.float32): 33 | """Initialize/configure the model object. 34 | Note that we do not start graph building here. Rather, this object is a 35 | configurable factory for TensorFlow graphs which are run by an Estimator. 36 | Args: 37 | num_units: The number of units in the model's LSTMCell. 38 | num_features: The dimensionality of the time series (features per 39 | timestep). 40 | dtype: The floating point data type to use. 41 | """ 42 | super(_LSTMModel, self).__init__( 43 | # Pre-register the metrics we'll be outputting (just a mean here).
44 | train_output_names=["mean"], 45 | predict_output_names=["mean"], 46 | num_features=num_features, 47 | dtype=dtype) 48 | self._num_units = num_units 49 | # Filled in by initialize_graph() 50 | self._lstm_cell = None 51 | self._lstm_cell_run = None 52 | self._predict_from_lstm_output = None 53 | 54 | def initialize_graph(self, input_statistics): 55 | """Save templates for components, which can then be used repeatedly. 56 | This method is called every time a new graph is created. It's safe to start 57 | adding ops to the current default graph here, but the graph should be 58 | constructed from scratch. 59 | Args: 60 | input_statistics: A math_utils.InputStatistics object. 61 | """ 62 | super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics) 63 | self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units) 64 | # Create templates so we don't have to worry about variable reuse. 65 | self._lstm_cell_run = tf.make_template( 66 | name_="lstm_cell", 67 | func_=self._lstm_cell, 68 | create_scope_now_=True) 69 | # Transforms LSTM output into mean predictions. 70 | self._predict_from_lstm_output = tf.make_template( 71 | name_="predict_from_lstm_output", 72 | func_=lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features), 73 | create_scope_now_=True) 74 | 75 | def get_start_state(self): 76 | """Return initial state for the time series model.""" 77 | return ( 78 | # Keeps track of the time associated with this state for error checking. 79 | tf.zeros([], dtype=tf.int64), 80 | # The previous observation or prediction. 81 | tf.zeros([self.num_features], dtype=self.dtype), 82 | # The state of the RNNCell (batch dimension removed since this parent 83 | # class will broadcast). 84 | [tf.squeeze(state_element, axis=0) 85 | for state_element 86 | in self._lstm_cell.zero_state(batch_size=1, dtype=self.dtype)]) 87 | 88 | def _transform(self, data): 89 | """Normalize data based on input statistics to encourage stable training.""" 90 | mean, variance = self._input_statistics.overall_feature_moments 91 | return (data - mean) / variance 92 | 93 | def _de_transform(self, data): 94 | """Transform data back to the input scale.""" 95 | mean, variance = self._input_statistics.overall_feature_moments 96 | return data * variance + mean 97 | 98 | def _filtering_step(self, current_times, current_values, state, predictions): 99 | """Update model state based on observations. 100 | Note that we don't do much here aside from computing a loss. In this case 101 | it's easier to update the RNN state in _prediction_step, since that covers 102 | running the RNN both on observations (from this method) and our own 103 | predictions. This distinction can be important for probabilistic models, 104 | where repeatedly predicting without filtering should lead to low-confidence 105 | predictions. 106 | Args: 107 | current_times: A [batch size] integer Tensor. 108 | current_values: A [batch size, self.num_features] floating point Tensor 109 | with new observations. 110 | state: The model's state tuple. 111 | predictions: The output of the previous `_prediction_step`. 112 | Returns: 113 | A tuple of new state and a predictions dictionary updated to include a 114 | loss (note that we could also return other measures of goodness of fit, 115 | although only "loss" will be optimized). 
116 | """ 117 | state_from_time, prediction, lstm_state = state 118 | with tf.control_dependencies( 119 | [tf.assert_equal(current_times, state_from_time)]): 120 | transformed_values = self._transform(current_values) 121 | # Use mean squared error across features for the loss. 122 | predictions["loss"] = tf.reduce_mean( 123 | (prediction - transformed_values) ** 2, axis=-1) 124 | # Keep track of the new observation in model state. It won't be run 125 | # through the LSTM until the next _imputation_step. 126 | new_state_tuple = (current_times, transformed_values, lstm_state) 127 | return (new_state_tuple, predictions) 128 | 129 | def _prediction_step(self, current_times, state): 130 | """Advance the RNN state using a previous observation or prediction.""" 131 | _, previous_observation_or_prediction, lstm_state = state 132 | lstm_output, new_lstm_state = self._lstm_cell_run( 133 | inputs=previous_observation_or_prediction, state=lstm_state) 134 | next_prediction = self._predict_from_lstm_output(lstm_output) 135 | new_state_tuple = (current_times, next_prediction, new_lstm_state) 136 | return new_state_tuple, {"mean": self._de_transform(next_prediction)} 137 | 138 | def _imputation_step(self, current_times, state): 139 | """Advance model state across a gap.""" 140 | # Does not do anything special if we're jumping across a gap. More advanced 141 | # models, especially probabilistic ones, would want a special case that 142 | # depends on the gap size. 143 | return state 144 | 145 | def _exogenous_input_step( 146 | self, current_times, current_exogenous_regressors, state): 147 | """Update model state based on exogenous regressors.""" 148 | raise NotImplementedError( 149 | "Exogenous inputs are not implemented for this example.") 150 | 151 | ############################################################################## 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /lstm_Main.py: -------------------------------------------------------------------------------- 1 | # TensorFlow1.4 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators 10 | from tensorflow.contrib.timeseries.python.timeseries import model as ts_model 11 | from tensorflow.contrib.timeseries.python.timeseries import NumpyReader 12 | 13 | from lstm import _LSTMModel 14 | from util import preprocess, bucket_avg 15 | import pandas as pd 16 | import numpy as np 17 | import matplotlib 18 | matplotlib.use("agg") 19 | import matplotlib.pyplot as plt 20 | 21 | # get LSTM data 22 | def get_rnn_data(N_rows, bucket_size): 23 | parse_dates = [['Date', 'Time']] 24 | filename = "household_power_consumption.txt" 25 | df = preprocess(N_rows, parse_dates, filename) 26 | df = pd.DataFrame(bucket_avg(df["Global_active_power"], bucket_size)) 27 | df.dropna(inplace=True) 28 | x = np.array(range(df.shape[0])) 29 | y = np.array(df.Global_active_power) 30 | return x, y 31 | 32 | if __name__ == '__main__': 33 | tf.logging.set_verbosity(tf.logging.INFO) 34 | x, y = get_rnn_data(18000, "15T") 35 | x_train, y_train = x[:900],y[:900] # first 900 data points for training 36 | x_eval, y_eval = x[900:],y[900:] # last 300 data points for evaluation 37 | 38 | data_train = { 39 | tf.contrib.timeseries.TrainEvalFeatures.TIMES: x_train, 40 | tf.contrib.timeseries.TrainEvalFeatures.VALUES: y_train, 41 | } 42 | 43 | 
data_eval = { 44 | tf.contrib.timeseries.TrainEvalFeatures.TIMES: x_eval, 45 | tf.contrib.timeseries.TrainEvalFeatures.VALUES: y_eval, 46 | } 47 | 48 | reader = NumpyReader(data_train) 49 | reader_eval = NumpyReader(data_eval) 50 | 51 | train_input_fn = tf.contrib.timeseries.RandomWindowInputFn( 52 | reader, batch_size=4, window_size=100) 53 | 54 | estimator = ts_estimators.TimeSeriesRegressor( 55 | model=_LSTMModel(num_features=1, num_units=128), 56 | optimizer=tf.train.AdamOptimizer(0.001)) 57 | 58 | estimator.train(input_fn=train_input_fn, steps=2000) 59 | evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader_eval) 60 | evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1) 61 | # Predict starting after the evaluation 62 | (predictions,) = tuple(estimator.predict( 63 | input_fn=tf.contrib.timeseries.predict_continuation_input_fn( 64 | evaluation, steps=200))) 65 | 66 | observed_times = x 67 | observed = y 68 | evaluated_times = evaluation["times"][0] 69 | evaluated = evaluation["mean"][0] 70 | predicted_times = predictions['times'] 71 | predicted = predictions["mean"] 72 | 73 | plt.figure(figsize=(15, 8)) 74 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 75 | observed_lines = plt.plot(observed_times, observed, label="observation", color="k") 76 | evaluated_lines = plt.plot(evaluated_times, evaluated, label="evaluation", color="g") 77 | predicted_lines = plt.plot(predicted_times, predicted, label="forecasts", color="r") 78 | xlim = ((0,1401)) 79 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]], 80 | loc="upper left") 81 | plt.tight_layout() 82 | plt.savefig('predict_result.png',dpi=300) 83 | -------------------------------------------------------------------------------- /lstm_multivariate_Main.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from util import preprocess, bucket_avg 6 | 7 | import tensorflow as tf 8 | 9 | from tensorflow.contrib.timeseries.python.timeseries import NumpyReader 10 | 11 | from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators 12 | from tensorflow.contrib.timeseries.python.timeseries import model as ts_model 13 | 14 | from lstm import _LSTMModel 15 | import numpy as np 16 | import pandas as pd 17 | import matplotlib 18 | matplotlib.use("agg") 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | 23 | # get data 24 | 25 | def get_rnn_data(N_rows, bucket_size): 26 | parse_dates = [['Date', 'Time']] 27 | filename = "household_power_consumption.txt" 28 | df = preprocess(N_rows, parse_dates, filename) 29 | global_power=pd.DataFrame(bucket_avg(df["Global_active_power"], bucket_size)) 30 | sub1=pd.DataFrame(bucket_avg(df["Sub_metering_1"], bucket_size)) 31 | sub2=pd.DataFrame(bucket_avg(df["Sub_metering_2"], bucket_size)) 32 | sub3=pd.DataFrame(bucket_avg(df["Sub_metering_3"], bucket_size)) 33 | 34 | #df.dropna(inplace=True) 35 | #df.iloc[-1, :].index # last time step #2010-11-26 21:00:00 36 | x = np.array(range(global_power.shape[0])) 37 | y = np.column_stack((sub1, sub2, sub3, global_power)) 38 | return x, y 39 | 40 | 41 | if __name__ == '__main__': 42 | tf.logging.set_verbosity(tf.logging.INFO) 43 | x, y = get_rnn_data(18000, "15T") 44 | data = { 45 | tf.contrib.timeseries.TrainEvalFeatures.TIMES: x, 46 | tf.contrib.timeseries.TrainEvalFeatures.VALUES: y, 47 | } 48 | reader = 
NumpyReader(data) 49 | train_input_fn = tf.contrib.timeseries.RandomWindowInputFn( 50 | reader, batch_size=4, window_size=100) 51 | 52 | estimator = ts_estimators.TimeSeriesRegressor( 53 | model=_LSTMModel(num_features=4, num_units=128), 54 | optimizer=tf.train.AdamOptimizer(0.001)) 55 | 56 | estimator.train(input_fn=train_input_fn, steps=1000) 57 | evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader) 58 | evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1) 59 | # Predict starting after the evaluation 60 | (predictions,) = tuple(estimator.predict( 61 | input_fn=tf.contrib.timeseries.predict_continuation_input_fn( 62 | evaluation, steps=100))) 63 | 64 | observed_times = evaluation["times"][0] 65 | observed = evaluation["observed"][0, :, :] 66 | evaluated_times = evaluation["times"][0] 67 | evaluated = evaluation["mean"][0] 68 | predicted_times = predictions['times'] 69 | predicted = predictions["mean"] 70 | #plot all 4 variables 71 | plt.figure(figsize=(15, 16)) 72 | plt.subplot(411) 73 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 74 | observed_lines = plt.plot(observed_times, observed[:,3], label="observation", color="k") 75 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,3], label="evaluation", color="g") 76 | predicted_lines = plt.plot(predicted_times, predicted[:,3], label="prediction", color="r") 77 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 78 | plt.title("Global active power") 79 | 80 | plt.subplot(412) 81 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 82 | observed_lines = plt.plot(observed_times, observed[:,0], label="observation", color="k") 83 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,0], label="evaluation", color="g") 84 | predicted_lines = plt.plot(predicted_times, predicted[:,0], label="prediction", color="r") 85 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 86 | plt.title("Sub_metering 1") 87 | 88 | plt.subplot(413) 89 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 90 | observed_lines = plt.plot(observed_times, observed[:,1], label="observation", color="k") 91 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,1], label="evaluation", color="g") 92 | predicted_lines = plt.plot(predicted_times, predicted[:,1], label="prediction", color="r") 93 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 94 | plt.title("Sub_metering 2") 95 | 96 | plt.subplot(414) 97 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 98 | observed_lines = plt.plot(observed_times, observed[:,2], label="observation", color="k") 99 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,2], label="evaluation", color="g") 100 | predicted_lines = plt.plot(predicted_times, predicted[:,2], label="prediction", color="r") 101 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 102 | plt.title("Sub_metering 3") 103 | 104 | plt.savefig('multivariate.png',dpi=300) -------------------------------------------------------------------------------- /myArima.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import statsmodels.api as sm 7 | 8 | 9 | class Arima_Class: 10 | def __init__(self, arima_para, seasonal_para): 11 | # 
Define the p, d and q parameters in Arima(p,d,q)(P,D,Q) models 12 | p = arima_para['p'] 13 | d = arima_para['d'] 14 | q = arima_para['q'] 15 | # Generate all different combinations of p, d and q triplets 16 | self.pdq = list(itertools.product(p, d, q)) 17 | # Generate all different combinations of seasonal p, d and q triplets 18 | self.seasonal_pdq = [(x[0], x[1], x[2], seasonal_para) 19 | for x in list(itertools.product(p, d, q))] 20 | 21 | def fit(self, ts): 22 | warnings.filterwarnings("ignore") 23 | results_list = [] 24 | for param in self.pdq: 25 | for param_seasonal in self.seasonal_pdq: 26 | try: 27 | mod = sm.tsa.statespace.SARIMAX(ts, 28 | order=param, 29 | seasonal_order=param_seasonal, 30 | enforce_stationarity=False, 31 | enforce_invertibility=False) 32 | results = mod.fit() 33 | 34 | print('ARIMA{}x{}seasonal - AIC:{}'.format(param, 35 | param_seasonal, results.aic)) 36 | results_list.append([param, param_seasonal, results.aic]) 37 | except: 38 | continue 39 | results_list = np.array(results_list) 40 | lowest_AIC = np.argmin(results_list[:, 2]) 41 | print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++') 42 | print('ARIMA{}x{}seasonal with lowest_AIC:{}'.format( 43 | results_list[lowest_AIC, 0], results_list[lowest_AIC, 1], results_list[lowest_AIC, 2])) 44 | print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++') 45 | 46 | mod = sm.tsa.statespace.SARIMAX(ts, 47 | order=results_list[lowest_AIC, 0], 48 | seasonal_order=results_list[lowest_AIC, 1], 49 | enforce_stationarity=False, 50 | enforce_invertibility=False) 51 | self.final_result = mod.fit() 52 | print('Final model summary:') 53 | print(self.final_result.summary().tables[1]) 54 | print('Final model diagnostics:') 55 | self.final_result.plot_diagnostics(figsize=(15, 12)) 56 | plt.tight_layout() 57 | plt.savefig('model_diagnostics.png', dpi=300) 58 | plt.show() 59 | 60 | def pred(self, ts, plot_start, pred_start, dynamic, ts_label): 61 | 62 | pred_dynamic = self.final_result.get_prediction( 63 | start=pd.to_datetime(pred_start), dynamic=dynamic, full_results=True) 64 | pred_dynamic_ci = pred_dynamic.conf_int() 65 | ax = ts[plot_start:].plot(label='observed', figsize=(15, 10)) 66 | 67 | if dynamic == False: 68 | pred_dynamic.predicted_mean.plot( 69 | label='One-step ahead Forecast', ax=ax) 70 | else: 71 | pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax) 72 | 73 | ax.fill_between(pred_dynamic_ci.index, 74 | pred_dynamic_ci.iloc[:, 0], 75 | pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25) 76 | ax.fill_betweenx(ax.get_ylim(), pd.to_datetime(plot_start), ts.index[-1], 77 | alpha=.1, zorder=-1) 78 | ax.set_xlabel('Time') 79 | ax.set_ylabel(ts_label) 80 | plt.legend() 81 | plt.tight_layout() 82 | if dynamic == False: 83 | plt.savefig(ts_label + '_one_step_pred.png', dpi=300) 84 | else: 85 | plt.savefig(ts_label + '_dynamic_pred.png', dpi=300) 86 | plt.show() 87 | 88 | def forcast(self, ts, n_steps, ts_label): 89 | # Get forecast n_steps ahead into the future 90 | pred_uc = self.final_result.get_forecast(steps=n_steps) 91 | 92 | # Get confidence intervals of forecasts 93 | pred_ci = pred_uc.conf_int() 94 | ax = ts.plot(label='observed', figsize=(15, 10)) 95 | pred_uc.predicted_mean.plot(ax=ax, label='Forecast in Future') 96 | ax.fill_between(pred_ci.index, 97 | pred_ci.iloc[:, 0], 98 | pred_ci.iloc[:, 1], color='k', alpha=.25) 99 | ax.set_xlabel('Time') 100 | ax.set_ylabel(ts_label) 101 | plt.tight_layout() 102 | plt.savefig(ts_label + '_forcast.png', dpi=300) 103 |
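        # Note: the legend is added after plt.savefig() here, so the saved PNG
        # omits it (in pred() above, plt.legend() is called before saving).
        # The shaded band drawn by fill_between comes from pred_ci, the
        # lower/upper forecast confidence bounds returned by conf_int().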
plt.legend() 104 | plt.show() 105 | -------------------------------------------------------------------------------- /myXgb.py: -------------------------------------------------------------------------------- 1 | from util import * 2 | from sklearn.model_selection import train_test_split 3 | import pandas as pd 4 | import xgboost as xgb 5 | import operator 6 | import matplotlib.pyplot as plt 7 | 8 | # get data for train, test, and forecast(unseen) 9 | 10 | 11 | def xgb_data_split(df, bucket_size, unseen_start_date, steps, test_start_date, encode_cols): 12 | # generate unseen data 13 | unseen = get_unseen_data(unseen_start_date, steps, 14 | encode_cols, bucket_size) 15 | df = pd.concat([df, unseen], axis=0) 16 | df = date_transform(df, encode_cols) 17 | 18 | # data for forecast ,skip the connecting point 19 | df_unseen = df[unseen_start_date:].iloc[:, 1:] 20 | test_start = '2010-11-26 00:00:00' 21 | # skip the connecting point 22 | df_test = df[test_start_date: unseen_start_date].iloc[:-1, :] 23 | df_train = df[:test_start_date] 24 | return df_unseen, df_test, df_train 25 | 26 | 27 | def feature_importance_plot(importance_sorted, title): 28 | df = pd.DataFrame(importance_sorted, columns=['feature', 'fscore']) 29 | df['fscore'] = df['fscore'] / df['fscore'].sum() 30 | 31 | plt.figure() 32 | # df.plot() 33 | df.plot(kind='barh', x='feature', y='fscore', 34 | legend=False, figsize=(12, 10)) 35 | plt.title('XGBoost Feature Importance') 36 | plt.xlabel('relative importance') 37 | plt.tight_layout() 38 | plt.savefig(title + '.png', dpi=300) 39 | plt.show() 40 | 41 | 42 | def xgb_importance(df, test_ratio, xgb_params, ntree, early_stop, plot_title): 43 | df = pd.DataFrame(df) 44 | # split the data into train/test set 45 | Y = df.iloc[:, 0] 46 | X = df.iloc[:, 1:] 47 | X_train, X_test, y_train, y_test = train_test_split(X, Y, 48 | test_size=test_ratio, 49 | random_state=42) 50 | 51 | dtrain = xgb.DMatrix(X_train, y_train) 52 | dtest = xgb.DMatrix(X_test, y_test) 53 | 54 | watchlist = [(dtrain, 'train'), (dtest, 'validate')] 55 | 56 | xgb_model = xgb.train(xgb_params, dtrain, ntree, evals=watchlist, 57 | early_stopping_rounds=early_stop, verbose_eval=True) 58 | 59 | importance = xgb_model.get_fscore() 60 | importance_sorted = sorted(importance.items(), key=operator.itemgetter(1)) 61 | feature_importance_plot(importance_sorted, plot_title) 62 | 63 | 64 | def xgb_forecasts_plot(plot_start, Y, Y_test, Y_hat, forecasts, title): 65 | Y = pd.concat([Y, Y_test]) 66 | ax = Y[plot_start:].plot(label='observed', figsize=(15, 10)) 67 | #Y_test.plot(label='test_observed', ax=ax) 68 | Y_hat.plot(label="predicted", ax=ax) 69 | forecasts.plot(label="forecast", ax=ax) 70 | 71 | ax.fill_betweenx(ax.get_ylim(), pd.to_datetime(Y_test.index[0]), Y_test.index[-1], 72 | alpha=.1, zorder=-1) 73 | ax.set_xlabel('Time') 74 | ax.set_ylabel('Global Active Power') 75 | plt.legend() 76 | plt.tight_layout() 77 | plt.savefig(title + '.png', dpi=300) 78 | plt.show() 79 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from matplotlib import dates 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def preprocess(N_rows, parse_dates, filename): 8 | total_rows = sum(1 for l in open(filename)) 9 | variable_names = pd.read_csv( 10 | filename, header=0, delimiter=';', sep='', nrows=5) 11 | df = pd.read_csv(filename, header=0, delimiter=';', sep='', 
names=variable_names.columns, 12 | parse_dates=parse_dates, index_col=0, nrows=N_rows, skiprows=total_rows - N_rows) 13 | df_no_na = df.replace('?', np.NaN) 14 | df_no_na.dropna(inplace=True) 15 | return df_no_na.astype(float) 16 | 17 | 18 | def timeseries_plot(y, color, y_label): 19 | # y is Series with index of datetime 20 | days = dates.DayLocator() 21 | dfmt_minor = dates.DateFormatter('%m-%d') 22 | weekday = dates.WeekdayLocator(byweekday=(), interval=1) 23 | 24 | fig, ax = plt.subplots() 25 | ax.xaxis.set_minor_locator(days) 26 | ax.xaxis.set_minor_formatter(dfmt_minor) 27 | 28 | ax.xaxis.set_major_locator(weekday) 29 | ax.xaxis.set_major_formatter(dates.DateFormatter('\n\n%a')) 30 | 31 | ax.set_ylabel(y_label) 32 | ax.plot(y.index, y, color) 33 | fig.set_size_inches(12, 8) 34 | plt.tight_layout() 35 | plt.savefig(y_label + '.png', dpi=300) 36 | plt.show() 37 | 38 | # average time series 39 | 40 | 41 | def bucket_avg(ts, bucket): 42 | # ts is Sereis with index 43 | # bucket =["30T","60T","M".....] 44 | y = ts.resample(bucket).mean() 45 | return y 46 | 47 | 48 | def config_plot(): 49 | plt.style.use('seaborn-paper') 50 | # plt.rcParams.update({'axes.prop_cycle': cycler(color='jet')}) 51 | plt.rcParams.update({'axes.titlesize': 20}) 52 | plt.rcParams['legend.loc'] = 'best' 53 | plt.rcParams.update({'axes.labelsize': 22}) 54 | plt.rcParams.update({'xtick.labelsize': 16}) 55 | plt.rcParams.update({'ytick.labelsize': 16}) 56 | plt.rcParams.update({'figure.figsize': (10, 6)}) 57 | plt.rcParams.update({'legend.fontsize': 20}) 58 | return 1 59 | 60 | 61 | # static xgboost 62 | # get one-hot encoder for features 63 | def date_transform(df, encode_cols): 64 | # extract a few features from datetime 65 | df['Year'] = df.index.year 66 | df['Month'] = df.index.month 67 | df['WeekofYear'] = df.index.weekofyear 68 | df['DayofWeek'] = df.index.weekday 69 | df['Hour'] = df.index.hour 70 | df['Minute'] = df.index.minute 71 | # one hot encoder for categorical variables 72 | for col in encode_cols: 73 | df[col] = df[col].astype('category') 74 | df = pd.get_dummies(df, columns=encode_cols) 75 | return df 76 | 77 | 78 | def get_unseen_data(unseen_start, steps, encode_cols, bucket_size): 79 | index = pd.date_range(unseen_start, 80 | periods=steps, freq=bucket_size) 81 | df = pd.DataFrame(pd.Series(np.zeros(steps), index=index), 82 | columns=['Global_active_power']) 83 | return df 84 | 85 | # dynamic xgboost 86 | # shift 2 steps for every lag 87 | 88 | 89 | def data_add_timesteps(data, column, lag): 90 | column = data[column] 91 | step_columns = [column.shift(i) for i in range(2, lag + 1, 2)] 92 | df_steps = pd.concat(step_columns, axis=1) 93 | # current Global_active_power is at first columns 94 | df = pd.concat([data, df_steps], axis=1) 95 | return df 96 | --------------------------------------------------------------------------------
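Note: `data_add_timesteps` in util.py is defined but not yet used by any of the executable scripts above. A minimal sketch (not part of the original scripts; the row count, bucket size and lag below are illustrative) of how it could build the lagged features described in the README's "Dynamic Xgboost Model" idea:

```python
# Hypothetical usage sketch -- not part of the executable scripts above.
import pandas as pd

from util import preprocess, bucket_avg, data_add_timesteps

parse_dates = [['Date', 'Time']]
df = preprocess(18000, parse_dates, "household_power_consumption.txt")

# 5-minute bucket average of the target, as in Gpower_Xgb_Main.py
power = pd.DataFrame(bucket_avg(df["Global_active_power"], "5T")).dropna()

# Append Global_active_power shifted by 2, 4, ..., 100 steps as extra columns;
# the unshifted first column remains the prediction target.
lagged = data_add_timesteps(power, 'Global_active_power', lag=100).dropna()
print(lagged.shape)
```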