├── Exploratory_analysis.py ├── Gpower_Arima_Main.py ├── Gpower_Xgb_Main.py ├── README.md ├── lstm.py ├── lstm_Main.py ├── lstm_multivariate_Main.py ├── myArima.py ├── myXgb.py └── util.py /Exploratory_analysis.py: -------------------------------------------------------------------------------- 1 | # basic + dates 2 | import numpy as np 3 | import pandas as pd 4 | from util import * 5 | import matplotlib.pyplot as plt 6 | import seaborn as sn 7 | 8 | 9 | 10 | 11 | parse_dates = [['Date', 'Time']] 12 | filename = "household_power_consumption.txt" 13 | encode_cols = ['Month', 'DayofWeek', 'Hour'] 14 | bucket_size="60T" 15 | 16 | # (1) data is not datetime ordered. It is random 17 | N_rows = 60000 18 | df = preprocess(N_rows, parse_dates, filename) 19 | 20 | G_power=df["Global_active_power"] 21 | #G_power_sort=G_power.sort_values('index') 22 | 23 | df = pd.DataFrame(bucket_avg(G_power,bucket_size)) 24 | df.dropna(inplace=True) 25 | df.Global_active_power.plot(style='b.') 26 | plt.ylabel('Global Active Power') 27 | plt.xlabel('Time') 28 | plt.title("Last 60000 rows") 29 | plt.savefig( 'EDA_2010.png', dpi=300) 30 | plt.show() 31 | 32 | 33 | ### (2)So we focus on the 2010-11 area which is around the last 19000 data rows 34 | N_rows = 19000 35 | df = preprocess(N_rows, parse_dates, filename) 36 | G_power=df["Global_active_power"] 37 | print(G_power.shape) # 21661 rows 38 | df_G = pd.DataFrame(bucket_avg(G_power,bucket_size)) 39 | df_G.dropna(inplace=True) 40 | 41 | # time series plot 42 | ts_label='Global_active_power_in_Nov_2010' 43 | timeseries_plot(df_G.Global_active_power,'g', ts_label) 44 | 45 | # component plot 46 | from pylab import rcParams 47 | import statsmodels.api as sm 48 | rcParams['figure.figsize'] = 11, 9 49 | decomposition = sm.tsa.seasonal_decompose(df_G.Global_active_power, model='additive') 50 | fig = decomposition.plot() 51 | plt.tight_layout() 52 | plt.savefig( 'ts_decomposition_plot.png', dpi=300) 53 | plt.show() 54 | 55 | 56 | # heatmap 57 | df_G['Time of Day'] = df_G.index.time 58 | df_G['Date'] = df_G.index.date 59 | print(df_G.head()) 60 | dfG_pivot = df_G.pivot_table(index="Date", columns="Time of Day",values='Global_active_power',fill_value=0) 61 | dfG_pivot.head() 62 | plt.figure(figsize=(12, 8)) 63 | sn.heatmap(dfG_pivot,cmap='Blues') 64 | plt.tight_layout() 65 | plt.savefig( 'date_time_power_heatmap.png', dpi=300) 66 | plt.show() 67 | 68 | # correlation plot 69 | names = df.columns 70 | correlations = df.corr() 71 | # plot correlation matrix 72 | fig = plt.figure() 73 | ax = fig.add_subplot(111) 74 | cax = ax.matshow(correlations, vmin=-1, vmax=1,cmap='Accent') 75 | fig.colorbar(cax) 76 | ticks = np.arange(0,7,1) 77 | ax.set_xticks(ticks) 78 | ax.set_yticks(ticks) 79 | ax.set_xticklabels(names, rotation = 90) 80 | ax.set_yticklabels(names) 81 | #plt.figure(figsize=(8,8)) 82 | plt.tight_layout() 83 | plt.savefig('correlation.png', dpi=300) 84 | plt.show() 85 | -------------------------------------------------------------------------------- /Gpower_Arima_Main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from util import timeseries_plot, bucket_avg, preprocess, config_plot 3 | from myArima import * 4 | 5 | config_plot() 6 | 7 | 8 | # we focus on the last 10 days data in Nov 2010 9 | N_rows = 15000 10 | parse_dates = [['Date', 'Time']] 11 | filename = "household_power_consumption.txt" 12 | 13 | df = preprocess(N_rows, parse_dates, filename) 14 | 15 | G_power = pd.to_numeric(df["Global_active_power"]) 
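# Note: preprocess() (see util.py) keeps only the last N_rows of the raw file,
# merges the 'Date' and 'Time' columns into a DatetimeIndex, replaces '?' entries
# with NaN, drops those rows, and casts the remaining columns to float.
# bucket_avg() (also in util.py) is a resample-and-mean; it is roughly equivalent
# to the following sketch for a pandas Series `ts` with a DatetimeIndex:
#     ts.resample("30T").mean()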
16 | # time series plot of one-minute sampling rate data 17 | timeseries_plot(G_power, 'g', 'Global_active_power') 18 | 19 | # we take a 30-minute bucket average of our time series data to reduce noise. 20 | bucket_size = "30T" 21 | G_power_avg = bucket_avg(G_power, bucket_size) 22 | # plot of the 30-minute average. 23 | ts_label = 'G_power_avg' 24 | timeseries_plot(G_power_avg, 'g', ts_label) 25 | 26 | 27 | # "Grid search" of seasonal ARIMA model. 28 | # the seasonal periodicity is 24 hours, i.e. S = 24*60/30 = 48 samples 29 | arima_para = {} 30 | arima_para['p'] = range(2) 31 | arima_para['d'] = range(2) 32 | arima_para['q'] = range(2) 33 | # the seasonal periodicity is 24 hours 34 | seasonal_para = round(24 * 60 / (float(bucket_size[:-1]))) 35 | arima = Arima_Class(arima_para, seasonal_para) 36 | 37 | arima.fit(G_power_avg) 38 | 39 | # Prediction on observed data starting on pred_start 40 | # observed and prediction starting dates in plots 41 | plot_start = '2010-11-24 00:00:00' 42 | pred_start = '2010-11-25 14:00:00' 43 | 44 | # One-step ahead forecasts 45 | dynamic = False 46 | arima.pred(G_power_avg, plot_start, pred_start, dynamic, ts_label) 47 | 48 | # Dynamic forecasts 49 | dynamic = True 50 | arima.pred(G_power_avg, plot_start, pred_start, dynamic, ts_label) 51 | 52 | # Forecasts to unseen future data 53 | n_steps = 100 # next 100 * 30 min = 50 hours 54 | arima.forcast(G_power_avg, n_steps, ts_label) 55 | -------------------------------------------------------------------------------- /Gpower_Xgb_Main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from util import * 4 | from myXgb import * 5 | import matplotlib.pyplot as plt 6 | import xgboost as xgb 7 | from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV 8 | from xgboost.sklearn import XGBRegressor # wrapper 9 | import scipy.stats as st 10 | 11 | config_plot() 12 | 13 | 14 | ############################################################################## 15 | # we only focus on the last 18000 points for datetime information 16 | # Run xgboost on all features 17 | # get data 18 | N_rows = 18000 19 | parse_dates = [['Date', 'Time']] 20 | filename = "household_power_consumption.txt" 21 | encode_cols = ['Month', 'DayofWeek', 'Hour'] 22 | 23 | df = preprocess(N_rows, parse_dates, filename) 24 | # keep all features 25 | df = date_transform(df, encode_cols) 26 | 27 | # base parameters 28 | xgb_params = { 29 | 'booster': 'gbtree', 30 | 'objective': 'reg:linear', # regression task 31 | 'subsample': 0.80, # 80% of data to grow trees and prevent overfitting 32 | 'colsample_bytree': 0.85, # 85% of features used 33 | 'eta': 0.1, 34 | 'max_depth': 10, 35 | 'seed': 42} # for reproducible results 36 | 37 | val_ratio = 0.3 38 | ntree = 300 39 | early_stop = 50 40 | 41 | print('-----Xgboost Using All Numeric Features-----', 42 | '\n---initial model feature importance---') 43 | fig_allFeatures = xgb_importance( 44 | df, val_ratio, xgb_params, ntree, early_stop, 'All Features') 45 | plt.show() 46 | 47 | ############################################################################# 48 | # xgboost using only datetime information 49 | bucket_size = "5T" 50 | df = preprocess(N_rows, parse_dates, filename) 51 | G_power = df["Global_active_power"] 52 | 53 | df = pd.DataFrame(bucket_avg(G_power, bucket_size)) 54 | df.dropna(inplace=True) 55 | df.index[-1] # last time step: 2010-11-26 21:00:00 56 | 57 | test_start_date = '2010-11-25 20:00:00'
58 | unseen_start_date = '2010-11-26 21:10:00' 59 | steps = 200 60 | # get split data 61 | df_unseen, df_test, df = xgb_data_split( 62 | df, bucket_size, unseen_start_date, steps, test_start_date, encode_cols) 63 | print('\n-----Xgboost on only datetime information---------\n') 64 | 65 | dim = {'train and validation data ': df.shape, 66 | 'test data ': df_test.shape, 67 | 'forecasting data ': df_unseen.shape} 68 | print(pd.DataFrame(list(dim.items()), columns=['Data', 'dimension'])) 69 | 70 | # train model 71 | Y = df.iloc[:, 0] 72 | X = df.iloc[:, 1:] 73 | X_train, X_val, y_train, y_val = train_test_split(X, Y, 74 | test_size=val_ratio, 75 | random_state=42) 76 | 77 | X_test = xgb.DMatrix(df_test.iloc[:, 1:]) 78 | Y_test = df_test.iloc[:, 0] 79 | X_unseen = xgb.DMatrix(df_unseen) 80 | 81 | dtrain = xgb.DMatrix(X_train, y_train) 82 | dval = xgb.DMatrix(X_val, y_val) 83 | watchlist = [(dtrain, 'train'), (dval, 'validate')] 84 | 85 | # Randomized hyperparameter search using the scikit-learn wrapper 86 | params_sk = { 87 | 'objective': 'reg:linear', 88 | 'subsample': 0.8, 89 | 'colsample_bytree': 0.85, 90 | 'seed': 42} 91 | 92 | skrg = XGBRegressor(**params_sk) 93 | 94 | skrg.fit(X_train, y_train) 95 | 96 | params_grid = {"n_estimators": st.randint(100, 500), 97 | # "colsample_bytree": st.beta(10, 1), 98 | # "subsample": st.beta(10, 1), 99 | # "gamma": st.uniform(0, 10), 100 | # 'reg_alpha': st.expon(0, 50), 101 | # "min_child_weight": st.expon(0, 50), 102 | # "learning_rate": st.uniform(0.06, 0.12), 103 | 'max_depth': st.randint(6, 30) 104 | } 105 | search_sk = RandomizedSearchCV( 106 | skrg, params_grid, cv=5, random_state=1, n_iter=20) # 5-fold cross validation 107 | search_sk.fit(X, Y) 108 | 109 | # best parameters 110 | print("best parameters:", search_sk.best_params_); print( 111 | "best score:", search_sk.best_score_) 112 | # retrain with the new parameters 113 | params_new = {**params_sk, **search_sk.best_params_} 114 | 115 | model_final = xgb.train(params_new, dtrain, evals=watchlist, 116 | early_stopping_rounds=early_stop, verbose_eval=True) 117 | 118 | print('-----Xgboost Using Datetime Features Only------', 119 | '\n---Grid Search model feature importance---') 120 | importance = model_final.get_fscore() 121 | importance_sorted = sorted(importance.items(), key=operator.itemgetter(1)) 122 | fig1 = feature_importance_plot(importance_sorted, 'feature importance') 123 | plt.show() 124 | 125 | ############################################################################# 126 | # Forecasting 127 | # predictions on the test data 128 | Y_hat = model_final.predict(X_test) 129 | Y_hat = pd.DataFrame(Y_hat, index=Y_test.index, columns=["predicted"]) 130 | 131 | # predictions on unseen future data 132 | unseen_y = model_final.predict(X_unseen) 133 | forecasts = pd.DataFrame( 134 | unseen_y, index=df_unseen.index, columns=["forecasts"]) 135 | 136 | # plot forecast results using the grid search final model 137 | plot_start = '2010-11-24 00:00:00' 138 | print('-----Xgboost Using Datetime Features Only------', 139 | '\n---Forecasting from Grid Search---') 140 | forecasts_plot2 = xgb_forecasts_plot( 141 | plot_start, Y, Y_test, Y_hat, forecasts, 'Grid Search') 142 | 143 | # forecast results using the initial model 144 | xgb_model = xgb.train(xgb_params, dtrain, ntree, evals=watchlist, 145 | early_stopping_rounds=early_stop, verbose_eval=False) 146 | Y_hat = xgb_model.predict(X_test) 147 | Y_hat = pd.DataFrame(Y_hat, index=Y_test.index, columns=["test_predicted"]) 148 | unseen_y = xgb_model.predict(X_unseen) 149 | forecasts = pd.DataFrame( 150 | unseen_y, index=df_unseen.index, columns=["forecasts"])
151 | plot_start = '2010-11-24 00:00:00' 152 | print('-----Xgboost Using Datetime Features Only------', 153 | '\n---Forecasting from the initial model---') 154 | forecasts_plot1 = xgb_forecasts_plot( 155 | plot_start, Y, Y_test, Y_hat, forecasts, 'Initial Model') 156 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time Series Prediction for Individual Household Power 2 | Dataset: https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption 3 | 4 | The data was collected at a one-minute sampling rate over the period between Dec 2006 5 | and Nov 2010 (47 months). Six independent variables (electrical quantities and sub-metering values) and a numerical dependent variable, Global active power, are available, for a total of 2,075,259 observations. Our goal is to predict Global active power into the future. 6 | 7 | Here, missing values are dropped for simplicity. Furthermore, we find that not all observations are ordered by date and time, so we analyze the data with an explicit timestamp as the index. In the preprocessing step, we perform a bucket average of the raw data to reduce the noise from the one-minute sampling rate. For simplicity, we only focus on the last 18000 rows of the raw dataset (the most recent data, from Nov 2010). 8 | 9 | ### A list of Python files: 10 | + *Gpower_Arima_Main.py* : The **executable** Python program of a univariate ARIMA model. 11 | + myArima.py : implements a class with some callable methods used for the ARIMA model. 12 | + *Gpower_Xgb_Main.py* : The **executable** Python program of a tree-based model (xgboost). 13 | + myXgb.py : implements some functions used for the xgboost model. 14 | + *lstm_Main.py* : The **executable** Python program of an LSTM model. 15 | + lstm.py : implements a class of a time series model using an LSTMCell. Credit goes to https://github.com/hzy46/TensorFlow-Time-Series-Examples/blob/master/train_lstm.py 16 | + util.py : implements various functions for data preprocessing. 17 | + Exploratory_analysis.py : exploratory analysis and plots of the data. 18 | ```diff 19 | + Environment : Python 3.6, TensorFlow1.4. 20 | ``` 21 | ### Here, I used 3 different approaches to model the pattern of power consumption. 22 | - **Univariate time series ARIMA**. (A 30-min average was applied to the data to reduce noise.) 23 | ![onestep](https://user-images.githubusercontent.com/25689659/34470019-001ea4e0-eef7-11e7-822a-5a5132e8ca75.png) 24 | ![dynamic](https://user-images.githubusercontent.com/25689659/34470018-0011600a-eef7-11e7-89df-79372c49a791.png) 25 | ![forecast](https://user-images.githubusercontent.com/25689659/34470017-0004e848-eef7-11e7-9148-abfb62f95dcc.png) 26 | - **Regression tree-based xgboost**. (A 5-min average was applied.) 27 | ![xgbManual](https://user-images.githubusercontent.com/25689659/34470022-00463b90-eef7-11e7-8a3c-d80df291f7d6.png) 28 | - **Recurrent neural network univariate LSTM (long short-term memory) model**. (A 15-min average was applied to reduce noise.)
29 | ![predict_result](https://user-images.githubusercontent.com/25689659/34470791-a5047402-ef07-11e7-9111-ff1da558b6e1.png) 30 | 31 | ### Possible approaches for future work: 32 | #### (i) Dynamic Regression Time Series Model 33 | Given the strong correlations between Sub metering 1, Sub metering 2, Sub metering 3 and our target variable, 34 | these variables could be included in a dynamic regression model or a regression time series model. 35 | 36 | #### (ii) Dynamic Xgboost Model 37 | Include timestep-shifted Global active power columns as features. The target variable is the current Global active power. 38 | The recent history of Global active power up to that timestamp (say, the previous 100 timesteps) would be included 39 | as extra features. 40 | 41 | #### (iii) Multivariate LSTM 42 | Feed the per-timestamp features Sub metering 1, Sub metering 2, Sub metering 3, date, time and our target variable into the RNNCell for a multivariate time-series LSTM model. 43 | ![multivariate](https://user-images.githubusercontent.com/25689659/35536009-86ac3612-0513-11e8-9ccd-4311dff198ee.png) 44 | -------------------------------------------------------------------------------- /lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | # TensorFlow1.4 20 | import tensorflow as tf 21 | 22 | from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators 23 | from tensorflow.contrib.timeseries.python.timeseries import model as ts_model 24 | from tensorflow.contrib.timeseries.python.timeseries import NumpyReader 25 | 26 | 27 | ########################## 28 | 29 | class _LSTMModel(ts_model.SequentialTimeSeriesModel): 30 | """A time series model-building example using an RNNCell.""" 31 | 32 | def __init__(self, num_units, num_features, dtype=tf.float32): 33 | """Initialize/configure the model object. 34 | Note that we do not start graph building here. Rather, this object is a 35 | configurable factory for TensorFlow graphs which are run by an Estimator. 36 | Args: 37 | num_units: The number of units in the model's LSTMCell. 38 | num_features: The dimensionality of the time series (features per 39 | timestep). 40 | dtype: The floating point data type to use. 41 | """ 42 | super(_LSTMModel, self).__init__( 43 | # Pre-register the metrics we'll be outputting (just a mean here).
44 | train_output_names=["mean"], 45 | predict_output_names=["mean"], 46 | num_features=num_features, 47 | dtype=dtype) 48 | self._num_units = num_units 49 | # Filled in by initialize_graph() 50 | self._lstm_cell = None 51 | self._lstm_cell_run = None 52 | self._predict_from_lstm_output = None 53 | 54 | def initialize_graph(self, input_statistics): 55 | """Save templates for components, which can then be used repeatedly. 56 | This method is called every time a new graph is created. It's safe to start 57 | adding ops to the current default graph here, but the graph should be 58 | constructed from scratch. 59 | Args: 60 | input_statistics: A math_utils.InputStatistics object. 61 | """ 62 | super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics) 63 | self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units) 64 | # Create templates so we don't have to worry about variable reuse. 65 | self._lstm_cell_run = tf.make_template( 66 | name_="lstm_cell", 67 | func_=self._lstm_cell, 68 | create_scope_now_=True) 69 | # Transforms LSTM output into mean predictions. 70 | self._predict_from_lstm_output = tf.make_template( 71 | name_="predict_from_lstm_output", 72 | func_=lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features), 73 | create_scope_now_=True) 74 | 75 | def get_start_state(self): 76 | """Return initial state for the time series model.""" 77 | return ( 78 | # Keeps track of the time associated with this state for error checking. 79 | tf.zeros([], dtype=tf.int64), 80 | # The previous observation or prediction. 81 | tf.zeros([self.num_features], dtype=self.dtype), 82 | # The state of the RNNCell (batch dimension removed since this parent 83 | # class will broadcast). 84 | [tf.squeeze(state_element, axis=0) 85 | for state_element 86 | in self._lstm_cell.zero_state(batch_size=1, dtype=self.dtype)]) 87 | 88 | def _transform(self, data): 89 | """Normalize data based on input statistics to encourage stable training.""" 90 | mean, variance = self._input_statistics.overall_feature_moments 91 | return (data - mean) / variance 92 | 93 | def _de_transform(self, data): 94 | """Transform data back to the input scale.""" 95 | mean, variance = self._input_statistics.overall_feature_moments 96 | return data * variance + mean 97 | 98 | def _filtering_step(self, current_times, current_values, state, predictions): 99 | """Update model state based on observations. 100 | Note that we don't do much here aside from computing a loss. In this case 101 | it's easier to update the RNN state in _prediction_step, since that covers 102 | running the RNN both on observations (from this method) and our own 103 | predictions. This distinction can be important for probabilistic models, 104 | where repeatedly predicting without filtering should lead to low-confidence 105 | predictions. 106 | Args: 107 | current_times: A [batch size] integer Tensor. 108 | current_values: A [batch size, self.num_features] floating point Tensor 109 | with new observations. 110 | state: The model's state tuple. 111 | predictions: The output of the previous `_prediction_step`. 112 | Returns: 113 | A tuple of new state and a predictions dictionary updated to include a 114 | loss (note that we could also return other measures of goodness of fit, 115 | although only "loss" will be optimized). 
116 | """ 117 | state_from_time, prediction, lstm_state = state 118 | with tf.control_dependencies( 119 | [tf.assert_equal(current_times, state_from_time)]): 120 | transformed_values = self._transform(current_values) 121 | # Use mean squared error across features for the loss. 122 | predictions["loss"] = tf.reduce_mean( 123 | (prediction - transformed_values) ** 2, axis=-1) 124 | # Keep track of the new observation in model state. It won't be run 125 | # through the LSTM until the next _imputation_step. 126 | new_state_tuple = (current_times, transformed_values, lstm_state) 127 | return (new_state_tuple, predictions) 128 | 129 | def _prediction_step(self, current_times, state): 130 | """Advance the RNN state using a previous observation or prediction.""" 131 | _, previous_observation_or_prediction, lstm_state = state 132 | lstm_output, new_lstm_state = self._lstm_cell_run( 133 | inputs=previous_observation_or_prediction, state=lstm_state) 134 | next_prediction = self._predict_from_lstm_output(lstm_output) 135 | new_state_tuple = (current_times, next_prediction, new_lstm_state) 136 | return new_state_tuple, {"mean": self._de_transform(next_prediction)} 137 | 138 | def _imputation_step(self, current_times, state): 139 | """Advance model state across a gap.""" 140 | # Does not do anything special if we're jumping across a gap. More advanced 141 | # models, especially probabilistic ones, would want a special case that 142 | # depends on the gap size. 143 | return state 144 | 145 | def _exogenous_input_step( 146 | self, current_times, current_exogenous_regressors, state): 147 | """Update model state based on exogenous regressors.""" 148 | raise NotImplementedError( 149 | "Exogenous inputs are not implemented for this example.") 150 | 151 | ############################################################################## 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /lstm_Main.py: -------------------------------------------------------------------------------- 1 | # TensorFlow1.4 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators 10 | from tensorflow.contrib.timeseries.python.timeseries import model as ts_model 11 | from tensorflow.contrib.timeseries.python.timeseries import NumpyReader 12 | 13 | from lstm import _LSTMModel 14 | from util import preprocess, bucket_avg 15 | import pandas as pd 16 | import numpy as np 17 | import matplotlib 18 | matplotlib.use("agg") 19 | import matplotlib.pyplot as plt 20 | 21 | # get LSTM data 22 | def get_rnn_data(N_rows, bucket_size): 23 | parse_dates = [['Date', 'Time']] 24 | filename = "household_power_consumption.txt" 25 | df = preprocess(N_rows, parse_dates, filename) 26 | df = pd.DataFrame(bucket_avg(df["Global_active_power"], bucket_size)) 27 | df.dropna(inplace=True) 28 | x = np.array(range(df.shape[0])) 29 | y = np.array(df.Global_active_power) 30 | return x, y 31 | 32 | if __name__ == '__main__': 33 | tf.logging.set_verbosity(tf.logging.INFO) 34 | x, y = get_rnn_data(18000, "15T") 35 | x_train, y_train = x[:900],y[:900] # first 900 data points for training 36 | x_eval, y_eval = x[900:],y[900:] # last 300 data points for evaluation 37 | 38 | data_train = { 39 | tf.contrib.timeseries.TrainEvalFeatures.TIMES: x_train, 40 | tf.contrib.timeseries.TrainEvalFeatures.VALUES: y_train, 41 | } 42 | 43 | 
data_eval = { 44 | tf.contrib.timeseries.TrainEvalFeatures.TIMES: x_eval, 45 | tf.contrib.timeseries.TrainEvalFeatures.VALUES: y_eval, 46 | } 47 | 48 | reader = NumpyReader(data_train) 49 | reader_eval = NumpyReader(data_eval) 50 | 51 | train_input_fn = tf.contrib.timeseries.RandomWindowInputFn( 52 | reader, batch_size=4, window_size=100) 53 | 54 | estimator = ts_estimators.TimeSeriesRegressor( 55 | model=_LSTMModel(num_features=1, num_units=128), 56 | optimizer=tf.train.AdamOptimizer(0.001)) 57 | 58 | estimator.train(input_fn=train_input_fn, steps=2000) 59 | evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader_eval) 60 | evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1) 61 | # Predict starting after the evaluation 62 | (predictions,) = tuple(estimator.predict( 63 | input_fn=tf.contrib.timeseries.predict_continuation_input_fn( 64 | evaluation, steps=200))) 65 | 66 | observed_times = x 67 | observed = y 68 | evaluated_times = evaluation["times"][0] 69 | evaluated = evaluation["mean"][0] 70 | predicted_times = predictions['times'] 71 | predicted = predictions["mean"] 72 | 73 | plt.figure(figsize=(15, 8)) 74 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 75 | observed_lines = plt.plot(observed_times, observed, label="observation", color="k") 76 | evaluated_lines = plt.plot(evaluated_times, evaluated, label="evaluation", color="g") 77 | predicted_lines = plt.plot(predicted_times, predicted, label="forecasts", color="r") 78 | xlim = ((0,1401)) 79 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]], 80 | loc="upper left") 81 | plt.tight_layout() 82 | plt.savefig('predict_result.png',dpi=300) 83 | -------------------------------------------------------------------------------- /lstm_multivariate_Main.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from util import preprocess, bucket_avg 6 | 7 | import tensorflow as tf 8 | 9 | from tensorflow.contrib.timeseries.python.timeseries import NumpyReader 10 | 11 | from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators 12 | from tensorflow.contrib.timeseries.python.timeseries import model as ts_model 13 | 14 | from lstm import _LSTMModel 15 | import numpy as np 16 | import pandas as pd 17 | import matplotlib 18 | matplotlib.use("agg") 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | 23 | # get data 24 | 25 | def get_rnn_data(N_rows, bucket_size): 26 | parse_dates = [['Date', 'Time']] 27 | filename = "household_power_consumption.txt" 28 | df = preprocess(N_rows, parse_dates, filename) 29 | global_power=pd.DataFrame(bucket_avg(df["Global_active_power"], bucket_size)) 30 | sub1=pd.DataFrame(bucket_avg(df["Sub_metering_1"], bucket_size)) 31 | sub2=pd.DataFrame(bucket_avg(df["Sub_metering_2"], bucket_size)) 32 | sub3=pd.DataFrame(bucket_avg(df["Sub_metering_3"], bucket_size)) 33 | 34 | #df.dropna(inplace=True) 35 | #df.iloc[-1, :].index # last time step #2010-11-26 21:00:00 36 | x = np.array(range(global_power.shape[0])) 37 | y = np.column_stack((sub1, sub2, sub3, global_power)) 38 | return x, y 39 | 40 | 41 | if __name__ == '__main__': 42 | tf.logging.set_verbosity(tf.logging.INFO) 43 | x, y = get_rnn_data(18000, "15T") 44 | data = { 45 | tf.contrib.timeseries.TrainEvalFeatures.TIMES: x, 46 | tf.contrib.timeseries.TrainEvalFeatures.VALUES: y, 47 | } 48 | reader = 
NumpyReader(data) 49 | train_input_fn = tf.contrib.timeseries.RandomWindowInputFn( 50 | reader, batch_size=4, window_size=100) 51 | 52 | estimator = ts_estimators.TimeSeriesRegressor( 53 | model=_LSTMModel(num_features=4, num_units=128), 54 | optimizer=tf.train.AdamOptimizer(0.001)) 55 | 56 | estimator.train(input_fn=train_input_fn, steps=1000) 57 | evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader) 58 | evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1) 59 | # Predict starting after the evaluation 60 | (predictions,) = tuple(estimator.predict( 61 | input_fn=tf.contrib.timeseries.predict_continuation_input_fn( 62 | evaluation, steps=100))) 63 | 64 | observed_times = evaluation["times"][0] 65 | observed = evaluation["observed"][0, :, :] 66 | evaluated_times = evaluation["times"][0] 67 | evaluated = evaluation["mean"][0] 68 | predicted_times = predictions['times'] 69 | predicted = predictions["mean"] 70 | #plot all 4 variables 71 | plt.figure(figsize=(15, 16)) 72 | plt.subplot(411) 73 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 74 | observed_lines = plt.plot(observed_times, observed[:,3], label="observation", color="k") 75 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,3], label="evaluation", color="g") 76 | predicted_lines = plt.plot(predicted_times, predicted[:,3], label="prediction", color="r") 77 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 78 | plt.title("Global active power") 79 | 80 | plt.subplot(412) 81 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 82 | observed_lines = plt.plot(observed_times, observed[:,0], label="observation", color="k") 83 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,0], label="evaluation", color="g") 84 | predicted_lines = plt.plot(predicted_times, predicted[:,0], label="prediction", color="r") 85 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 86 | plt.title("Sub_metering 1") 87 | 88 | plt.subplot(413) 89 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 90 | observed_lines = plt.plot(observed_times, observed[:,1], label="observation", color="k") 91 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,1], label="evaluation", color="g") 92 | predicted_lines = plt.plot(predicted_times, predicted[:,1], label="prediction", color="r") 93 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 94 | plt.title("Sub_metering 2") 95 | 96 | plt.subplot(414) 97 | plt.axvline(1200, linestyle="dotted", linewidth=4, color='r') 98 | observed_lines = plt.plot(observed_times, observed[:,2], label="observation", color="k") 99 | evaluated_lines = plt.plot(evaluated_times, evaluated[:,2], label="evaluation", color="g") 100 | predicted_lines = plt.plot(predicted_times, predicted[:,2], label="prediction", color="r") 101 | plt.legend(handles=[observed_lines[0], evaluated_lines[0], predicted_lines[0]],loc="upper left") 102 | plt.title("Sub_metering 3") 103 | 104 | plt.savefig('multivariate.png',dpi=300) -------------------------------------------------------------------------------- /myArima.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import statsmodels.api as sm 7 | 8 | 9 | class Arima_Class: 10 | def __init__(self, arima_para, seasonal_para): 11 | # 
Define the p, d and q parameters in Arima(p,d,q)(P,D,Q) models 12 | p = arima_para['p'] 13 | d = arima_para['d'] 14 | q = arima_para['q'] 15 | # Generate all different combinations of p, d and q triplets 16 | self.pdq = list(itertools.product(p, d, q)) 17 | # Generate all different combinations of seasonal p, d and q triplets 18 | self.seasonal_pdq = [(x[0], x[1], x[2], seasonal_para) 19 | for x in list(itertools.product(p, d, q))] 20 | 21 | def fit(self, ts): 22 | warnings.filterwarnings("ignore") 23 | results_list = [] 24 | for param in self.pdq: 25 | for param_seasonal in self.seasonal_pdq: 26 | try: 27 | mod = sm.tsa.statespace.SARIMAX(ts, 28 | order=param, 29 | seasonal_order=param_seasonal, 30 | enforce_stationarity=False, 31 | enforce_invertibility=False) 32 | results = mod.fit() 33 | 34 | print('ARIMA{}x{}seasonal - AIC:{}'.format(param, 35 | param_seasonal, results.aic)) 36 | results_list.append([param, param_seasonal, results.aic]) 37 | except: 38 | continue 39 | results_list = np.array(results_list) 40 | lowest_AIC = np.argmin(results_list[:, 2]) 41 | print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++') 42 | print('ARIMA{}x{}seasonal with lowest_AIC:{}'.format( 43 | results_list[lowest_AIC, 0], results_list[lowest_AIC, 1], results_list[lowest_AIC, 2])) 44 | print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++') 45 | 46 | mod = sm.tsa.statespace.SARIMAX(ts, 47 | order=results_list[lowest_AIC, 0], 48 | seasonal_order=results_list[lowest_AIC, 1], 49 | enforce_stationarity=False, 50 | enforce_invertibility=False) 51 | self.final_result = mod.fit() 52 | print('Final model summary:') 53 | print(self.final_result.summary().tables[1]) 54 | print('Final model diagnostics:') 55 | self.final_result.plot_diagnostics(figsize=(15, 12)) 56 | plt.tight_layout() 57 | plt.savefig('model_diagnostics.png', dpi=300) 58 | plt.show() 59 | 60 | def pred(self, ts, plot_start, pred_start, dynamic, ts_label): 61 | 62 | pred_dynamic = self.final_result.get_prediction( 63 | start=pd.to_datetime(pred_start), dynamic=dynamic, full_results=True) 64 | pred_dynamic_ci = pred_dynamic.conf_int() 65 | ax = ts[plot_start:].plot(label='observed', figsize=(15, 10)) 66 | 67 | if dynamic == False: 68 | pred_dynamic.predicted_mean.plot( 69 | label='One-step ahead Forecast', ax=ax) 70 | else: 71 | pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax) 72 | 73 | ax.fill_between(pred_dynamic_ci.index, 74 | pred_dynamic_ci.iloc[:, 0], 75 | pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25) 76 | ax.fill_betweenx(ax.get_ylim(), pd.to_datetime(plot_start), ts.index[-1], 77 | alpha=.1, zorder=-1) 78 | ax.set_xlabel('Time') 79 | ax.set_ylabel(ts_label) 80 | plt.legend() 81 | plt.tight_layout() 82 | if dynamic == False: 83 | plt.savefig(ts_label + '_one_step_pred.png', dpi=300) 84 | else: 85 | plt.savefig(ts_label + '_dynamic_pred.png', dpi=300) 86 | plt.show() 87 | 88 | def forcast(self, ts, n_steps, ts_label): 89 | # Get forecast n_steps ahead into the future 90 | pred_uc = self.final_result.get_forecast(steps=n_steps) 91 | 92 | # Get confidence intervals of forecasts 93 | pred_ci = pred_uc.conf_int() 94 | ax = ts.plot(label='observed', figsize=(15, 10)) 95 | pred_uc.predicted_mean.plot(ax=ax, label='Forecast in Future') 96 | ax.fill_between(pred_ci.index, 97 | pred_ci.iloc[:, 0], 98 | pred_ci.iloc[:, 1], color='k', alpha=.25) 99 | ax.set_xlabel('Time') 100 | ax.set_ylabel(ts_label) 101 | plt.tight_layout() 102 | plt.savefig(ts_label + '_forcast.png', dpi=300) 103 |
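        # Note: the legend is added after plt.savefig() here, so the saved PNG
        # omits it (in pred() above, plt.legend() is called before saving).
        # The shaded band drawn by fill_between comes from pred_ci, the
        # lower/upper forecast confidence bounds returned by conf_int().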
plt.legend() 104 | plt.show() 105 | -------------------------------------------------------------------------------- /myXgb.py: -------------------------------------------------------------------------------- 1 | from util import * 2 | from sklearn.model_selection import train_test_split 3 | import pandas as pd 4 | import xgboost as xgb 5 | import operator 6 | import matplotlib.pyplot as plt 7 | 8 | # get data for train, test, and forecast(unseen) 9 | 10 | 11 | def xgb_data_split(df, bucket_size, unseen_start_date, steps, test_start_date, encode_cols): 12 | # generate unseen data 13 | unseen = get_unseen_data(unseen_start_date, steps, 14 | encode_cols, bucket_size) 15 | df = pd.concat([df, unseen], axis=0) 16 | df = date_transform(df, encode_cols) 17 | 18 | # data for forecast ,skip the connecting point 19 | df_unseen = df[unseen_start_date:].iloc[:, 1:] 20 | test_start = '2010-11-26 00:00:00' 21 | # skip the connecting point 22 | df_test = df[test_start_date: unseen_start_date].iloc[:-1, :] 23 | df_train = df[:test_start_date] 24 | return df_unseen, df_test, df_train 25 | 26 | 27 | def feature_importance_plot(importance_sorted, title): 28 | df = pd.DataFrame(importance_sorted, columns=['feature', 'fscore']) 29 | df['fscore'] = df['fscore'] / df['fscore'].sum() 30 | 31 | plt.figure() 32 | # df.plot() 33 | df.plot(kind='barh', x='feature', y='fscore', 34 | legend=False, figsize=(12, 10)) 35 | plt.title('XGBoost Feature Importance') 36 | plt.xlabel('relative importance') 37 | plt.tight_layout() 38 | plt.savefig(title + '.png', dpi=300) 39 | plt.show() 40 | 41 | 42 | def xgb_importance(df, test_ratio, xgb_params, ntree, early_stop, plot_title): 43 | df = pd.DataFrame(df) 44 | # split the data into train/test set 45 | Y = df.iloc[:, 0] 46 | X = df.iloc[:, 1:] 47 | X_train, X_test, y_train, y_test = train_test_split(X, Y, 48 | test_size=test_ratio, 49 | random_state=42) 50 | 51 | dtrain = xgb.DMatrix(X_train, y_train) 52 | dtest = xgb.DMatrix(X_test, y_test) 53 | 54 | watchlist = [(dtrain, 'train'), (dtest, 'validate')] 55 | 56 | xgb_model = xgb.train(xgb_params, dtrain, ntree, evals=watchlist, 57 | early_stopping_rounds=early_stop, verbose_eval=True) 58 | 59 | importance = xgb_model.get_fscore() 60 | importance_sorted = sorted(importance.items(), key=operator.itemgetter(1)) 61 | feature_importance_plot(importance_sorted, plot_title) 62 | 63 | 64 | def xgb_forecasts_plot(plot_start, Y, Y_test, Y_hat, forecasts, title): 65 | Y = pd.concat([Y, Y_test]) 66 | ax = Y[plot_start:].plot(label='observed', figsize=(15, 10)) 67 | #Y_test.plot(label='test_observed', ax=ax) 68 | Y_hat.plot(label="predicted", ax=ax) 69 | forecasts.plot(label="forecast", ax=ax) 70 | 71 | ax.fill_betweenx(ax.get_ylim(), pd.to_datetime(Y_test.index[0]), Y_test.index[-1], 72 | alpha=.1, zorder=-1) 73 | ax.set_xlabel('Time') 74 | ax.set_ylabel('Global Active Power') 75 | plt.legend() 76 | plt.tight_layout() 77 | plt.savefig(title + '.png', dpi=300) 78 | plt.show() 79 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from matplotlib import dates 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def preprocess(N_rows, parse_dates, filename): 8 | total_rows = sum(1 for l in open(filename)) 9 | variable_names = pd.read_csv( 10 | filename, header=0, delimiter=';', sep='', nrows=5) 11 | df = pd.read_csv(filename, header=0, delimiter=';', sep='', 
names=variable_names.columns, 12 | parse_dates=parse_dates, index_col=0, nrows=N_rows, skiprows=total_rows - N_rows) 13 | df_no_na = df.replace('?', np.NaN) 14 | df_no_na.dropna(inplace=True) 15 | return df_no_na.astype(float) 16 | 17 | 18 | def timeseries_plot(y, color, y_label): 19 | # y is Series with index of datetime 20 | days = dates.DayLocator() 21 | dfmt_minor = dates.DateFormatter('%m-%d') 22 | weekday = dates.WeekdayLocator(byweekday=(), interval=1) 23 | 24 | fig, ax = plt.subplots() 25 | ax.xaxis.set_minor_locator(days) 26 | ax.xaxis.set_minor_formatter(dfmt_minor) 27 | 28 | ax.xaxis.set_major_locator(weekday) 29 | ax.xaxis.set_major_formatter(dates.DateFormatter('\n\n%a')) 30 | 31 | ax.set_ylabel(y_label) 32 | ax.plot(y.index, y, color) 33 | fig.set_size_inches(12, 8) 34 | plt.tight_layout() 35 | plt.savefig(y_label + '.png', dpi=300) 36 | plt.show() 37 | 38 | # average time series 39 | 40 | 41 | def bucket_avg(ts, bucket): 42 | # ts is Sereis with index 43 | # bucket =["30T","60T","M".....] 44 | y = ts.resample(bucket).mean() 45 | return y 46 | 47 | 48 | def config_plot(): 49 | plt.style.use('seaborn-paper') 50 | # plt.rcParams.update({'axes.prop_cycle': cycler(color='jet')}) 51 | plt.rcParams.update({'axes.titlesize': 20}) 52 | plt.rcParams['legend.loc'] = 'best' 53 | plt.rcParams.update({'axes.labelsize': 22}) 54 | plt.rcParams.update({'xtick.labelsize': 16}) 55 | plt.rcParams.update({'ytick.labelsize': 16}) 56 | plt.rcParams.update({'figure.figsize': (10, 6)}) 57 | plt.rcParams.update({'legend.fontsize': 20}) 58 | return 1 59 | 60 | 61 | # static xgboost 62 | # get one-hot encoder for features 63 | def date_transform(df, encode_cols): 64 | # extract a few features from datetime 65 | df['Year'] = df.index.year 66 | df['Month'] = df.index.month 67 | df['WeekofYear'] = df.index.weekofyear 68 | df['DayofWeek'] = df.index.weekday 69 | df['Hour'] = df.index.hour 70 | df['Minute'] = df.index.minute 71 | # one hot encoder for categorical variables 72 | for col in encode_cols: 73 | df[col] = df[col].astype('category') 74 | df = pd.get_dummies(df, columns=encode_cols) 75 | return df 76 | 77 | 78 | def get_unseen_data(unseen_start, steps, encode_cols, bucket_size): 79 | index = pd.date_range(unseen_start, 80 | periods=steps, freq=bucket_size) 81 | df = pd.DataFrame(pd.Series(np.zeros(steps), index=index), 82 | columns=['Global_active_power']) 83 | return df 84 | 85 | # dynamic xgboost 86 | # shift 2 steps for every lag 87 | 88 | 89 | def data_add_timesteps(data, column, lag): 90 | column = data[column] 91 | step_columns = [column.shift(i) for i in range(2, lag + 1, 2)] 92 | df_steps = pd.concat(step_columns, axis=1) 93 | # current Global_active_power is at first columns 94 | df = pd.concat([data, df_steps], axis=1) 95 | return df 96 | --------------------------------------------------------------------------------
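Note: `data_add_timesteps` in util.py is defined but not yet used by any of the executable scripts above. A minimal sketch (not part of the original scripts; the row count, bucket size and lag below are illustrative) of how it could build the lagged features described in the README's "Dynamic Xgboost Model" idea:

```python
# Hypothetical usage sketch -- not part of the executable scripts above.
import pandas as pd

from util import preprocess, bucket_avg, data_add_timesteps

parse_dates = [['Date', 'Time']]
df = preprocess(18000, parse_dates, "household_power_consumption.txt")

# 5-minute bucket average of the target, as in Gpower_Xgb_Main.py
power = pd.DataFrame(bucket_avg(df["Global_active_power"], "5T")).dropna()

# Append Global_active_power shifted by 2, 4, ..., 100 steps as extra columns;
# the unshifted first column remains the prediction target.
lagged = data_add_timesteps(power, 'Global_active_power', lag=100).dropna()
print(lagged.shape)
```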