├── Methods
│   ├── __init__.py
│   ├── arima.py
│   ├── clustering.py
│   ├── dbn.py
│   ├── ffnn.py
│   ├── gbrt.py
│   ├── lstm.py
│   ├── rfr.py
│   ├── seq2seq.py
│   ├── svr.py
│   └── xgboost_.py
└── data
    └── load.csv

--------------------------------------------------------------------------------
/Methods/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getBolted/LoadPredicting/ddf53ce3f49f9ea88d490e3a39759f2f67741830/Methods/__init__.py

--------------------------------------------------------------------------------
/Methods/arima.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pywt
#print(pywt.families, pywt.wavelist('coif'))
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARMA

# Computes the Mean Squared Error (MSE) for predicted values against
# actual values
def meanSquareError(actual, pred):
    if (not len(actual) == len(pred) or len(actual) == 0):
        return -1.0
    total = 0.0
    for x in range(len(actual)):
        total += math.pow(actual[x]-pred[x], 2)
    return total/len(actual)

# Same as meanSquareError, but scaled down by a factor of 1e6
def mse(actual, pred):
    if (not len(actual) == len(pred) or len(actual) == 0):
        return -1.0
    total = 0.0
    for x in range(len(actual)):
        total += math.pow(actual[x]-pred[x], 2)
    return total/(len(actual)*1000000)

# Computes Normalized Root Mean Square Error (NRMSE) for
# predicted values against actual values
def normRmse(actual, pred):
    if (not len(actual) == len(pred) or len(actual) == 0):
        return -1.0
    sumSquares = 0.0
    maxY = actual[0]
    minY = actual[0]
    for x in range(len(actual)):
        sumSquares += math.pow(pred[x]-actual[x], 2.0)
        maxY = max(maxY, actual[x])
        minY = min(minY, actual[x])
    return math.sqrt(sumSquares/len(actual))/(maxY-minY)

# Computes Root Mean Square Error (RMSE) for
# predicted values against actual values
def Rmse(actual, pred):
    if (not len(actual) == len(pred) or len(actual) == 0):
        return -1.0
    sumSquares = 0.0
    for x in range(len(actual)):
        sumSquares += math.pow(pred[x]-actual[x], 2.0)
    return math.sqrt(sumSquares/len(actual))

# Computes Mean Absolute Percent Error (MAPE) for predicted
# values against actual values
def mape(actual, pred):
    if (not len(actual) == len(pred) or len(actual) == 0):
        return -1.0
    total = 0.0
    for x in range(len(actual)):
        total += abs((actual[x]-pred[x])/actual[x])
    return total/len(actual)

# Computes Mean Absolute Error (MAE) for predicted
# values against actual values
def mae(actual, pred):
    if (not len(actual) == len(pred) or len(actual) == 0):
        return -1.0
    total = 0.0
    for x in range(len(actual)):
        total += abs(actual[x]-pred[x])
    return total/len(actual)

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# single-level Haar discrete wavelet transform
def dwt(a):
    [ca, cd] = pywt.dwt(a, 'haar')
    return ca, cd

# inverse of the transform above
def idwt(ca, cd):
    ori = pywt.idwt(ca, cd, 'haar')
    return ori
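# --- illustrative check (not part of the original script): for an even-length
# signal the Haar dwt/idwt pair above is a perfect reconstruction, so the
# round trip returns the input unchanged. _demo_signal is a made-up example.
_demo_signal = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
_demo_ca, _demo_cd = dwt(_demo_signal)   # approximation and detail coefficients
assert np.allclose(idwt(_demo_ca, _demo_cd), _demo_signal)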
def generateData(sample, outputnum):
    a = np.array(sample)
    mu = np.mean(a)
    #sigma_2 = np.var(a) / 2
    sigma_2 = np.var(a) / 24
    result = np.random.normal(loc=mu, scale=np.sqrt(sigma_2), size=outputnum)
    # result = np.random.logistic(loc=mu, scale=np.sqrt(sigma_2), size=outputnum)
    # result = np.random.laplace(loc=mu, scale=np.sqrt(sigma_2), size=outputnum)
    print('mu = %f\tsigma^2 = %f' % (mu, sigma_2))
    return mu, sigma_2, result

def drawResult(mu, sigma_2, result):
    plt.figure(figsize=(10,8), dpi=80)
    # density=True replaces the 'normed' argument removed from matplotlib
    count, bins, ignored = plt.hist(result, 30, density=True)
    plt.plot(bins, 1/(np.sqrt(2 * np.pi * sigma_2)) * np.exp(-(bins - mu)**2 / (2 * sigma_2)), linewidth=2, color='r')

def dataset(matrix_load, train_row):
    matrix_load = np.array(matrix_load)
    print("Data shape: ", matrix_load.shape)
    train_set = matrix_load[:train_row, :]
    # random seed
    np.random.seed(1234)
    # shuffle the training set (but do not shuffle the test set)
    np.random.shuffle(train_set)
    # the training set
    X_train = train_set[:, :-1]
    y_train = train_set[:, -1]
    # the test set
    X_test = matrix_load[train_row:, :-1]
    y_test = matrix_load[train_row:, -1]
    # the input to the LSTM layer needs to have the shape (number of samples, sequence length, 1)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    print(np.shape(X_train), np.shape(X_test))
    return X_train, y_train, X_test, y_test

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(1, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling: smooth isolated spikes, then clamp values that deviate
# too far from the average of the same hour on nearby days
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
print(k)
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# level-2 wavelet decomposition of everything except the last 48 hours
a2, d2, d1 = pywt.wavedec(list_hourly_load[:-48], 'db4', mode='sym', level=2)
# lhl = pywt.waverec([a2, d2, d1], 'db4')
# print(np.shape(a2), np.shape(d2), np.shape(d1), np.shape(lhl))
# order_a2 = sm.tsa.arma_order_select_ic(a2, ic='aic')['aic_min_order']
# order_d2 = sm.tsa.arma_order_select_ic(d2, ic='aic')['aic_min_order']
# order_d1 = sm.tsa.arma_order_select_ic(d1, ic='aic')['aic_min_order']
order_a2 = [3, 2]     # (p, q)
order_d2 = [4, 1, 2]  # (p, d, q)
order_d1 = [4, 1, 2]
print(order_a2, order_d2, order_d1)
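# --- optional sketch (an assumption, not part of the original run): the
# hard-coded orders above can be re-derived with the commented-out AIC search,
# e.g. on a short slice to keep it fast. Uncomment to try:
# _aic_order = sm.tsa.arma_order_select_ic(a2[:500], max_ar=4, max_ma=2, ic='aic')['aic_min_order']
# print('AIC-selected (p, q) for a2:', _aic_order)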
model_a2 = ARMA(a2, order=order_a2)
model_d2 = ARIMA(d2, order=order_d2)
model_d1 = ARIMA(d1, order=order_d1)
result_a2 = model_a2.fit()
result_d2 = model_d2.fit()
result_d1 = model_d1.fit()
plt.figure(figsize=(10,15))
plt.subplot(3,1,1)
plt.plot(a2, 'blue')
plt.plot(result_a2.fittedvalues, 'red')
plt.title('model_a2')
plt.subplot(3,1,2)
plt.plot(d2, 'blue')
plt.plot(result_d2.fittedvalues, 'red')
plt.title('model_d2')
plt.subplot(3,1,3)
plt.plot(d1, 'blue')
plt.plot(result_d1.fittedvalues, 'red')
plt.title('model_d1')
plt.show()
# decompose the full series to find how many extra coefficients the last 48 hours add
a2_all, d2_all, d1_all = pywt.wavedec(list_hourly_load, 'db4', mode='sym', level=2)
delta = [len(a2_all) - len(a2), len(d2_all) - len(d2), len(d1_all) - len(d1)]
print(delta)
pa2 = model_a2.predict(params=result_a2.params, start=1, end=len(a2) + delta[0])
pd2 = model_d2.predict(params=result_d2.params, start=1, end=len(d2) + delta[1])
pd1 = model_d1.predict(params=result_d1.params, start=1, end=len(d1) + delta[2])
# reconstruct the forecast in the time domain
predict_values = pywt.waverec([pa2, pd2, pd1], 'db4')
print(np.shape(predict_values))
plt.plot(list_hourly_load[20710:20758], label="$Observed$", c='green')
plt.plot(predict_values[20710:20758], label="$Predicted$", c='red')
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.show()
print(len(list_hourly_load), len(predict_values))
mape_val = mape((list_hourly_load+shifted_value)*1000, (predict_values+shifted_value)*1000)
print('MAPE is ', mape_val)
mae_val = mae((list_hourly_load+shifted_value)*1000, (predict_values+shifted_value)*1000)
print('MAE is ', mae_val)
mse_val = meanSquareError((list_hourly_load+shifted_value)*1000, (predict_values+shifted_value)*1000)
print('MSE is ', mse_val)
rmse_val = math.sqrt(mse_val)
print('RMSE is ', rmse_val)
nrmse_val = normRmse((list_hourly_load+shifted_value)*1000, (predict_values+shifted_value)*1000)
print('NRMSE is ', nrmse_val)

--------------------------------------------------------------------------------
/Methods/clustering.py:
--------------------------------------------------------------------------------
import math
from tools import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import euclidean

# Performs K-Means Clustering on the ordered sequence
# of vectors x with parameter k, and returns a 2-tuple:
# First tuple value is the list of centroids
# Second tuple value is a vector x' of length equal to that
# of x, such that the ith value of x' is the cluster label
# for the ith example of the input x
def kMeansClustering(x, k):
    # Convert the list into numpy format
    conv = np.asarray(x)
    # Compute the centroids
    centroids = kmeans(conv, k, iter=10)[0]
    # Relabel the x's
    labels = []
    for y in range(len(x)):
        minDist = float('inf')
        minLabel = -1
        for z in range(len(centroids)):
            e = euclidean(conv[y], centroids[z])  # Euclidean distance
            if (e < minDist):
                minDist = e
                minLabel = z
        labels.append(minLabel)
    # Return the list of centroids and labels
    return (centroids, labels)
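# --- tiny illustrative run (not part of the original script): two obvious
# blobs should come back as two centroids with consistent labels.
_toy = [[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 4.9]]
_toy_cents, _toy_labs = kMeansClustering(_toy, 2)
print('toy centroids:', _toy_cents, 'toy labels:', _toy_labs)  # e.g. labels [0, 0, 1, 1]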
# Performs a weighted clustering on the examples in xTest
# Returns a 1-d vector of predictions
def predictClustering(clusters, clusterSets, xTest, metric):
    clustLabels = []
    simFunction = getDistLambda(metric)
    for x in range(len(xTest)):
        clustDex = -1
        clustDist = float('inf')
        for y in range(len(clusters)):
            dist = simFunction(clusters[y], xTest[x])
            if (dist < clustDist):
                clustDist = dist
                clustDex = y
        clustLabels.append(clustDex)
    predict = np.zeros(len(xTest))
    for x in range(len(xTest)):
        predict[x] = weightedClusterClass(xTest[x], clusterSets[clustLabels[x]], simFunction)
    return predict

# Performs a weighted cluster classification
def weightedClusterClass(xVector, examples, simFunction):
    pred = 0.0
    normalizer = 0.0
    ctr = 0
    for x in examples:
        similarity = 1.0/simFunction(xVector, x[0])
        pred += similarity*x[1]
        normalizer += similarity
        ctr += 1
    return (pred/normalizer)

def getDistLambda(metric):
    if (metric == "manhattan"):
        return lambda x, y: distance.cityblock(x, y)
    elif (metric == "cosine"):
        return lambda x, y: distance.cosine(x, y)
    else:
        return lambda x, y: distance.euclidean(x, y)

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
# shift all data by mean
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# the length of the sequence for predicting the future value
sequence_length = 25
# convert the vector to a 2D matrix
matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
matrix_load = np.array(matrix_load)
print("Data shape: ", matrix_load.shape)
# train_row = int(round(0.9 * matrix_load.shape[0]))
train_row = matrix_load.shape[0] - 48
print('train:', train_row, 'test:', 48)
train_set = matrix_load[:train_row, :]
# random seed
np.random.seed(1234)
# shuffle the training set (but do not shuffle the test set)
np.random.shuffle(train_set)
# the training set
X_train = train_set[:, :-1]
# the last column is the true value to compute the mean-squared-error loss
y_train = train_set[:, -1]
print(X_train[0], y_train[0])
# the test set
X_test = matrix_load[train_row:, :-1]
y_test = matrix_load[train_row:, -1]
time_test = [df_raw_array[i,0] for i in range(train_row+23, len(df_raw))]
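# --- illustrative note (not part of the original script): each row of
# matrix_load is a sliding window of 25 consecutive hours; the first 24 become
# the lag features and the 25th is the prediction target. On a toy series:
print(convertSeriesToMatrix(list(range(6)), 3))
# -> [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5]]: all but the last column
# would be X, the last column would be y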
# clustering
# Compute centroids and labels of the data
ckmeans_365, lkmeans_365 = kMeansClustering(X_train, 365)
c = [ckmeans_365]
l = [lkmeans_365]
algNames = ["Observed", "Predicted"]
preds = []
preds.append(y_test)
for t in range(len(c)):
    # The centroids computed by the current clustering algorithm
    centroids = c[t]
    # The labels for the examples defined by the current clustering assignment
    labels = l[t]
    # Separate the training samples into cluster sets
    clusterSets = []
    for x in range(len(centroids)):
        clusterSets.append([])
    for x in range(len(labels)):
        # Place the example into its cluster
        clusterSets[labels[x]].append((X_train[x], y_train[x]))
    # Compute predictions for each of the test examples
    predicted_values = predictClustering(centroids, clusterSets, X_test, "euclidean")
    mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MAPE is ', mape)
    mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MAE is ', mae)
    mse = statistics.meanSquareError((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MSE is ', mse)
    rmse = math.sqrt(mse)
    print('RMSE is ', rmse)
    nrmse = statistics.normRmse((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('NRMSE is ', nrmse)
    preds.append(predicted_values)
# show
fig = plt.figure()
colors = ["g","r","b","c","m","y","k","w"]
legendVars = []
for j in range(len(preds)):
    print(j)
    x, = plt.plot(preds[j]+shifted_value, color=colors[j])
    legendVars.append(x)
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.legend(legendVars, algNames)
plt.ylim(0, 8)
plt.show()

--------------------------------------------------------------------------------
/Methods/dbn.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import math
from tools import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras import Sequential
from keras.layers import Dense
# from keras.optimizers import SGD
from keras import regularizers
from sklearn.neural_network import BernoulliRBM

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
print(k)
plt.plot(list_hourly_load)
plt.show()
# shift all data by mean
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# the length of the sequence for predicting the future value
sequence_length = 25
# convert the vector to a 2D matrix
matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
matrix_load = np.array(matrix_load)
print("Data shape: ", matrix_load.shape)
# train_row = int(round(0.9 * matrix_load.shape[0]))
train_row = matrix_load.shape[0] - 48
print('train:', train_row, 'test:', 48)
train_set = matrix_load[:train_row, :]
# random seed
np.random.seed(1234)
# shuffle the training set (but do not shuffle the test set)
np.random.shuffle(train_set)
# the training set
X_train = train_set[:, :-1]
# the last column is the true value to compute the mean-squared-error loss
y_train = train_set[:, -1]
print(X_train[0], y_train[0])
# the test set
X_test = matrix_load[train_row:, :-1]
y_test = matrix_load[train_row:, -1]
time_test = [df_raw_array[i,0] for i in range(train_row+23, len(df_raw))]
X_train = np.reshape(X_train, (np.shape(X_train)[0], np.shape(X_train)[1]))
X_test = np.reshape(X_test, (np.shape(X_test)[0], np.shape(X_test)[1]))
print(np.shape(X_train), np.shape(X_test))
print(np.shape(y_train), np.shape(y_test))
# dbn: greedy layer-wise pre-training with stacked RBMs
input_layer = X_train
hidden_layer = [250, 500, 200]
weight_rbm = []
bias_rbm = []
for i in range(len(hidden_layer)):
    print("DBN Layer {0} Pre-training".format(i + 1))
    rbm = BernoulliRBM(n_components=hidden_layer[i], learning_rate=0.0005, batch_size=512, n_iter=200, verbose=2, random_state=1)
    rbm.fit(input_layer)
    # size of the weight matrix is [input_layer, hidden_layer]
    weight_rbm.append(rbm.components_.T)
    bias_rbm.append(rbm.intercept_hidden_)
    input_layer = rbm.transform(input_layer)
print('Pre-training finish.', np.shape(weight_rbm[0]), np.shape(bias_rbm[0]))
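# --- shape check (illustrative, not part of the original script): sklearn
# stores RBM weights as components_ with shape (n_components, n_features),
# which is why they are transposed above before being handed to Dense layers.
_rbm_demo = BernoulliRBM(n_components=3, n_iter=1, random_state=0)
_rbm_demo.fit(np.random.rand(10, 4))
print(_rbm_demo.components_.shape, _rbm_demo.intercept_hidden_.shape)  # (3, 4) (3,)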
test_rms = 0
result = []
model = Sequential()
print('Fine-tuning start.')
for i in range(0, len(hidden_layer)):
    print('i:', i)
    if i == 0:
        model.add(Dense(hidden_layer[i], activation='sigmoid', input_dim=np.shape(X_train)[1]))
    else:
        model.add(Dense(hidden_layer[i], activation='sigmoid'))
    # initialise the layer with the weights learned by RBM pre-training
    layer = model.layers[i]
    layer.set_weights([weight_rbm[i], bias_rbm[i]])
model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01)))
# sgd = SGD(lr=0.005, decay=0)
model.compile(loss='mse', optimizer="rmsprop")  # or sgd
model.fit(X_train, y_train, batch_size=150, epochs=100, verbose=2)
# save model
model.save('../model/dbn.h5')
print('Fine-tuning finish.')
# get the predicted values
predicted_values = model.predict(X_test)
num_test_samples = len(predicted_values)
predicted_values = np.reshape(predicted_values, (num_test_samples, 1))
# evaluation
mape = statistics.mape((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('MAPE is ', mape)
mae = statistics.mae((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('MAE is ', mae)
mse = statistics.meanSquareError((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('MSE is ', mse)
rmse = math.sqrt(mse)
print('RMSE is ', rmse)
nrmse = statistics.normRmse((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('NRMSE is ', nrmse)
# plot the results
fig = plt.figure()
plt.plot(y_test + shifted_value, label="$Observed$", c='green')
plt.plot(predicted_values + shifted_value, label="$Predicted$", c='red')
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.legend()
plt.show()

--------------------------------------------------------------------------------
/Methods/ffnn.py:
--------------------------------------------------------------------------------
import math
from tools import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pybrain.structure import FeedForwardNetwork
from pybrain.structure import FullConnection
from pybrain.structure import LinearLayer, SigmoidLayer
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers import BackpropTrainer
from sklearn.decomposition import PCA

# Constructs and fits a neural network with the given number of neurons
# to the training data for the specified number of epochs, and returns a
# vector of the predicted values for the given test data - assumes the
# target is univariate (i.e. single-valued output)
def fit_predict(xTrain, yTrain, xTest, epochs, neurons):
    # Check edge cases
    if (not len(xTrain) == len(yTrain) or len(xTrain) == 0 or
            len(xTest) == 0 or epochs <= 0):
        return

    # Randomize the training data (probably not necessary, but pybrain might
    # not shuffle the data itself, so perform as a safety check)
    indices = np.arange(len(xTrain))
    np.random.shuffle(indices)

    trainSwapX = [xTrain[x] for x in indices]
    trainSwapY = [yTrain[x] for x in indices]

    supTrain = SupervisedDataSet(len(xTrain[0]), 1)
    for x in range(len(trainSwapX)):
        supTrain.addSample(trainSwapX[x], trainSwapY[x])

    # Construct the feed-forward neural network
    n = FeedForwardNetwork()

    inLayer = LinearLayer(len(xTrain[0]))
    hiddenLayer1 = SigmoidLayer(neurons)
    outLayer = LinearLayer(1)

    n.addInputModule(inLayer)
    n.addModule(hiddenLayer1)
    n.addOutputModule(outLayer)

    in_to_hidden = FullConnection(inLayer, hiddenLayer1)
    hidden_to_out = FullConnection(hiddenLayer1, outLayer)

    n.addConnection(in_to_hidden)
    n.addConnection(hidden_to_out)

    n.sortModules()

    # Train the neural network on the training partition, validating
    # the training progress on the validation partition
    trainer = BackpropTrainer(n, dataset=supTrain, momentum=0.1, learningrate=0.01,
                              verbose=False, weightdecay=0.01)

    trainer.trainUntilConvergence(dataset=supTrain,
                                  maxEpochs=epochs, validationProportion=0.30)

    outputs = []
    for x in xTest:
        outputs.append(n.activate(x))

    return outputs

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
# shift all data by mean
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# the length of the sequence for predicting the future value
sequence_length = 25
# convert the vector to a 2D matrix
matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
matrix_load = np.array(matrix_load)
print("Data shape: ", matrix_load.shape)
# train_row = int(round(0.9 * matrix_load.shape[0]))
train_row = matrix_load.shape[0] - 48
print('train:', train_row, 'test:', 48)
train_set = matrix_load[:train_row, :]
# random seed
np.random.seed(1234)
# shuffle the training set (but do not shuffle the test set)
np.random.shuffle(train_set)
# the training set
X_train = train_set[:, :-1]
# the last column is the true value to compute the mean-squared-error loss
y_train = train_set[:, -1]
print(X_train[0], y_train[0])
# the test set
X_test = matrix_load[train_row:, :-1]
y_test = matrix_load[train_row:, -1]
time_test = [df_raw_array[i,0] for i in range(train_row+23, len(df_raw))]
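# --- illustrative diagnostic (not part of the original script): a quick way
# to check how much of the variance of the 24 lag features is retained by the
# 18 principal components used below.
_pca_probe = PCA(n_components=24).fit(X_train)
print('cumulative explained variance of 18 components:',
      np.cumsum(_pca_probe.explained_variance_ratio_)[17])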
# nn
dimensions = [18]
neurons = [75]
names = []
names.append('true')
for x in range(len(dimensions)):
    s = "d=" + str(dimensions[x]) + ",h=" + str(neurons[x])
    names.append(s)
preds = []
preds.append(y_test)
for x in range(len(dimensions)):
    # Perform dimensionality reduction on the feature vectors
    pca = PCA(n_components=dimensions[x])
    pca.fit(X_train)
    xTrainRed = pca.transform(X_train)
    xTestRed = pca.transform(X_test)
    # pybrain returns a list of 1-element arrays; flatten it to a plain vector
    predicted_values = np.array(fit_predict(xTrainRed, y_train, xTestRed, 40, neurons[x])).flatten()
    mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MAPE is ', mape)
    mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MAE is ', mae)
    mse = statistics.meanSquareError((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MSE is ', mse)
    rmse = math.sqrt(mse)
    print('RMSE is ', rmse)
    nrmse = statistics.normRmse((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('NRMSE is ', nrmse)
    preds.append(predicted_values)
# show
fig = plt.figure()
colors = ["g","r","b","c","m","y","k","w"]
legendVars = []
for j in range(len(preds)):
    print(j)
    x, = plt.plot(preds[j]+shifted_value, color=colors[j])
    legendVars.append(x)
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.legend(legendVars, names)
plt.show()

--------------------------------------------------------------------------------
/Methods/gbrt.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import math
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from tools import statistics

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
("Data shape of list_hourly_load: ", np.shape(list_hourly_load)) 28 | k = 0 29 | for j in range(0, len(list_hourly_load)): 30 | if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2): 31 | k = k + 1 32 | list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2 33 | sum = 0 34 | num = 0 35 | for t in range(1,8): 36 | if(j - 24*t >= 0): 37 | num = num + 1 38 | sum = sum + list_hourly_load[j - 24*t] 39 | if(j + 24*t < len(list_hourly_load)): 40 | num = num + 1 41 | sum = sum + list_hourly_load[j + 24*t] 42 | sum = sum / num 43 | if(abs(list_hourly_load[j] - sum)>3): 44 | k = k + 1 45 | if(list_hourly_load[j] > sum): list_hourly_load[j] = sum + 3 46 | else: list_hourly_load[j] = sum - 3 47 | # shift all data by mean 48 | list_hourly_load = np.array(list_hourly_load) 49 | shifted_value = list_hourly_load.mean() 50 | list_hourly_load -= shifted_value 51 | # the length of the sequnce for predicting the future value 52 | sequence_length = 25 53 | # convert the vector to a 2D matrix 54 | matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length) 55 | matrix_load = np.array(matrix_load) 56 | print ("Data shape: ", matrix_load.shape) 57 | # split dataset: 90% for training and 10% for testing 58 | # train_row = int(round(0.9 * matrix_load.shape[0])) 59 | train_row = matrix_load.shape[0] - 48 60 | print('train:',train_row,'test:',48) 61 | train_set = matrix_load[:train_row, :] 62 | # random seed 63 | np.random.seed(1234) 64 | # shuffle the training set (but do not shuffle the test set) 65 | np.random.shuffle(train_set) 66 | # the training set 67 | X_train = train_set[:, :-1] 68 | # the last column is the true value to compute the mean-squared-error loss 69 | y_train = train_set[:, -1] 70 | print(X_train[0],y_train[0]) 71 | # the test set 72 | X_test = matrix_load[train_row:, :-1] 73 | y_test = matrix_load[train_row:, -1] 74 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) 75 | # gbdt 76 | # gbdt = GradientBoostingRegressor(subsample=1, 77 | # min_samples_split=2, min_samples_leaf=1, max_depth=3, alpha=0.9, 78 | # verbose=0) 79 | # param_grid = { 80 | # 'loss': ['ls', 'lad', 'huber'], 81 | # 'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2], 82 | # 'n_estimators': [100, 200, 400, 800, 1000], 83 | # 'max_depth': [3, 4, 5, 6], 84 | # 'alpha': [0.7, 0.8, 0.9]} 85 | # gbm = GridSearchCV(gbdt, param_grid) 86 | # gbm.fit(X_train, y_train[:,i]) 87 | # print('Best parameters found by grid search are:', gbm.best_params_) 88 | gbdt = GradientBoostingRegressor(loss='ls', learning_rate=0.2, n_estimators=400, subsample=1, 89 | min_samples_split=2, min_samples_leaf=1, max_depth=3, alpha=0.7, 90 | verbose=0) 91 | gbdt.fit(X_train, y_train) 92 | feature_importance = gbdt.feature_importances_ 93 | # get the predicted values 94 | start = time.clock() 95 | predicted_values = gbdt.predict(X_test) 96 | print('预测耗时:', time.clock() - start, 's') 97 | plt.figure() 98 | plt.scatter(np.arange(1, len(feature_importance) + 1), feature_importance, c='r', zorder=10) 99 | plt.plot(np.arange(1, len(feature_importance) + 1), feature_importance) 100 | plt.xlabel('Feature index') 101 | plt.ylabel('Feature importance') 102 | plt.show() 103 | # evaluation 104 | mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000) 105 | print('MAPE is ', mape) 106 | mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 
# evaluation
mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('MAPE is ', mape)
mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('MAE is ', mae)
mse = statistics.meanSquareError((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('MSE is ', mse)
rmse = math.sqrt(mse)
print('RMSE is ', rmse)
nrmse = statistics.normRmse((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('NRMSE is ', nrmse)
# plot the results
fig = plt.figure()
plt.plot(y_test + shifted_value, label="$Observed$", c='green')
plt.plot(predicted_values + shifted_value, label="$Predicted$", c='red')
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.legend()
plt.show()

--------------------------------------------------------------------------------
/Methods/lstm.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.models import load_model
from tools import statistics

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
print(k)
plt.plot(list_hourly_load)
plt.show()
# shift all data by mean
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# the length of the sequence for predicting the future value
sequence_length = 25
# convert the vector to a 2D matrix
matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
matrix_load = np.array(matrix_load)
print("Data shape: ", matrix_load.shape)
# train_row = int(round(0.9 * matrix_load.shape[0]))
train_row = matrix_load.shape[0] - 48
print('train:', train_row, 'test:', 48)
train_set = matrix_load[:train_row, :]
# random seed
np.random.seed(1234)
# shuffle the training set (but do not shuffle the test set)
np.random.shuffle(train_set)
# the training set
X_train = train_set[:, :-1]
# the last column is the true value to compute the mean-squared-error loss
y_train = train_set[:, -1]
print(X_train[0], y_train[0])
# the test set
X_test = matrix_load[train_row:, :-1]
y_test = matrix_load[train_row:, -1]
time_test = [df_raw_array[i,0] for i in range(train_row+23, len(df_raw))]
# print(time_test[0])  # 7/10/2016 19:00
# the input to the LSTM layer needs to have the shape (number of samples, sequence length, 1)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
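# --- illustrative note (not part of the original script): Keras recurrent
# layers expect 3-D input (samples, timesteps, features); each 24-lag window
# becomes 24 timesteps of a single feature.
print(np.reshape(np.arange(6), (1, 6, 1)).shape)  # one 6-step univariate sample -> (1, 6, 1)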
# lstm
# build the model (Keras 2 API; input_dim/output_dim/nb_epoch were the old Keras 1 names)
model = Sequential()
# layer 1: LSTM
model.add(LSTM(50, input_shape=(X_train.shape[1], 1), return_sequences=True))
model.add(Dropout(0.2))
# layer 2: LSTM
model.add(LSTM(100, return_sequences=False))
model.add(Dropout(0.2))
# layer 3: dense
# linear activation: a(x) = x
model.add(Dense(1, activation='linear'))
# show model
model.summary()
# compile the model
model.compile(loss="mse", optimizer="rmsprop")
# train the model
model.fit(X_train, y_train, batch_size=1024, epochs=100, validation_split=0.05, verbose=2)
# save model
model.save('../lstm.h5')
# load model
model = load_model('../lstm.h5')
# evaluate the result
test_mse = model.evaluate(X_test, y_test, verbose=2)
print('\nThe MSE on the test data set is %.3f over %d test samples.' % (test_mse, len(y_test)))
# get the predicted values
predicted_values = model.predict(X_test)
num_test_samples = len(predicted_values)
predicted_values = np.reshape(predicted_values, (num_test_samples, 1))
# evaluation
mape = statistics.mape((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('MAPE is ', mape)
mae = statistics.mae((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('MAE is ', mae)
mse = statistics.meanSquareError((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('MSE is ', mse)
rmse = math.sqrt(mse)
print('RMSE is ', rmse)
nrmse = statistics.normRmse((y_test+shifted_value)*1000, (predicted_values+shifted_value)*1000)
print('NRMSE is ', nrmse)
# plot the results
fig = plt.figure()
plt.plot(y_test + shifted_value, label="$Observed$", c='green')
plt.plot(predicted_values + shifted_value, label="$Predicted$", c='red')
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.legend()
plt.show()

--------------------------------------------------------------------------------
/Methods/rfr.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import math
import time
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use joblib directly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from tools import statistics

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix
# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
# shift all data by mean
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# the length of the sequence for predicting the future value
sequence_length = 25
# convert the vector to a 2D matrix
matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
matrix_load = np.array(matrix_load)
print("Data shape: ", matrix_load.shape)
# train_row = int(round(0.9 * matrix_load.shape[0]))
train_row = matrix_load.shape[0] - 48
print('train:', train_row, 'test:', 48)
train_set = matrix_load[:train_row, :]
# random seed
np.random.seed(1234)
# shuffle the training set (but do not shuffle the test set)
np.random.shuffle(train_set)
# the training set
X_train = train_set[:, :-1]
# the last column is the true value to compute the mean-squared-error loss
y_train = train_set[:, -1]
print(X_train[0], y_train[0])
# the test set
X_test = matrix_load[train_row:, :-1]
y_test = matrix_load[train_row:, -1]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# rfr
model = RandomForestRegressor(n_estimators=100, max_features=5)
model.fit(X_train, y_train)
joblib.dump(model, '../rfr.model')
model = joblib.load('../rfr.model')

feature_importance = model.feature_importances_
X = [' Lag_24 ',' Lag_23 ',' Lag_22 ',' Lag_21 ',' Lag_20 ',' Lag_19 ',' Lag_18 ',' Lag_17 ',' Lag_16 ',
     ' Lag_15 ',' Lag_14 ',' Lag_13 ',' Lag_12 ',' Lag_11 ',' Lag_10 ',' Lag_9 ',' Lag_8 ',' Lag_7 ',
     ' Lag_6 ',' Lag_5 ',' Lag_4 ',' Lag_3 ',' Lag_2 ',' Lag_1 ']
# scikit-learn importances already sum to 1, so s only acts as a safety normaliser
s = 0
for i in range(len(feature_importance)):
    s += feature_importance[i]

plt.figure()
plt.bar(np.arange(1, len(feature_importance) + 1), feature_importance/s, color='lightsteelblue')
plt.plot(np.arange(1, len(feature_importance) + 1), feature_importance/s)
plt.xticks(np.arange(1, len(feature_importance) + 1), X)
plt.xlabel('Feature')
plt.ylabel('Feature importance')
plt.grid(True)
plt.show()

# get the predicted values
start = time.perf_counter()  # time.clock() was removed in Python 3.8
predicted_values = model.predict(X_test)
print('Prediction time:', time.perf_counter() - start, 's')
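# --- optional sketch (an assumption, not part of the original script): with
# bootstrap sampling, the out-of-bag score gives a validation estimate
# without touching the 48-hour test window.
_rf_oob = RandomForestRegressor(n_estimators=100, max_features=5, oob_score=True)
_rf_oob.fit(X_train, y_train)
print('OOB R^2:', _rf_oob.oob_score_)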
# evaluation
mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('MAPE is ', mape)
mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('MAE is ', mae)
mse = statistics.meanSquareError((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('MSE is ', mse)
rmse = math.sqrt(mse)
print('RMSE is ', rmse)
nrmse = statistics.normRmse((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
print('NRMSE is ', nrmse)
# plot the results
fig = plt.figure()
plt.plot(y_test + shifted_value, label="$Observed$", c='green')
plt.plot(predicted_values + shifted_value, label="$Predicted$", c='red')
plt.xlabel('Hour')
plt.ylabel('Electricity load, kW')
plt.legend()
plt.show()

--------------------------------------------------------------------------------
/Methods/seq2seq.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
from pandas import concat
from numpy import argmax
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout, LSTM
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from tools import statistics
# convert a time series into a supervised learning problem
# data: sequence of observations as a list or 2D NumPy array. Required.
# n_in: number of lag observations as input (X). Values may be between [1..len(data)]. Optional.
# n_out: number of observations as output (y). Values may be between [0..len(data)-1]. Optional.
# dropnan: whether or not to drop rows with NaN values. Optional. Defaults to True.
def series_to_supervised(data, n_in, n_out, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    agg = agg.applymap(lambda x: np.int32(x))
    return agg

# convert data to fixed-width strings
def to_string(X, y, n_numbers, largest):
    max_length = 3
    Xstr = []
    for pattern in X:
        element_list = []
        for element in pattern:
            strp = str(element)
            strp = ''.join([' ' for _ in range(max_length-len(strp))])+strp
            element_list.append(strp)
        element_ensem = ','.join([aa for aa in element_list])
        Xstr.append(element_ensem)
    ystr = []
    for pattern in y:
        element_list = []
        for element in pattern:
            strp = str(element)
            strp = ''.join([' ' for _ in range(max_length-len(strp))])+strp
            element_list.append(strp)
        element_ensem = ','.join([aa for aa in element_list])
        ystr.append(element_ensem)
    return Xstr, ystr

# one hot encode sequences of values binned into n_unique intervals
def one_hot_encode(X, series_min, series_max, n_unique):
    gap = (series_max-series_min)/n_unique
    Xenc = []
    for sequence in X:
        new_index_ensem = []
        for value in sequence:
            new_index = (value-series_min)/gap
            if value == 18544:  # presumably the series maximum; nudge it into the last bin
                new_index = new_index-0.1
            new_index_ensem.append(int(new_index))
        encoding = []
        if value == 18544:
            print(new_index_ensem, new_index, value, series_max, series_min, gap)
        for index in new_index_ensem:
            vector = [0 for _ in range(n_unique)]
            vector[index] = 1
            encoding.append(vector)
        Xenc.append(encoding)
    return np.array(Xenc)

# decode a one hot encoded sequence
def one_hot_decode(y, series_min, series_max, n_unique):
    gap = (series_max-series_min)/n_unique
    y_dec = []
    for encoded_seq in y:
        decoded_seq = [argmax(vector) for vector in encoded_seq]
        decoded_seq = np.array(decoded_seq)
        decoded_seq_tran = list(decoded_seq*gap+series_min)
        y_dec.append(decoded_seq_tran)
    return y_dec


def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix
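# --- tiny illustrative run (not part of the original script): two lags in,
# one step out, on an integer toy series; the NaN warm-up rows are dropped.
_demo_frame = series_to_supervised([10, 20, 30, 40, 50], n_in=2, n_out=1)
print(_demo_frame)  # columns var1(t-2), var1(t-1), var1(t)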
if __name__ == '__main__':
    # load raw data
    df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0, 1])
    # numpy array
    df_raw_array = df_raw.values
    # hourly load
    list_hourly_load = [df_raw_array[i, 1] / 1000 for i in range(0, len(df_raw))]
    print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
    # outlier handling
    k = 0
    for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
        if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
            k = k + 1
            list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
        total = 0
        num = 0
        for t in range(1, 8):
            if(j - 24*t >= 0):
                num = num + 1
                total = total + list_hourly_load[j - 24*t]
            if(j + 24*t < len(list_hourly_load)):
                num = num + 1
                total = total + list_hourly_load[j + 24*t]
        total = total / num
        if(abs(list_hourly_load[j] - total)>3):
            k = k + 1
            if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
            else: list_hourly_load[j] = total - 3
    print(k)
    # plt.plot(list_hourly_load)
    # plt.show()
    # shift all data by mean
    list_hourly_load = np.array(list_hourly_load)
    shifted_value = list_hourly_load.mean()
    list_hourly_load -= shifted_value
    # the length of the sequence for predicting the future value
    sequence_length = 25
    # convert the vector to a 2D matrix
    matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
    matrix_load = np.array(matrix_load)
    print("Data shape: ", matrix_load.shape)
    # train_row = int(round(0.9 * matrix_load.shape[0]))
    train_row = matrix_load.shape[0] - 24*14
    print('train:', train_row, 'test:', 24*14)
    train_set = matrix_load[:train_row, :]
    # random seed
    np.random.seed(1234)
    # shuffle the training set (but do not shuffle the test set)
    np.random.shuffle(train_set)
    # the training set
    X_train = train_set[:, :-1]
    # the last column is the true value to compute the mean-squared-error loss
    y_train = train_set[:, -1]
    # the test set
    X_test = matrix_load[train_row:, :-1]
    y_test = matrix_load[train_row:, -1]
    time_test = [df_raw_array[i, 0] for i in range(train_row + 23, len(df_raw))]
    # print(time_test[0])
    # the input to the LSTM layer needs to have the shape (number of samples, sequence length, 1)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    y_train = np.reshape(y_train, (y_train.shape[0], 1, 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    print(np.shape(X_train), np.shape(y_train))
    # create the encoder-decoder LSTM
    model = Sequential()
    model.add(LSTM(150, batch_input_shape=(None, X_train.shape[1], X_train.shape[2])))  # encoder
    model.add(Dropout(0.2))
    model.add(RepeatVector(1))
    model.add(LSTM(150, return_sequences=True))  # decoder
    model.add(Dropout(0.2))
    model.add(LSTM(150, return_sequences=True))  # decoder
    model.add(Dropout(0.3))
    model.add(TimeDistributed(Dense(1, activation='linear')))
    model.compile(loss='mse', optimizer='rmsprop')
    # show model
    # print(model.summary())
    # train the LSTM
    history = model.fit(X_train, y_train, epochs=50, batch_size=50, validation_split=0.05, shuffle=False, verbose=2)
    # save model
    model.save('../seq2seq.h5')
    # model = load_model('../model/seq2seq.h5')
    # evaluate on the held-out window
    predicted_values = model.predict(X_test)
    num_test_samples = len(predicted_values)
    predicted_values = np.reshape(predicted_values, (num_test_samples, 1))
    # evaluation
    mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MAPE is ', mape)
    mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MAE is ', mae)
    mse = statistics.meanSquareError((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('MSE is ', mse)
    rmse = math.sqrt(mse)
    print('RMSE is ', rmse)
    nrmse = statistics.normRmse((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
    print('NRMSE is ', nrmse)
    # plot the results
    fig = plt.figure()
    plt.plot(y_test + shifted_value, label="$Observed$", c='green')
    plt.plot(predicted_values + shifted_value, label="$Predicted$", c='red')
    plt.xlabel('Hour')
    plt.ylabel('Electricity load, kW')
    plt.legend()
    plt.show()

--------------------------------------------------------------------------------
/Methods/svr.py:
--------------------------------------------------------------------------------
import math
from tools import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# define a function to convert a vector of time series into a 2D matrix
def convertSeriesToMatrix(vectorSeries, sequence_length):
    matrix = []
    for i in range(len(vectorSeries)-sequence_length+1):
        matrix.append(vectorSeries[i:i+sequence_length])
    return matrix

# load raw data
df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
# numpy array
df_raw_array = df_raw.values
# hourly load
list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
# outlier handling
k = 0
for j in range(1, len(list_hourly_load) - 1):  # stay inside the series bounds
    if(abs(list_hourly_load[j]-list_hourly_load[j-1])>2 and abs(list_hourly_load[j]-list_hourly_load[j+1])>2):
        k = k + 1
        list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
    total = 0
    num = 0
    for t in range(1,8):
        if(j - 24*t >= 0):
            num = num + 1
            total = total + list_hourly_load[j - 24*t]
        if(j + 24*t < len(list_hourly_load)):
            num = num + 1
            total = total + list_hourly_load[j + 24*t]
    total = total / num
    if(abs(list_hourly_load[j] - total)>3):
        k = k + 1
        if(list_hourly_load[j] > total): list_hourly_load[j] = total + 3
        else: list_hourly_load[j] = total - 3
# shift all data by mean
list_hourly_load = np.array(list_hourly_load)
shifted_value = list_hourly_load.mean()
list_hourly_load -= shifted_value
# the length of the sequence for predicting the future value
sequence_length = 25
# convert the vector to a 2D matrix
matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
matrix_load = np.array(matrix_load)
print("Data shape: ", matrix_load.shape)
# hold out the last week (24*7 hours) for testing
# train_row = int(round(0.9 * matrix_load.shape[0]))
train_row = matrix_load.shape[0] - 24*7
print('train:', train_row, 'test:', 24*7)
train_set = matrix_load[:train_row, :]
# random seed
np.random.seed(1234)
# shuffle the training set (but do not shuffle the test set)
np.random.shuffle(train_set)
# the training set
X_train = train_set[:, :-1]
# the last column is the true value to compute the mean-squared-error loss
y_train = train_set[:, -1]
print(X_train[0], y_train[0])
# the test set
X_test = matrix_load[train_row:, :-1]
y_test = matrix_load[train_row:, -1]
time_test = [df_raw_array[i,0] for i in range(train_row+23, len(df_raw))]
# svr
kernelList = ["rbf"]
names = ["Observed", "Predicted"]
preds = []
preds.append(y_test)
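# --- optional sketch (an assumption, not part of the original run): C=2.0
# below is hard-coded; a small grid search could tune it, e.g.:
# from sklearn.model_selection import GridSearchCV
# _gs = GridSearchCV(svm.SVR(kernel='rbf'), {'C': [0.5, 1.0, 2.0, 4.0], 'epsilon': [0.05, 0.1, 0.2]})
# _gs.fit(X_train, y_train)
# print('best SVR params:', _gs.best_params_)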
--------------------------------------------------------------------------------
/Methods/xgboost_.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import xgboost as xgb
6 | from tools import statistics
7 | import math
8 | import time
9 | import operator as op
10 | from matplotlib.font_manager import FontProperties
11 | 
12 | 
13 | def convertSeriesToMatrix(vectorSeries, sequence_length):
14 |     matrix = []
15 |     for i in range(len(vectorSeries) - sequence_length + 1):
16 |         matrix.append(vectorSeries[i:i+sequence_length])
17 |     return matrix
18 | 
19 | 
20 | def create_feature_map(before):
21 |     outfile = open('../xgb.fmap', 'w')
22 |     for i in range(before):
23 |         j = before - i  # feature i is the load lagged by (before - i) hours
24 |         outfile.write('{0}\t{1}\tq\n'.format(i, 'Lag_' + str(j)))
25 |     outfile.close()
26 | 
27 | # load raw data
28 | df_raw = pd.read_csv('../data/load.csv', header=0, usecols=[0,1])
29 | # numpy array
30 | df_raw_array = df_raw.values
31 | # hourly load
32 | list_hourly_load = [df_raw_array[i,1]/1000 for i in range(0, len(df_raw))]
33 | print("Data shape of list_hourly_load: ", np.shape(list_hourly_load))
34 | k = 0
35 | for j in range(1, len(list_hourly_load) - 1):  # skip the endpoints so j-1 and j+1 stay in range
36 |     if abs(list_hourly_load[j] - list_hourly_load[j-1]) > 2 and abs(list_hourly_load[j] - list_hourly_load[j+1]) > 2:
37 |         k = k + 1
38 |         list_hourly_load[j] = (list_hourly_load[j - 1] + list_hourly_load[j + 1]) / 2 + list_hourly_load[j - 24] - list_hourly_load[j - 24 - 1] / 2
39 |     total = 0  # mean load at the same hour on up to seven neighbouring days (renamed from "sum" to avoid shadowing the built-in)
40 |     num = 0
41 |     for t in range(1, 8):
42 |         if j - 24*t >= 0:
43 |             num = num + 1
44 |             total = total + list_hourly_load[j - 24*t]
45 |         if j + 24*t < len(list_hourly_load):
46 |             num = num + 1
47 |             total = total + list_hourly_load[j + 24*t]
48 |     mean_load = total / num
49 |     if abs(list_hourly_load[j] - mean_load) > 3:  # clamp residual spikes to within 3 of the neighbouring-day mean
50 |         k = k + 1
51 |         if list_hourly_load[j] > mean_load: list_hourly_load[j] = mean_load + 3
52 |         else: list_hourly_load[j] = mean_load - 3
53 | # print(k)
54 | # plt.plot(list_hourly_load)
55 | # plt.show()
56 | # shift all data by the mean
57 | list_hourly_load = np.array(list_hourly_load)
58 | shifted_value = list_hourly_load.mean()
59 | list_hourly_load -= shifted_value
60 | # the length of the sequence used to predict the future value
61 | sequence_length = 25
62 | # convert the vector to a 2D matrix
63 | matrix_load = convertSeriesToMatrix(list_hourly_load, sequence_length)
64 | matrix_load = np.array(matrix_load)
65 | print("Data shape: ", matrix_load.shape)
66 | # train_row = int(round(0.9 * matrix_load.shape[0]))
67 | train_row = matrix_load.shape[0] - 24*14  # hold out the last two weeks; the original "- matrix_load.shape[0]" left an empty training set
68 | print('train:', train_row, 'test:', 24*14)
69 | train_set = matrix_load[:train_row, :]
70 | # random seed
71 | np.random.seed(1234)
72 | # shuffle the training set (but do not shuffle the test set)
73 | np.random.shuffle(train_set)
74 | # the training set
75 | X_train = train_set[:, :-1]
76 | # the last column is the true value to compute the mean-squared-error loss
77 | y_train = train_set[:, -1]
78 | print(X_train[0], y_train[0])
79 | # the test set
80 | X_test = matrix_load[train_row:, :-1]
81 | y_test = matrix_load[train_row:, -1]
82 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
83 | 
84 | # xgboost
85 | create_feature_map(24)
86 | X_train, X_test = np.array(X_train), np.array(X_test)  # already arrays; kept for safety
87 | data_train = xgb.DMatrix(X_train, label=y_train)
88 | data_test = xgb.DMatrix(X_test, label=y_test)
89 | watch_list = [(data_test, 'eval'), (data_train, 'train')]  # the test set is only monitored, not used to pick rounds
90 | param = {'max_depth': 6, 'eta': 0.1, 'objective': 'reg:squarederror'}  # 'reg:linear' and 'silent' are deprecated aliases from older XGBoost releases
91 | bst = xgb.train(param, data_train, num_boost_round=60, evals=watch_list)
92 | # save model
93 | bst.save_model('../xgboost.model')
94 | # load model
95 | bst = xgb.Booster()
96 | bst.load_model('../xgboost.model')
97 | 
98 | # xgb.plot_importance(bst)
99 | importance = bst.get_fscore(fmap='../xgb.fmap')
100 | print(importance)
101 | importance = sorted(importance.items(), key=op.itemgetter(1))
102 | df = pd.DataFrame(importance, columns=['feature', 'fscore'])
103 | df['fscore'] = df['fscore'] / df['fscore'].sum()
104 | df.plot(kind='barh', x='feature', y='fscore')
105 | font = FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc', size=12)  # raw strings so the backslashes are not escapes
106 | font_title = FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc', size=14)
107 | plt.show()
108 | # get the predicted values
109 | start = time.perf_counter()  # time.clock() was removed in Python 3.8
110 | predicted_values = bst.predict(data_test)
111 | print('Prediction took', time.perf_counter() - start, 'seconds')
112 | # evaluation
113 | mape = statistics.mape((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
114 | print('MAPE is ', mape)
115 | mae = statistics.mae((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
116 | print('MAE is ', mae)
117 | mse = statistics.meanSquareError((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
118 | print('MSE is ', mse)
119 | rmse = math.sqrt(mse)
120 | print('RMSE is ', rmse)
121 | nrmse = statistics.normRmse((y_test + shifted_value) * 1000, (predicted_values + shifted_value) * 1000)
122 | print('NRMSE is ', nrmse)
123 | # plot the results
124 | fig = plt.figure()
125 | plt.plot(y_test + shifted_value, label="$Observed$", c='green')
126 | plt.plot(predicted_values + shifted_value, label="$Predicted$", c='red')
127 | plt.xlabel('Hour')
128 | plt.ylabel('Electricity load, kW')
129 | plt.legend()
130 | plt.show()
131 | 
--------------------------------------------------------------------------------
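The booster above always runs a fixed 60 rounds. Below is a minimal sketch, not part of the repository, of letting early stopping choose the round count instead; X_train, y_train, param, and data_test come from the script above, and holding out the last 24*7 training windows as a validation set is an illustrative choice:

import xgboost as xgb

# carve a validation set off the training windows
data_fit = xgb.DMatrix(X_train[:-24*7], label=y_train[:-24*7])
data_valid = xgb.DMatrix(X_train[-24*7:], label=y_train[-24*7:])
bst = xgb.train(param, data_fit,
                num_boost_round=500,
                evals=[(data_valid, 'valid')],
                early_stopping_rounds=20,  # stop once validation RMSE fails to improve for 20 rounds
                verbose_eval=False)
print('best iteration:', bst.best_iteration)
# xgboost >= 1.4: predict with only the best trees (older releases used ntree_limit)
predicted_values = bst.predict(data_test, iteration_range=(0, bst.best_iteration + 1))

Because the script shuffles its training windows first, this validation slice is random rather than chronological; an unshuffled split would give a more honest estimate of out-of-sample error.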