├── .DS_Store
├── SVM.pptx
├── doc
│   └── 1-s2.0-S0378778816308970-main.pdf
├── README.md
├── RFE.py
├── heatmap_app_24hrs.py
├── Dataset Description.txt
├── GBRegression.py
├── GBRegression1.py
└── RandomForest.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamrishab/data-driven-prediction-models-of-energy-use-of-appliances/HEAD/.DS_Store
--------------------------------------------------------------------------------
/SVM.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamrishab/data-driven-prediction-models-of-energy-use-of-appliances/HEAD/SVM.pptx
--------------------------------------------------------------------------------
/doc/1-s2.0-S0378778816308970-main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamrishab/data-driven-prediction-models-of-energy-use-of-appliances/HEAD/doc/1-s2.0-S0378778816308970-main.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# data-driven-prediction-models-of-energy-use-of-appliances-in-a-low-energy-house
A machine-learning-based regression model implemented in Python that predicts the energy consumption of a house over a given time span from the temperature and humidity of each room and from external factors such as wind speed, visibility, and dew point.
--------------------------------------------------------------------------------
/RFE.py:
--------------------------------------------------------------------------------
#Recursive Feature Elimination
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
import pandas as pd
import numpy as np

a=["date","Appliances","T1","RH_1","T2","RH_2","T3","RH_3","T4","RH_4","T5","RH_5","T6","RH_6","T7","RH_7","T8","RH_8","T9","RH_9","T_out","RH_out","Windspeed","Visibility","rv1","rv2"]
#load the dataset
df = pd.read_csv(r"C:\Users\T00538\Desktop\training.csv",usecols=a,nrows=677)
#keep only the "year-month-day hour" part of the timestamp
df["date1"]=(df['date'].str.split(':').str[0])
del a[0]
#aggregate the 10-minute records into hourly sums
table = pd.pivot_table(df,index=["date1"],
                       values=a,
                       aggfunc=[np.sum],fill_value=0)

y = np.array(table[("sum",'Appliances')])
del table[("sum","Appliances")]
X = np.array(table.values.tolist())

#create a base estimator used to evaluate a subset of attributes
model=SVR(kernel='linear')

#create the RFE model and select 22 attributes
rfe=RFE(model,n_features_to_select=22)
rfe=rfe.fit(X,y)

#summarize the selection of the attributes
n=list(table.columns.get_level_values(1))
for i in range(len(a)-1):
    print(str(n[i])+" : "+str(rfe.support_[i])+", rank: "+str(rfe.ranking_[i]))
--------------------------------------------------------------------------------
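Note: recent scikit-learn releases expect the number of retained features as a keyword argument and expose the selection mask and ranking after fitting. A minimal, self-contained sketch of the same RFE workflow on synthetic regression data (the synthetic data is only a stand-in for the hourly appliances table built above):

# RFE sketch on synthetic data; make_regression stands in for the appliances table.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X_demo, y_demo = make_regression(n_samples=200, n_features=10, n_informative=4, random_state=0)
selector = RFE(SVR(kernel="linear"), n_features_to_select=4)
selector.fit(X_demo, y_demo)
# support_ marks the kept features; ranking_ is 1 for kept features, >1 otherwise
for idx, (keep, rank) in enumerate(zip(selector.support_, selector.ranking_)):
    print("feature %d : %s, rank: %d" % (idx, keep, rank))
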
/heatmap_app_24hrs.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
#from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import cm

df = pd.read_csv(r"C:/Users/T00537/Desktop/Data Set/training.csv")
#df = df[(df["date"] > "2016-1-12 00:00:00") & (df["date"] < "2016-1-17 23:59:59")]
#keep records up to the end of 17 January 2016
df=df[df["date"]< "2016-01-17 23:59:59"]
#extract the hour of day from the timestamp
df["date1"]=(df['date'].str.split(':').str[0].str.split(" ").str[1])

#order the day-of-week column from Monday to Sunday
arranged_day = pd.Categorical(df["Day_of_week"], categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday","Sunday"],ordered=True)
day_series = pd.Series(arranged_day)

#hour-of-day x day-of-week table of total appliance energy use
table = pd.pivot_table(df,index=["date1"],
                       values="Appliances",columns=day_series,
                       aggfunc=[np.sum],fill_value=0)

print(table)
fig, ax = plt.subplots()
ax.set_title('Heatmap : Appliances (Wh)')

heatmap = ax.pcolor(table)
plt.colorbar(heatmap)

#cbar.ax.set_xticklabels(['< -1', '0', '> 1'])
#cbar.ax.set_yticklabels(table.columns.get_level_values(1))
#plt.pcolor(table)

#ax.set_yticks(np.arange(len(table.index)))
#ax.set_xticks(np.arange(len(table.columns)))

ax.set_yticks(range(len(table.index)+1))
ax.set_xticks(range(len(table.columns)+1))

ax.set_xticklabels(table.columns.get_level_values(1))

plt.xlabel("Week 2")
plt.ylabel("Hours of Day")
plt.show()
--------------------------------------------------------------------------------
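Note: ax.pcolor draws each cell between integer grid lines, so the tick labels set above land on cell edges rather than cell centres. A small, self-contained sketch of the same hour-by-day heatmap with centred labels, using a synthetic table in place of the pivot table built in the script:

# Heatmap sketch with tick labels centred on the cells (synthetic hour x day table).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
demo = pd.DataFrame(np.random.default_rng(0).integers(0, 500, size=(24, 7)),
                    index=["%02d" % h for h in range(24)], columns=days)

fig, ax = plt.subplots()
mesh = ax.pcolor(demo.values)
fig.colorbar(mesh)
# place ticks at cell centres so each label sits on its own row/column
ax.set_xticks(np.arange(len(demo.columns)) + 0.5)
ax.set_xticklabels(demo.columns, rotation=45, ha="right")
ax.set_yticks(np.arange(len(demo.index)) + 0.5)
ax.set_yticklabels(demo.index)
ax.set_xlabel("Day of week")
ax.set_ylabel("Hour of day")
ax.set_title("Heatmap : Appliances (Wh)")
plt.tight_layout()
plt.show()
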
/Dataset Description.txt:
--------------------------------------------------------------------------------
Data Set Information:

The data set is sampled at 10-minute intervals for about 4.5 months. The house
temperature and humidity conditions were monitored with a wireless sensor network.
Each wireless node transmitted the temperature and humidity conditions around
every 3.3 minutes. The wireless data was then averaged over 10-minute periods.
The energy data was logged every 10 minutes with m-bus energy meters. Weather
from the nearest airport weather station was downloaded from a public data set
and merged with the experimental data sets using the date and time column. Two
random variables have been included in the data set for testing the regression
models and filtering out non-predictive attributes (parameters).

Attribute Information:

date, timestamp in year-month-day hour:minute:second format
Appliances, energy use in Wh
lights, energy use of light fixtures in the house, in Wh
T1, Temperature in kitchen area, in Celsius
RH_1, Humidity in kitchen area, in %
T2, Temperature in living room area, in Celsius
RH_2, Humidity in living room area, in %
T3, Temperature in laundry room area, in Celsius
RH_3, Humidity in laundry room area, in %
T4, Temperature in office room, in Celsius
RH_4, Humidity in office room, in %
T5, Temperature in bathroom, in Celsius
RH_5, Humidity in bathroom, in %
T6, Temperature outside the building (north side), in Celsius
RH_6, Humidity outside the building (north side), in %
T7, Temperature in ironing room, in Celsius
RH_7, Humidity in ironing room, in %
T8, Temperature in teenager room 2, in Celsius
RH_8, Humidity in teenager room 2, in %
T9, Temperature in parents room, in Celsius
RH_9, Humidity in parents room, in %
To, Temperature outside (from Chievres weather station), in Celsius
Pressure (from Chievres weather station), in mm Hg
RH_out, Humidity outside (from Chievres weather station), in %
Wind speed (from Chievres weather station), in m/s
Visibility (from Chievres weather station), in km
Tdewpoint (from Chievres weather station), in °C
rv1, Random variable 1, nondimensional
rv2, Random variable 2, nondimensional
--------------------------------------------------------------------------------
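Note: the scripts below rebuild an hourly view of these 10-minute records by splitting the timestamp string and pivoting. An alternative sketch is to parse the date column and resample on a real datetime index; the file name follows the other scripts and is only illustrative:

# Sketch: hourly totals of appliance energy use via a datetime index.
import pandas as pd

df = pd.read_csv("training.csv", parse_dates=["date"])  # path is an assumption
# six 10-minute records are summed into each hourly total
hourly = df.set_index("date")["Appliances"].resample("60min").sum()
print(hourly.head())
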
/GBRegression.py:
--------------------------------------------------------------------------------
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

#Load Data

a=["date","Appliances","T1","RH_1","T2","RH_2","T3","RH_3","T4","RH_4","T5","RH_5","T6","RH_6","T7","RH_7","T8","RH_8","T9","RH_9","NSM","T_out","RH_out","Windspeed","Visibility","rv1","rv2"]
#load the dataset
df = pd.read_csv(r"C:\Users\T00538\Desktop\training.csv",usecols=a,nrows=677)
#keep only the "year-month-day hour" part of the timestamp
df["date1"]=(df['date'].str.split(':').str[0])
del a[0]
#aggregate the 10-minute records into hourly sums
table = pd.pivot_table(df,index=["date1"],
                       values=a,
                       aggfunc=[np.sum],fill_value=0)

y = np.array(table[("sum",'Appliances')])
del table[("sum","Appliances")]
X = np.array(table.values.tolist())

##boston=datasets.load_boston()
##X,y=shuffle(boston.data,boston.target,random_state=13)
##X=X.astype(np.float32)
##offset=int(X.shape[0]*0.9)
##X_train,y_train=X[:offset],y[:offset]
##X_test,y_test=X[offset:],y[offset:]

#Fit Regression Model

#note: scikit-learn >= 1.0 names the least-squares loss 'squared_error' instead of 'ls'
params={'n_estimators':500,'max_depth':4,'min_samples_split':2,
        'learning_rate':0.01,'loss':'ls'}
clf=ensemble.GradientBoostingRegressor(**params)

clf.fit(X,y)
#MSE here is computed on the training rows, so it understates the generalization error
mse=mean_squared_error(y,clf.predict(X))
print("MSE:%.4f" % mse)

#Plot Training Deviance

# compute per-stage deviance on the same data the model was fit on
test_score=np.zeros((params['n_estimators'],), dtype=np.float64)

#clf.loss_ was removed in newer scikit-learn; mean_squared_error(y,y_pred) gives a
#comparable per-stage curve for the squared-error loss
for i,y_pred in enumerate(clf.staged_predict(X)):
    test_score[i]=clf.loss_(y,y_pred)

plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.title('Deviance')
##plt.plot(np.arange(params['n_estimators'])+1,clf.train_score_,'b-',
##         label='Training Set Deviance')
##plt.plot(np.arange(params['n_estimators'])+1, clf.train_score_,'b-',
##         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators'])+1,test_score,'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
#plt.show()

#Plot Feature Importance
feature_importance=clf.feature_importances_
# make importances relative to max importance
feature_importance=100.0*(feature_importance/feature_importance.max())
sorted_idx=np.argsort(feature_importance)
pos=np.arange(sorted_idx.shape[0])
plt.subplot(1,2,2)
plt.barh(pos,feature_importance[sorted_idx],align='center')
# reorder the labels to match the sorted importances
plt.yticks(pos,table.columns.get_level_values(1)[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
--------------------------------------------------------------------------------
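Note: GBRegression.py reports the error on the rows the model was fitted on; GBRegression1.py below evaluates on a separate validation file instead. A minimal sketch of the held-out evaluation idea with train_test_split, using synthetic data as a stand-in for the appliances table:

# Held-out evaluation sketch for the same model family (synthetic stand-in data).
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=600, n_features=25, noise=10.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=0)

gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                min_samples_split=2, learning_rate=0.01)
gbr.fit(X_tr, y_tr)
print("train MSE:%.4f" % mean_squared_error(y_tr, gbr.predict(X_tr)))
print("test MSE:%.4f" % mean_squared_error(y_te, gbr.predict(X_te)))
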
/GBRegression1.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

#Load Data
a=["date","Appliances","T1","RH_1","T2","RH_2","T3","RH_3","T4","RH_4","T5","RH_5","T6","RH_6","T7","RH_7","T8","RH_8","T9","RH_9","NSM","T_out","RH_out","Windspeed","Visibility","rv1","rv2"]
#load the datasets

#raw_data = urllib.urlopen(r"C:\Users\T00538\Desktop\training.csv")
# load the CSV file as a numpy matrix
#dataset = np.loadtxt(raw_data, delimiter=",")

df = pd.read_csv(r"C:\Users\T00538\Desktop\training.csv",usecols=a, nrows = 677)
df1 = pd.read_csv(r"C:\Users\T00538\Desktop\testing_validation.csv",usecols=a, nrows=677)
df1["date1"]=(df1['date'].str.split(':').str[0])
df["date1"]=(df['date'].str.split(':').str[0])
del a[0]

#aggregate the 10-minute records into hourly means for the training and validation sets
table = pd.pivot_table(df,index=["date1"],
                       values=a,
                       aggfunc=[np.mean],fill_value=0)
table1 = pd.pivot_table(df1,index=["date1"],
                        values=a,
                        aggfunc=[np.mean],fill_value=0)

y = np.array(table[("mean",'Appliances')])
y_test = np.array(table1[("mean",'Appliances')])
del table[("mean","Appliances")]
del table1[("mean","Appliances")]
X = np.array(table.values.tolist())
X_test = np.array(table1.values.tolist())

#Fit Regression Model

#note: scikit-learn >= 1.0 names the least-squares loss 'squared_error' instead of 'ls'
params={'n_estimators':500,'max_depth':4,'min_samples_split':2,
        'learning_rate':0.01,'loss':'ls'}
clf=ensemble.GradientBoostingRegressor(**params)

clf.fit(X,y)
mse=mean_squared_error(y_test,clf.predict(X_test))
print("MSE:%.4f" % mse)

#Plot Training Deviance

# compute test set deviance
test_score=np.zeros((params['n_estimators'],), dtype=np.float64)

#clf.loss_ was removed in newer scikit-learn; mean_squared_error(y_test,y_pred) gives a
#comparable per-stage curve for the squared-error loss
for i,y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i]=clf.loss_(y_test,y_pred)

plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.title('Deviance')

plt.plot(np.arange(params['n_estimators'])+1, clf.train_score_,'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators'])+1,test_score,'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

#Plot Feature Importance
feature_importance=clf.feature_importances_
# make importances relative to max importance
feature_importance=100.0*(feature_importance/feature_importance.max())
##sorted_idx=np.argsort(feature_importance, order = None)
##sorted_idx=np.array(feature_importance).argsort()[::-1]
sorted_idx = np.argsort(feature_importance)
# the feature order in X follows the pivot table's (alphabetically sorted) columns,
# so take the bar labels from the table rather than from the original list a
names=list(table.columns.get_level_values(1))
ypos=[]
for i in sorted_idx:
    ypos.append(names[i])
pos=np.arange(sorted_idx.shape[0])+.5
plt.subplot(1,2,2)
plt.barh(pos,feature_importance[sorted_idx],align='center')
plt.yticks(pos,ypos)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
--------------------------------------------------------------------------------
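Note: both gradient-boosting scripts rely on clf.loss_ for the per-stage curve, which newer scikit-learn versions no longer expose. A self-contained sketch of the same deviance plot using staged_predict with mean_squared_error (synthetic stand-in data again):

# Per-stage test error without clf.loss_ (synthetic stand-in data).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=600, n_features=25, noise=10.0, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=1)

gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4, learning_rate=0.01)
gbr.fit(X_tr, y_tr)

# one MSE value per boosting stage, evaluated on the held-out rows
test_curve = [mean_squared_error(y_te, y_pred) for y_pred in gbr.staged_predict(X_te)]
iters = np.arange(1, len(test_curve) + 1)
plt.plot(iters, gbr.train_score_, 'b-', label='Training Set Deviance')
plt.plot(iters, test_curve, 'r-', label='Test Set MSE')
plt.xlabel('Boosting Iterations')
plt.legend(loc='upper right')
plt.show()
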
/RandomForest.py:
--------------------------------------------------------------------------------
#Random Forest Algorithm on Energy Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt

#Load a CSV File
def load_csv(filename):
    dataset=list()
    with open(filename,'r') as file:
        csv_reader=reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

#Convert string column to float
def str_column_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column].strip())

#Convert string column to integer
def str_column_to_int(dataset,column):
    class_values=[row[column] for row in dataset]
    unique=set(class_values)
    lookup=dict()
    for i, value in enumerate(unique):
        lookup[value]=i
    for row in dataset:
        row[column]=lookup[row[column]]
    return lookup

#Split a dataset into k folds
def cross_validation_split(dataset,n_folds):
    dataset_split=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/n_folds)
    for i in range(n_folds):
        fold=list()
        while len(fold)= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)
        split(node['left'], max_depth, min_size, n_features, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

#Random Forest Algorithm
def random_forest(train,test,max_depth,min_size,sample_size,n_trees,n_features):
    trees=list()
    for i in range(n_trees):
        sample=subsample(train,sample_size)
        tree=build_tree(sample,max_depth,min_size,n_features)
        trees.append(tree)
    predictions=[bagging_predict(trees,row) for row in test]
    return(predictions)

# Test the random forest algorithm
seed(1)
#load and prepare data
filename='training.csv'
dataset=load_csv(filename)
#convert string attributes to floats
for i in range(31,len(dataset[0])-1):
    str_column_to_float(dataset,i)
#convert class column to integers
str_column_to_int(dataset,len(dataset[0])-1)
#evaluate algorithm
n_folds=5
max_depth=10
min_size=1
sample_size=1.0
n_features=int(sqrt(len(dataset[0])-1))
for n_trees in [1,5,10]:
    scores=evaluate_algorithm(dataset,random_forest,n_folds,max_depth,min_size,sample_size,n_trees,n_features)
    print('Trees:%d' % n_trees)
    print('Scores:%s' % scores)
    print('Mean Accuracy:%.3f%%'%(sum(scores)/float(len(scores))))

--------------------------------------------------------------------------------
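Note: the from-scratch forest above votes on class labels, while the Appliances target is continuous, so a regression forest that averages the trees' predictions is the more natural fit for this data. A hedged sketch with scikit-learn's RandomForestRegressor; the file path and column list mirror the other scripts and are assumptions about the local setup:

# Sketch: random forest regression on the appliances data with scikit-learn.
# "training.csv" and the column list follow the other scripts and are assumptions.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

cols = ["Appliances", "T1", "RH_1", "T2", "RH_2", "T3", "RH_3", "T4", "RH_4",
        "T5", "RH_5", "T6", "RH_6", "T7", "RH_7", "T8", "RH_8", "T9", "RH_9",
        "T_out", "RH_out", "Windspeed", "Visibility", "rv1", "rv2"]
df = pd.read_csv("training.csv", usecols=cols)

X = df.drop(columns="Appliances").values
y = df["Appliances"].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=1)

# each tree sees a bootstrap sample and sqrt(n_features) candidate features per split,
# and the forest averages the trees' predictions instead of taking a majority vote
forest = RandomForestRegressor(n_estimators=100, max_features="sqrt", random_state=1)
forest.fit(X_tr, y_tr)
print("Test MSE:%.3f" % mean_squared_error(y_te, forest.predict(X_te)))
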