├── .DS_Store
├── SVM.pptx
├── doc
│   └── 1-s2.0-S0378778816308970-main.pdf
├── README.md
├── RFE.py
├── heatmap_app_24hrs.py
├── Dataset Description.txt
├── GBRegression.py
├── GBRegression1.py
└── RandomForest.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamrishab/data-driven-prediction-models-of-energy-use-of-appliances/HEAD/.DS_Store
--------------------------------------------------------------------------------
/SVM.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamrishab/data-driven-prediction-models-of-energy-use-of-appliances/HEAD/SVM.pptx
--------------------------------------------------------------------------------
/doc/1-s2.0-S0378778816308970-main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamrishab/data-driven-prediction-models-of-energy-use-of-appliances/HEAD/doc/1-s2.0-S0378778816308970-main.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# data-driven-prediction-models-of-energy-use-of-appliances-in-a-low-energy-house
A machine-learning-based regression model implemented in Python that predicts the energy consumption of a house over a given time span from the temperature and humidity of each room and from external factors such as wind speed, visibility, and dew point.
--------------------------------------------------------------------------------
/RFE.py:
--------------------------------------------------------------------------------
#Recursive Feature Elimination
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
import pandas as pd
import numpy as np

a=["date","Appliances","T1","RH_1","T2","RH_2","T3","RH_3","T4","RH_4","T5","RH_5","T6","RH_6","T7","RH_7","T8","RH_8","T9","RH_9","T_out","RH_out","Windspeed","Visibility","rv1","rv2"]
#load the dataset
df = pd.read_csv(r"C:\Users\T00538\Desktop\training.csv",usecols=a,nrows=677)
#keep only the "year-month-day hour" part of the timestamp
df["date1"]=(df['date'].str.split(':').str[0])
del a[0]
#aggregate the 10-minute records into hourly sums
table = pd.pivot_table(df,index=["date1"],
                       values=a,
                       aggfunc=[np.sum],fill_value=0)

y = np.array(table[("sum",'Appliances')])
del table[("sum","Appliances")]
X = np.array(table.values.tolist())

#create a base estimator used to evaluate a subset of attributes
model=SVR(kernel='linear')

#create the RFE model and select 22 attributes
rfe=RFE(model,n_features_to_select=22)
rfe=rfe.fit(X,y)

#summarize the selection of the attributes
n=list(table.columns.get_level_values(1))
for i in range(len(a)-1):
    print(str(n[i])+" : "+str(rfe.support_[i])+", rank: "+str(rfe.ranking_[i]))
--------------------------------------------------------------------------------
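Note: recent scikit-learn releases expect the number of retained features as a keyword argument and expose the selection mask and ranking after fitting. A minimal, self-contained sketch of the same RFE workflow on synthetic regression data (the synthetic data is only a stand-in for the hourly appliances table built above):

# RFE sketch on synthetic data; make_regression stands in for the appliances table.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X_demo, y_demo = make_regression(n_samples=200, n_features=10, n_informative=4, random_state=0)
selector = RFE(SVR(kernel="linear"), n_features_to_select=4)
selector.fit(X_demo, y_demo)
# support_ marks the kept features; ranking_ is 1 for kept features, >1 otherwise
for idx, (keep, rank) in enumerate(zip(selector.support_, selector.ranking_)):
    print("feature %d : %s, rank: %d" % (idx, keep, rank))
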
/heatmap_app_24hrs.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
#from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import cm

df = pd.read_csv(r"C:/Users/T00537/Desktop/Data Set/training.csv")
#df = df[(df["date"] > "2016-1-12 00:00:00") & (df["date"] < "2016-1-17 23:59:59")]
#keep records up to the end of 17 January 2016
df=df[df["date"]< "2016-01-17 23:59:59"]
#extract the hour of day from the timestamp
df["date1"]=(df['date'].str.split(':').str[0].str.split(" ").str[1])

#order the day-of-week column from Monday to Sunday
arranged_day = pd.Categorical(df["Day_of_week"], categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday","Sunday"],ordered=True)
day_series = pd.Series(arranged_day)

#hour-of-day x day-of-week table of total appliance energy use
table = pd.pivot_table(df,index=["date1"],
                       values="Appliances",columns=day_series,
                       aggfunc=[np.sum],fill_value=0)

print(table)
fig, ax = plt.subplots()
ax.set_title('Heatmap : Appliances (Wh)')

heatmap = ax.pcolor(table)
plt.colorbar(heatmap)

#cbar.ax.set_xticklabels(['< -1', '0', '> 1'])
#cbar.ax.set_yticklabels(table.columns.get_level_values(1))
#plt.pcolor(table)

#ax.set_yticks(np.arange(len(table.index)))
#ax.set_xticks(np.arange(len(table.columns)))

ax.set_yticks(range(len(table.index)+1))
ax.set_xticks(range(len(table.columns)+1))

ax.set_xticklabels(table.columns.get_level_values(1))

plt.xlabel("Week 2")
plt.ylabel("Hours of Day")
plt.show()
--------------------------------------------------------------------------------
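Note: ax.pcolor draws each cell between integer grid lines, so the tick labels set above land on cell edges rather than cell centres. A small, self-contained sketch of the same hour-by-day heatmap with centred labels, using a synthetic table in place of the pivot table built in the script:

# Heatmap sketch with tick labels centred on the cells (synthetic hour x day table).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
demo = pd.DataFrame(np.random.default_rng(0).integers(0, 500, size=(24, 7)),
                    index=["%02d" % h for h in range(24)], columns=days)

fig, ax = plt.subplots()
mesh = ax.pcolor(demo.values)
fig.colorbar(mesh)
# place ticks at cell centres so each label sits on its own row/column
ax.set_xticks(np.arange(len(demo.columns)) + 0.5)
ax.set_xticklabels(demo.columns, rotation=45, ha="right")
ax.set_yticks(np.arange(len(demo.index)) + 0.5)
ax.set_yticklabels(demo.index)
ax.set_xlabel("Day of week")
ax.set_ylabel("Hour of day")
ax.set_title("Heatmap : Appliances (Wh)")
plt.tight_layout()
plt.show()
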
/Dataset Description.txt:
--------------------------------------------------------------------------------
Data Set Information:

The data set is sampled at 10-minute intervals for about 4.5 months. The house
temperature and humidity conditions were monitored with a wireless sensor network.
Each wireless node transmitted the temperature and humidity conditions around
every 3.3 minutes. The wireless data was then averaged over 10-minute periods.
The energy data was logged every 10 minutes with m-bus energy meters. Weather
from the nearest airport weather station was downloaded from a public data set
and merged with the experimental data sets using the date and time column. Two
random variables have been included in the data set for testing the regression
models and filtering out non-predictive attributes (parameters).

Attribute Information:

date, timestamp in year-month-day hour:minute:second format
Appliances, energy use in Wh
lights, energy use of light fixtures in the house, in Wh
T1, Temperature in kitchen area, in Celsius
RH_1, Humidity in kitchen area, in %
T2, Temperature in living room area, in Celsius
RH_2, Humidity in living room area, in %
T3, Temperature in laundry room area, in Celsius
RH_3, Humidity in laundry room area, in %
T4, Temperature in office room, in Celsius
RH_4, Humidity in office room, in %
T5, Temperature in bathroom, in Celsius
RH_5, Humidity in bathroom, in %
T6, Temperature outside the building (north side), in Celsius
RH_6, Humidity outside the building (north side), in %
T7, Temperature in ironing room, in Celsius
RH_7, Humidity in ironing room, in %
T8, Temperature in teenager room 2, in Celsius
RH_8, Humidity in teenager room 2, in %
T9, Temperature in parents room, in Celsius
RH_9, Humidity in parents room, in %
To, Temperature outside (from Chievres weather station), in Celsius
Pressure (from Chievres weather station), in mm Hg
RH_out, Humidity outside (from Chievres weather station), in %
Wind speed (from Chievres weather station), in m/s
Visibility (from Chievres weather station), in km
Tdewpoint (from Chievres weather station), in °C
rv1, Random variable 1, nondimensional
rv2, Random variable 2, nondimensional
--------------------------------------------------------------------------------
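Note: the scripts below rebuild an hourly view of these 10-minute records by splitting the timestamp string and pivoting. An alternative sketch is to parse the date column and resample on a real datetime index; the file name follows the other scripts and is only illustrative:

# Sketch: hourly totals of appliance energy use via a datetime index.
import pandas as pd

df = pd.read_csv("training.csv", parse_dates=["date"])  # path is an assumption
# six 10-minute records are summed into each hourly total
hourly = df.set_index("date")["Appliances"].resample("60min").sum()
print(hourly.head())
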
/GBRegression.py:
--------------------------------------------------------------------------------
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

#Load Data

a=["date","Appliances","T1","RH_1","T2","RH_2","T3","RH_3","T4","RH_4","T5","RH_5","T6","RH_6","T7","RH_7","T8","RH_8","T9","RH_9","NSM","T_out","RH_out","Windspeed","Visibility","rv1","rv2"]
#load the dataset
df = pd.read_csv(r"C:\Users\T00538\Desktop\training.csv",usecols=a,nrows=677)
#keep only the "year-month-day hour" part of the timestamp
df["date1"]=(df['date'].str.split(':').str[0])
del a[0]
#aggregate the 10-minute records into hourly sums
table = pd.pivot_table(df,index=["date1"],
                       values=a,
                       aggfunc=[np.sum],fill_value=0)

y = np.array(table[("sum",'Appliances')])
del table[("sum","Appliances")]
X = np.array(table.values.tolist())

##boston=datasets.load_boston()
##X,y=shuffle(boston.data,boston.target,random_state=13)
##X=X.astype(np.float32)
##offset=int(X.shape[0]*0.9)
##X_train,y_train=X[:offset],y[:offset]
##X_test,y_test=X[offset:],y[offset:]

#Fit Regression Model

#note: scikit-learn >= 1.0 names the least-squares loss 'squared_error' instead of 'ls'
params={'n_estimators':500,'max_depth':4,'min_samples_split':2,
        'learning_rate':0.01,'loss':'ls'}
clf=ensemble.GradientBoostingRegressor(**params)

clf.fit(X,y)
#MSE here is computed on the training rows, so it understates the generalization error
mse=mean_squared_error(y,clf.predict(X))
print("MSE:%.4f" % mse)

#Plot Training Deviance

# compute per-stage deviance on the same data the model was fit on
test_score=np.zeros((params['n_estimators'],), dtype=np.float64)

#clf.loss_ was removed in newer scikit-learn; mean_squared_error(y,y_pred) gives a
#comparable per-stage curve for the squared-error loss
for i,y_pred in enumerate(clf.staged_predict(X)):
    test_score[i]=clf.loss_(y,y_pred)

plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.title('Deviance')
##plt.plot(np.arange(params['n_estimators'])+1,clf.train_score_,'b-',
##         label='Training Set Deviance')
##plt.plot(np.arange(params['n_estimators'])+1, clf.train_score_,'b-',
##         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators'])+1,test_score,'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
#plt.show()

#Plot Feature Importance
feature_importance=clf.feature_importances_
# make importances relative to max importance
feature_importance=100.0*(feature_importance/feature_importance.max())
sorted_idx=np.argsort(feature_importance)
pos=np.arange(sorted_idx.shape[0])
plt.subplot(1,2,2)
plt.barh(pos,feature_importance[sorted_idx],align='center')
# reorder the labels to match the sorted importances
plt.yticks(pos,table.columns.get_level_values(1)[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
--------------------------------------------------------------------------------
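Note: GBRegression.py reports the error on the rows the model was fitted on; GBRegression1.py below evaluates on a separate validation file instead. A minimal sketch of the held-out evaluation idea with train_test_split, using synthetic data as a stand-in for the appliances table:

# Held-out evaluation sketch for the same model family (synthetic stand-in data).
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=600, n_features=25, noise=10.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=0)

gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                min_samples_split=2, learning_rate=0.01)
gbr.fit(X_tr, y_tr)
print("train MSE:%.4f" % mean_squared_error(y_tr, gbr.predict(X_tr)))
print("test MSE:%.4f" % mean_squared_error(y_te, gbr.predict(X_te)))
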
/GBRegression1.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

#Load Data
a=["date","Appliances","T1","RH_1","T2","RH_2","T3","RH_3","T4","RH_4","T5","RH_5","T6","RH_6","T7","RH_7","T8","RH_8","T9","RH_9","NSM","T_out","RH_out","Windspeed","Visibility","rv1","rv2"]
#load the datasets

#raw_data = urllib.urlopen(r"C:\Users\T00538\Desktop\training.csv")
# load the CSV file as a numpy matrix
#dataset = np.loadtxt(raw_data, delimiter=",")

df = pd.read_csv(r"C:\Users\T00538\Desktop\training.csv",usecols=a, nrows = 677)
df1 = pd.read_csv(r"C:\Users\T00538\Desktop\testing_validation.csv",usecols=a, nrows=677)
df1["date1"]=(df1['date'].str.split(':').str[0])
df["date1"]=(df['date'].str.split(':').str[0])
del a[0]

#aggregate the 10-minute records into hourly means for the training and validation sets
table = pd.pivot_table(df,index=["date1"],
                       values=a,
                       aggfunc=[np.mean],fill_value=0)
table1 = pd.pivot_table(df1,index=["date1"],
                        values=a,
                        aggfunc=[np.mean],fill_value=0)

y = np.array(table[("mean",'Appliances')])
y_test = np.array(table1[("mean",'Appliances')])
del table[("mean","Appliances")]
del table1[("mean","Appliances")]
X = np.array(table.values.tolist())
X_test = np.array(table1.values.tolist())

#Fit Regression Model

#note: scikit-learn >= 1.0 names the least-squares loss 'squared_error' instead of 'ls'
params={'n_estimators':500,'max_depth':4,'min_samples_split':2,
        'learning_rate':0.01,'loss':'ls'}
clf=ensemble.GradientBoostingRegressor(**params)

clf.fit(X,y)
mse=mean_squared_error(y_test,clf.predict(X_test))
print("MSE:%.4f" % mse)

#Plot Training Deviance

# compute test set deviance
test_score=np.zeros((params['n_estimators'],), dtype=np.float64)

#clf.loss_ was removed in newer scikit-learn; mean_squared_error(y_test,y_pred) gives a
#comparable per-stage curve for the squared-error loss
for i,y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i]=clf.loss_(y_test,y_pred)

plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.title('Deviance')

plt.plot(np.arange(params['n_estimators'])+1, clf.train_score_,'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators'])+1,test_score,'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

#Plot Feature Importance
feature_importance=clf.feature_importances_
# make importances relative to max importance
feature_importance=100.0*(feature_importance/feature_importance.max())
##sorted_idx=np.argsort(feature_importance, order = None)
##sorted_idx=np.array(feature_importance).argsort()[::-1]
sorted_idx = np.argsort(feature_importance)
# the feature order in X follows the pivot table's (alphabetically sorted) columns,
# so take the bar labels from the table rather than from the original list a
names=list(table.columns.get_level_values(1))
ypos=[]
for i in sorted_idx:
    ypos.append(names[i])
pos=np.arange(sorted_idx.shape[0])+.5
plt.subplot(1,2,2)
plt.barh(pos,feature_importance[sorted_idx],align='center')
plt.yticks(pos,ypos)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
--------------------------------------------------------------------------------
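Note: both gradient-boosting scripts rely on clf.loss_ for the per-stage curve, which newer scikit-learn versions no longer expose. A self-contained sketch of the same deviance plot using staged_predict with mean_squared_error (synthetic stand-in data again):

# Per-stage test error without clf.loss_ (synthetic stand-in data).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=600, n_features=25, noise=10.0, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=1)

gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4, learning_rate=0.01)
gbr.fit(X_tr, y_tr)

# one MSE value per boosting stage, evaluated on the held-out rows
test_curve = [mean_squared_error(y_te, y_pred) for y_pred in gbr.staged_predict(X_te)]
iters = np.arange(1, len(test_curve) + 1)
plt.plot(iters, gbr.train_score_, 'b-', label='Training Set Deviance')
plt.plot(iters, test_curve, 'r-', label='Test Set MSE')
plt.xlabel('Boosting Iterations')
plt.legend(loc='upper right')
plt.show()
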
/RandomForest.py:
--------------------------------------------------------------------------------
#Random Forest Algorithm on Energy Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt

#Load a CSV File
def load_csv(filename):
    dataset=list()
    with open(filename,'r') as file:
        csv_reader=reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

#Convert string column to float
def str_column_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column].strip())

#Convert string column to integer
def str_column_to_int(dataset,column):
    class_values=[row[column] for row in dataset]
    unique=set(class_values)
    lookup=dict()
    for i, value in enumerate(unique):
        lookup[value]=i
    for row in dataset:
        row[column]=lookup[row[column]]
    return lookup

#Split a dataset into k folds
def cross_validation_split(dataset,n_folds):
    dataset_split=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/n_folds)
    for i in range(n_folds):
        fold=list()
        while len(fold)= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)
        split(node['left'], max_depth, min_size, n_features, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

#Random Forest Algorithm
def random_forest(train,test,max_depth,min_size,sample_size,n_trees,n_features):
    trees=list()
    for i in range(n_trees):
        sample=subsample(train,sample_size)
        tree=build_tree(sample,max_depth,min_size,n_features)
        trees.append(tree)
    predictions=[bagging_predict(trees,row) for row in test]
    return(predictions)

# Test the random forest algorithm
seed(1)
#load and prepare data
filename='training.csv'
dataset=load_csv(filename)
#convert string attributes to floats
for i in range(31,len(dataset[0])-1):
    str_column_to_float(dataset,i)
#convert class column to integers
str_column_to_int(dataset,len(dataset[0])-1)
#evaluate algorithm
n_folds=5
max_depth=10
min_size=1
sample_size=1.0
n_features=int(sqrt(len(dataset[0])-1))
for n_trees in [1,5,10]:
    scores=evaluate_algorithm(dataset,random_forest,n_folds,max_depth,min_size,sample_size,n_trees,n_features)
    print('Trees:%d' % n_trees)
    print('Scores:%s' % scores)
    print('Mean Accuracy:%.3f%%'%(sum(scores)/float(len(scores))))

--------------------------------------------------------------------------------
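Note: the from-scratch forest above votes on class labels, while the Appliances target is continuous, so a regression forest that averages the trees' predictions is the more natural fit for this data. A hedged sketch with scikit-learn's RandomForestRegressor; the file path and column list mirror the other scripts and are assumptions about the local setup:

# Sketch: random forest regression on the appliances data with scikit-learn.
# "training.csv" and the column list follow the other scripts and are assumptions.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

cols = ["Appliances", "T1", "RH_1", "T2", "RH_2", "T3", "RH_3", "T4", "RH_4",
        "T5", "RH_5", "T6", "RH_6", "T7", "RH_7", "T8", "RH_8", "T9", "RH_9",
        "T_out", "RH_out", "Windspeed", "Visibility", "rv1", "rv2"]
df = pd.read_csv("training.csv", usecols=cols)

X = df.drop(columns="Appliances").values
y = df["Appliances"].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=1)

# each tree sees a bootstrap sample and sqrt(n_features) candidate features per split,
# and the forest averages the trees' predictions instead of taking a majority vote
forest = RandomForestRegressor(n_estimators=100, max_features="sqrt", random_state=1)
forest.fit(X_tr, y_tr)
print("Test MSE:%.3f" % mean_squared_error(y_te, forest.predict(X_te)))
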