├── CAISO Load.xlsx ├── Data_Workbook.xlsx ├── ElectricityByCounty.xlsx ├── CAISO Load Forecasting.pdf ├── City_Electricity_Data_1.xlsx ├── README.md └── COMP152_FinalProject_TwoStagePerformance.ipynb /CAISO Load.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/CAISO Load.xlsx -------------------------------------------------------------------------------- /Data_Workbook.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/Data_Workbook.xlsx -------------------------------------------------------------------------------- /ElectricityByCounty.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/ElectricityByCounty.xlsx -------------------------------------------------------------------------------- /CAISO Load Forecasting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/CAISO Load Forecasting.pdf -------------------------------------------------------------------------------- /City_Electricity_Data_1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/City_Electricity_Data_1.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # load-forecasting-CAISO 2 | This novel model and its associated paper propose a two-stage K-means clustering for variable selection, followed by decision trees and support vector regressors for day-ahead load forecasting in the CAISO electricity market. 
3 | 4 | Find an account of the results and methodology in the report: [CAISO Load Forecasting.pdf](CAISO%20Load%20Forecasting.pdf) 5 | 6 | The team for the project includes: Martin Gao, Daniel Lee, Paul Maina, and Zohaib Siddique \ 7 | martin.gao@tufts.edu; daniel.c_lee@tufts.edu; paul.maina@tufts.edu; zohaib.siddique@tufts.edu \ 8 | Department of Computer Science, Tufts University, 161 College Ave, 02155 Medford, MA, USA 9 | -------------------------------------------------------------------------------- /COMP152_FinalProject_TwoStagePerformance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Call required libraries\n", 10 | "import time \n", 11 | "import warnings \n", 12 | "import numpy as np \n", 13 | "import pandas as pd \n", 14 | "import matplotlib.pyplot as plt \n", 15 | "import seaborn as sns\n", 16 | "from sklearn.preprocessing import StandardScaler \n", 17 | "from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation\n", 18 | "from sklearn.mixture import GaussianMixture \n", 19 | "\n", 20 | "#inputs\n", 21 | "holiday_pd = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/Data_Workbook.xlsx', sheet_name = 'Holiday Dates')\n", 22 | "lat_long_data = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/Data_Workbook.xlsx', sheet_name = 'City-Lat-Long')\n", 23 | "county_load = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/Data_Workbook.xlsx', sheet_name = 'Load By County 2018')\n", 24 | "county_gdp = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/Data_Workbook.xlsx', sheet_name = 'GDP By County 2018')\n", 25 | "city_pop = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data 
Folder/Data_Workbook.xlsx', sheet_name = 'City Population')\n", 26 | "caiso_2016 = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/CAISO Load.xlsx',sheet_name = '2016')\n", 27 | "caiso_2017 = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/CAISO Load.xlsx',sheet_name = '2017')\n", 28 | "caiso_2018 = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/CAISO Load.xlsx',sheet_name = '2018')\n", 29 | "city_wx_data = pd.read_csv('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/all_wx_data.csv')\n", 30 | "\n", 31 | "#select specific columns not all\n", 32 | "city_wx_data = city_wx_data[['City', 'Year', 'Month', 'Day', 'Hour', 'Temperature',\n", 33 | " 'Relative Humidity',\n", 34 | " 'Cloud Type', 'Dew Point', 'Wind Speed']]\n", 35 | "city_wx_data = city_wx_data.drop_duplicates()\n", 36 | "caiso_total = caiso_2016.append([caiso_2017,caiso_2018])\n", 37 | "caiso_total['Date'] = pd.to_datetime(caiso_total['Date'])\n", 38 | "caiso_total['Month'] = caiso_total['Date'].dt.month\n", 39 | "caiso_total['Day'] = caiso_total['Date'].dt.day\n", 40 | "caiso_total['Year'] = caiso_total['Date'].dt.year\n", 41 | "caiso_total['Day_of_week'] = caiso_total['Date'].dt.dayofweek\n", 42 | "caiso_total['Week'] = caiso_total['Date'].dt.week\n", 43 | "caiso_total = caiso_total.rename(columns={\"HE\": \"Hour\"})\n", 44 | "\n", 45 | "#read-in\n", 46 | "city_elec = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/City_Electricity_Data_1.xlsx')\n", 47 | "city_elec_complete = city_elec.dropna()\n", 48 | "city_elec_complete = city_elec_complete.drop_duplicates()\n", 49 | "city_elec_complete['total_elec_mwh'] = city_elec_complete['res_elec_mwh']+city_elec_complete['com_elec_mwh']+city_elec_complete['ind_elec_mwh']\n", 50 | "city_elec_complete.sort_values(ascending = False,by = 'total_elec_mwh').head(10)\n", 51 
| "city_elec_complete= city_elec_complete[['City','housing_units', 'total_pop', 'res_elec_mwh', 'com_units',\n", 52 | " 'com_elec_mwh', 'ind_units', 'ind_elec_mwh','total_elec_mwh']]\n", 53 | "#city_elec_complete.sort_values(ascending = False,by = 'com_elec_mwh').head(10)\n", 54 | "#gather top10 commercial,industrial,residential\n", 55 | "top10_commercial = city_elec_complete.sort_values(ascending = False,by = 'com_elec_mwh').head(10)[['City', 'total_pop', 'com_elec_mwh']]\n", 56 | "top10_commercial.columns = ['City', 'Total Population' ,'Commercial MWh Usage']\n", 57 | "top10_industrial = city_elec_complete.sort_values(ascending = False,by = 'ind_elec_mwh').head(10)[['City', 'total_pop', 'ind_elec_mwh']]\n", 58 | "top10_industrial.columns = ['City', 'Total Population' ,'Industrial MWh Usage']\n", 59 | "top10_residential = city_elec_complete.sort_values(ascending = False,by = 'res_elec_mwh').head(10)[['City', 'total_pop', 'res_elec_mwh']]\n", 60 | "top10_residential.columns = ['City', 'Total Population' ,'Residential MWh Usage']\n", 61 | "top10_consumption = city_elec_complete.sort_values(ascending = False,by = 'total_elec_mwh').head(10)[['City', 'total_pop', 'total_elec_mwh']]\n", 62 | "top10_consumption.columns = ['City', 'Total Population' ,'Total MWh Usage'] \n", 63 | "#create percentages\n", 64 | "city_elec_complete['ind_pct'] = city_elec_complete['ind_elec_mwh']/city_elec_complete['total_elec_mwh']\n", 65 | "city_elec_complete['res_pct'] = city_elec_complete['res_elec_mwh']/city_elec_complete['total_elec_mwh']\n", 66 | "city_elec_complete['com_pct'] = city_elec_complete['com_elec_mwh']/city_elec_complete['total_elec_mwh']\n", 67 | "#correlation charts\n", 68 | "correlations_2 = city_elec_complete[['total_pop','res_pct', 'com_pct', 'ind_pct']].corr()\n", 69 | "correlations_1 = city_elec_complete[['total_pop','res_elec_mwh','com_elec_mwh','ind_elec_mwh']].corr()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 
| "outputs": [ 77 | { 78 | "name": "stderr", 79 | "output_type": "stream", 80 | "text": [ 81 | "C:\\Users\\zohai\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:144: FutureWarning: The sklearn.cluster.k_means_ module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.cluster. Anything that cannot be imported from sklearn.cluster is now part of the private API.\n", 82 | " warnings.warn(message, FutureWarning)\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "#Time series clustering\n", 88 | "from tslearn.clustering import TimeSeriesKMeans\n", 89 | "import warnings\n", 90 | "warnings.filterwarnings('ignore')\n", 91 | "\n", 92 | "#Scale the data as the data is in different units\n", 93 | "city_wx_cluster = city_wx_data[['City', 'Temperature',\n", 94 | " 'Relative Humidity',\n", 95 | " 'Cloud Type', 'Dew Point', 'Wind Speed']]\n", 96 | "from sklearn.preprocessing import StandardScaler\n", 97 | "scaler = StandardScaler()\n", 98 | "\n", 99 | "req_cols = ['Temperature',\n", 100 | " 'Relative Humidity',\n", 101 | " 'Cloud Type', 'Dew Point', 'Wind Speed']\n", 102 | "\n", 103 | "#scale for clustering\n", 104 | "city_wx_cluster_scaled = scaler.fit_transform(city_wx_cluster[req_cols])\n", 105 | "city_wx_cluster_scaled_df = pd.DataFrame(city_wx_cluster_scaled)\n", 106 | "\n", 107 | "#Add back the city column\n", 108 | "city_wx_cluster_scaled_df['City'] = city_wx_cluster['City']\n", 109 | "\n", 110 | "list_of_cities = list(set(city_wx_cluster.City.values))\n", 111 | "#np.array(city_wx_cluster_scaled_df[city_wx_cluster_scaled_df.City == list_of_cities[0]].iloc[:,0:5])\n", 112 | "wx_cluster = []\n", 113 | "\n", 114 | "for i in range(0,len(list_of_cities)):\n", 115 | " wx_cluster.append(np.array(city_wx_cluster_scaled_df[city_wx_cluster_scaled_df.City == list_of_cities[i]].iloc[:,0:5]))\n", 116 | "\n", 117 | "\n", 118 | "#Assign Labels\n", 119 | 
"city_wx_cluster = pd.DataFrame(list_of_cities)\n", 120 | "ts_kmeans_optimal = TimeSeriesKMeans(n_clusters = 5, metric=\"euclidean\", max_iter = 3, max_iter_barycenter=3, random_state = 5).fit(wx_cluster)\n", 121 | "wx_cluster_label = list(ts_kmeans_optimal.labels_)\n", 122 | "city_wx_cluster['Weather Cluster'] = wx_cluster_label\n", 123 | "city_wx_cluster.columns = ['City', 'Weather Cluster']\n", 124 | "\n", 125 | "#get all stats\n", 126 | "wx_cluster_0 = city_wx_cluster[city_wx_cluster['Weather Cluster'] == 0].merge(city_elec_complete,on='City')\n", 127 | "wx_cluster_1 = city_wx_cluster[city_wx_cluster['Weather Cluster'] == 1].merge(city_elec_complete,on='City')\n", 128 | "wx_cluster_2 = city_wx_cluster[city_wx_cluster['Weather Cluster'] == 2].merge(city_elec_complete,on='City')\n", 129 | "wx_cluster_3 = city_wx_cluster[city_wx_cluster['Weather Cluster'] == 3].merge(city_elec_complete,on='City')\n", 130 | "wx_cluster_4 = city_wx_cluster[city_wx_cluster['Weather Cluster'] == 4].merge(city_elec_complete,on='City')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "#create elbow function plot\n", 140 | "def plot_elbow_method(cluster_df):\n", 141 | " wcss = []\n", 142 | " for i in range(1,5):\n", 143 | " kmeans = KMeans(n_clusters = i, init = 'k-means++',max_iter = 300, n_init = 10, random_state = 5)\n", 144 | " kmeans.fit(cluster_df)\n", 145 | " wcss.append(kmeans.inertia_)\n", 146 | " \n", 147 | " plt.plot(range(1,5),wcss)\n", 148 | " plt.title('Optimal K/Elbow method visualization')\n", 149 | " plt.ylabel('Within Cluster Sum-of-Squares')\n", 150 | " plt.xlabel('Number of Clusters/K')\n", 151 | " plt.show()\n", 152 | " \n", 153 | "req_columns = ['res_pct', 'com_pct', 'ind_pct']\n", 154 | "\n", 155 | "#create kmeans labels\n", 156 | "def get_kmeans_labels(cluster_df):\n", 157 | " kmeans = KMeans(n_clusters = 2, init = 'k-means++',max_iter = 300, n_init = 10, 
random_state = 5).fit(cluster_df)\n", 158 | " labels = list(kmeans.labels_)\n", 159 | " return labels\n", 160 | "\n", 161 | "#create centroid \n", 162 | "def get_closest_to_centroid(cluster_df):\n", 163 | " from sklearn.metrics import pairwise_distances_argmin_min\n", 164 | " kmeans = KMeans(n_clusters = 2, init = 'k-means++',max_iter = 300, n_init = 10, random_state = 5).fit(cluster_df)\n", 165 | " closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, cluster_df)\n", 166 | " return closest\n", 167 | "\n", 168 | "cluster_0_sub = wx_cluster_0[req_columns]\n", 169 | "cluster_1_sub = wx_cluster_1[req_columns]\n", 170 | "cluster_2_sub = wx_cluster_2[req_columns]\n", 171 | "cluster_3_sub = wx_cluster_3[req_columns]\n", 172 | "cluster_4_sub = wx_cluster_4[req_columns]\n", 173 | "\n", 174 | "#scaling for clustering\n", 175 | "from sklearn.preprocessing import StandardScaler\n", 176 | "scaler = StandardScaler()\n", 177 | "\n", 178 | "cluster_0_sub_scaled = scaler.fit_transform(cluster_0_sub)\n", 179 | "cluster_1_sub_scaled = scaler.fit_transform(cluster_1_sub)\n", 180 | "cluster_2_sub_scaled = scaler.fit_transform(cluster_2_sub)\n", 181 | "cluster_3_sub_scaled = scaler.fit_transform(cluster_3_sub)\n", 182 | "cluster_4_sub_scaled = scaler.fit_transform(cluster_4_sub)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "wx_cluster_0['Electric Sub-Cluster'] = get_kmeans_labels(wx_cluster_0[req_columns])\n", 192 | "wx_cluster_1['Electric Sub-Cluster'] = get_kmeans_labels(wx_cluster_1[req_columns])\n", 193 | "wx_cluster_2['Electric Sub-Cluster'] = get_kmeans_labels(wx_cluster_2[req_columns])\n", 194 | "wx_cluster_3['Electric Sub-Cluster'] = get_kmeans_labels(wx_cluster_3[req_columns])\n", 195 | "wx_cluster_4['Electric Sub-Cluster'] = get_kmeans_labels(wx_cluster_4[req_columns])\n", 196 | "\n", 197 | "ex_wx_cluster_0 = wx_cluster_0[['City', 'Weather Cluster', 'Electric 
Sub-Cluster','ind_pct', 'res_pct', 'com_pct', 'total_pop' , 'total_elec_mwh']]\n", 198 | "ex_wx_cluster_1 = wx_cluster_1[['City', 'Weather Cluster', 'Electric Sub-Cluster','ind_pct', 'res_pct', 'com_pct', 'total_pop' , 'total_elec_mwh']]\n", 199 | "ex_wx_cluster_2 = wx_cluster_2[['City', 'Weather Cluster', 'Electric Sub-Cluster','ind_pct', 'res_pct', 'com_pct', 'total_pop' , 'total_elec_mwh']]\n", 200 | "ex_wx_cluster_3 = wx_cluster_3[['City', 'Weather Cluster', 'Electric Sub-Cluster','ind_pct', 'res_pct', 'com_pct', 'total_pop' , 'total_elec_mwh']]\n", 201 | "ex_wx_cluster_4 = wx_cluster_4[['City', 'Weather Cluster', 'Electric Sub-Cluster','ind_pct', 'res_pct', 'com_pct', 'total_pop' , 'total_elec_mwh']]\n", 202 | "\n", 203 | "all_ex_wx_cluster = ex_wx_cluster_0.append([ex_wx_cluster_1, ex_wx_cluster_2, ex_wx_cluster_3,ex_wx_cluster_4])\n", 204 | "#arcgis data\n", 205 | "arcgis_data_all = all_ex_wx_cluster[['City', 'Weather Cluster', 'Electric Sub-Cluster', 'total_pop' , 'total_elec_mwh']]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 38, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "wx00_max_load = wx_cluster_0[wx_cluster_0['Electric Sub-Cluster'] == 0]['total_elec_mwh'].max()\n", 215 | "wx01_max_load = wx_cluster_0[wx_cluster_0['Electric Sub-Cluster'] == 1]['total_elec_mwh'].max()\n", 216 | "wx10_max_load = wx_cluster_1[wx_cluster_1['Electric Sub-Cluster'] == 0]['total_elec_mwh'].max()\n", 217 | "wx11_max_load = wx_cluster_1[wx_cluster_1['Electric Sub-Cluster'] == 1]['total_elec_mwh'].max()\n", 218 | "wx20_max_load = wx_cluster_2[wx_cluster_2['Electric Sub-Cluster'] == 0]['total_elec_mwh'].max()\n", 219 | "wx21_max_load = wx_cluster_2[wx_cluster_2['Electric Sub-Cluster'] == 1]['total_elec_mwh'].max()\n", 220 | "wx30_max_load = wx_cluster_3[wx_cluster_3['Electric Sub-Cluster'] == 0]['total_elec_mwh'].max()\n", 221 | "wx31_max_load = wx_cluster_3[wx_cluster_3['Electric Sub-Cluster'] == 
1]['total_elec_mwh'].max()\n", 222 | "wx40_max_load = wx_cluster_4[wx_cluster_4['Electric Sub-Cluster'] == 0]['total_elec_mwh'].max()\n", 223 | "wx41_max_load = wx_cluster_4[wx_cluster_4['Electric Sub-Cluster'] == 1]['total_elec_mwh'].max()\n", 224 | "\n", 225 | "#Cities List based on max load in each subcluster\n", 226 | "city_1=wx_cluster_0[wx_cluster_0['total_elec_mwh'] == wx00_max_load]['City'].item()\n", 227 | "city_2=wx_cluster_0[wx_cluster_0['total_elec_mwh'] == wx01_max_load]['City'].item()\n", 228 | "city_3=wx_cluster_1[wx_cluster_1['total_elec_mwh'] == wx10_max_load]['City'].item()\n", 229 | "city_4=wx_cluster_1[wx_cluster_1['total_elec_mwh'] == wx11_max_load]['City'].item()\n", 230 | "city_5=wx_cluster_2[wx_cluster_2['total_elec_mwh'] == wx20_max_load]['City'].item()\n", 231 | "city_6=wx_cluster_2[wx_cluster_2['total_elec_mwh'] == wx21_max_load]['City'].item()\n", 232 | "city_7=wx_cluster_3[wx_cluster_3['total_elec_mwh'] == wx30_max_load]['City'].item()\n", 233 | "city_8=wx_cluster_3[wx_cluster_3['total_elec_mwh'] == wx31_max_load]['City'].item()\n", 234 | "city_9=wx_cluster_4[wx_cluster_4['total_elec_mwh'] == wx40_max_load]['City'].item()\n", 235 | "city_10=wx_cluster_4[wx_cluster_4['total_elec_mwh'] == wx41_max_load]['City'].item()\n", 236 | "\n", 237 | "cities_list = ['Petaluma' , 'San Francisco', 'Sacramento', 'San Jose', 'Los Angeles', 'San Diego', 'Roseville', 'Chico', 'Fresno', 'Lemoore']\n", 238 | "cities_list_df = pd.DataFrame(cities_list)\n", 239 | "cities_list_df.columns = ['City']\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 40, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "{'max_depth': 7, 'max_features': 50, 'max_leaf_nodes': 192}\n", 252 | "0.9542729942738374\n", 253 | "0.7294860967589885\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# tuned tree\n", 259 | "from sklearn.model_selection import RandomizedSearchCV\n", 260 
| "from scipy.stats import randint\n", 261 | "\n", 262 | "parameters = {'max_leaf_nodes': randint(3, 200),\n", 263 | " 'max_features': randint(2, 76),\n", 264 | " 'max_depth': randint(1, 10)}\n", 265 | "rnd_params = RandomizedSearchCV(tree, param_distributions = parameters, cv = 10, n_iter = 200)\n", 266 | "rnd_params.fit(final_data_X_train, final_data_Y_train)\n", 267 | "\n", 268 | "best_parameters = rnd_params.best_params_\n", 269 | "print(best_parameters)\n", 270 | "\n", 271 | "tree.set_params(max_features=best_parameters['max_features'], max_leaf_nodes=best_parameters['max_leaf_nodes'])\n", 272 | "tree.fit(final_data_X_train, final_data_Y_train)\n", 273 | "\n", 274 | "tuned_train_score = tree.score(final_data_X_train, final_data_Y_train)\n", 275 | "tuned_test_score = tree.score(final_data_X_test, final_data_Y_test)\n", 276 | "y_pred_tuned_tree = tree.predict(final_data_X_test)\n", 277 | "\n", 278 | "from sklearn.model_selection import RandomizedSearchCV\n", 279 | "from scipy.stats import randint\n", 280 | "from sklearn.ensemble import RandomForestRegressor\n", 281 | "\n", 282 | "rf_tree = RandomForestRegressor()\n", 283 | "rf_tree.fit(final_data_X_train, final_data_Y_train)\n", 284 | "\n", 285 | "param_dist = {'n_estimators': randint(10, 100),\n", 286 | " 'max_leaf_nodes': randint(3, 100),\n", 287 | " 'max_features': [\"auto\"],\n", 288 | " 'max_depth': randint(1, 10),\n", 289 | " 'min_samples_leaf': randint(1, 30),\n", 290 | " 'min_samples_split': randint(2, 20)}\n", 291 | "\n", 292 | "rnd_search_rf = RandomizedSearchCV(rf_tree, param_distributions=param_dist, \n", 293 | " cv=10, n_iter=50)\n", 294 | "rnd_search_rf.fit(final_data_X_train, final_data_Y_train)\n", 295 | "\n", 296 | "rf_tree.set_params(n_estimators=rnd_search_rf.best_params_['n_estimators'], \n", 297 | " max_leaf_nodes=rnd_search_rf.best_params_['max_leaf_nodes'], \n", 298 | " max_features = rnd_search_rf.best_params_['max_features'],\n", 299 | " max_depth = 
rnd_search_rf.best_params_['max_depth'],\n", 300 | " min_samples_leaf = rnd_search_rf.best_params_['min_samples_leaf'],\n", 301 | " min_samples_split = rnd_search_rf.best_params_['min_samples_split'])\n", 302 | "\n", 303 | "\n", 304 | "#Tuned SVR\n", 305 | "from sklearn.model_selection import GridSearchCV \n", 306 | "from sklearn.svm import SVR\n", 307 | "svr_regressor = SVR()\n", 308 | "svr_regressor.fit(StandardScaler().fit_transform(final_data_X_train), final_data_Y_train) \n", 309 | "param_grid = {'C': [0.1, 1, 10, 100, 1000], \n", 310 | " 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], \n", 311 | " 'kernel': ['linear','rbf']} \n", 312 | " \n", 313 | "svr_regressor_1 = RandomizedSearchCV(SVR(), param_distributions = param_grid) \n", 314 | " \n", 315 | "# fitting the model for grid search \n", 316 | "svr_regressor_1.fit(StandardScaler().fit_transform(final_data_X_train), final_data_Y_train) \n", 317 | "y_pred_svr = svr_regressor_1.predict(StandardScaler().fit_transform(final_data_X_test))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 43, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "[744, 672, 744, 720, 744, 720, 744, 744, 720, 744, 720, 720]\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "results_df = final_data[final_data.Year == 2018]\n", 335 | "results_df = results_df[['Day', 'Month','Hour', 'Year', 'CAISO Total']]\n", 336 | "results_df['RF_Tuned_Prediction'] = rf_tree.predict(final_data_X_test)\n", 337 | "results_df['SVR_Prediction'] = y_pred_svr\n", 338 | "results_df['DTree_Tuned_Prediction'] = y_pred_tuned_tree\n", 339 | "\n", 340 | "results_df['Hourly_MAPE_RF_Tuned'] = abs((results_df['CAISO Total'] - results_df['RF_Tuned_Prediction'])/results_df['CAISO Total'])*100\n", 341 | "results_df['Hourly_MAPE_SVR'] = abs((results_df['CAISO Total'] - results_df['SVR_Prediction'])/results_df['CAISO Total'])*100\n", 342 | 
"results_df['Hourly_MAPE_DTree_Tuned'] = abs((results_df['CAISO Total'] - results_df['DTree_Tuned_Prediction'])/results_df['CAISO Total'])*100\n", 343 | "results_df\n", 344 | "\n", 345 | "#mape functions\n", 346 | "\n", 347 | "def get_month_count(df):\n", 348 | " month = [0]*12 \n", 349 | " for i in range(0, len(df)):\n", 350 | " if df['Month'].iloc[i] == 1:\n", 351 | " month[0] += 1\n", 352 | " elif df['Month'].iloc[i] == 2:\n", 353 | " month[1] += 1\n", 354 | " elif df['Month'].iloc[i] == 3:\n", 355 | " month[2] += 1\n", 356 | " elif df['Month'].iloc[i] == 4:\n", 357 | " month[3] += 1\n", 358 | " elif df['Month'].iloc[i] == 5:\n", 359 | " month[4] += 1\n", 360 | " elif df['Month'].iloc[i] == 6:\n", 361 | " month[5] += 1\n", 362 | " elif df['Month'].iloc[i] == 7:\n", 363 | " month[6] += 1\n", 364 | " elif df['Month'].iloc[i] == 8:\n", 365 | " month[7] += 1\n", 366 | " elif df['Month'].iloc[i] == 9:\n", 367 | " month[8] += 1\n", 368 | " elif df['Month'].iloc[i] == 10:\n", 369 | " month[9] += 1\n", 370 | " elif df['Month'].iloc[i] == 11:\n", 371 | " month[10] += 1\n", 372 | " elif df['Month'].iloc[i] == 12:\n", 373 | " month[11] += 1\n", 374 | " return month\n", 375 | "\n", 376 | "d = get_month_count(results_df)\n", 377 | "print(d)\n", 378 | "\n", 379 | "def get_mape_by_month(df, column_name):\n", 380 | "\n", 381 | " final = df.pivot_table(column_name, index = 'Month', aggfunc='sum')\n", 382 | " return final \n", 383 | "\n", 384 | "import pandas as pd \n", 385 | "\n", 386 | "def get_mape_by_month(df, column_name):\n", 387 | " final = df.pivot_table(column_name, index = 'Month', aggfunc='sum')\n", 388 | " return final \n", 389 | "\n", 390 | "def mean(dataframe, d):\n", 391 | " for i in range(0, len(dataframe)):\n", 392 | " dataframe.iloc[i] = dataframe.iloc[i] / d[i]\n", 393 | " \n", 394 | " return dataframe\n", 395 | "\n", 396 | "\n", 397 | "def create_mape_table(df, d):\n", 398 | " final_frame = pd.DataFrame()\n", 399 | " columns = 
list(df.columns[8:len(df.columns)])\n", 400 | " for i, val in enumerate(columns): \n", 401 | " temp = get_mape_by_month(df,val)\n", 402 | " temp_frame = mean(temp, d)\n", 403 | " final_frame = pd.concat([final_frame, temp_frame], axis = 1)\n", 404 | " return final_frame \n", 405 | "\n", 406 | "final_frame = create_mape_table(results_df, d)\n", 407 | "final_frame = final_frame.reset_index(drop = True)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 46, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "
\n", 419 | "\n", 432 | "\n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " 
\n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | "
FeatureImportance
0fresno_relative_humidity0.373411
1Hour0.255585
2lemoore_temperature0.111071
3fresno_temperature0.063811
4Day_of_week0.059408
5lemoore_relative_humidity0.021034
6chico_relative_humidity0.015598
7Week0.012467
8Day0.011358
9chico_temperature0.006728
10san_diego_dew_point0.005414
11san_diego_temperature0.005129
12los_angeles_temperature0.003860
13roseville_relative_humidity0.003626
14Month0.003414
15los_angeles_wind_speed0.003391
16sacramento_relative_humidity0.003040
17chico_wind_speed0.002434
18san_diego_wind_speed0.002428
19fresno_wind_speed0.002338
20los_angeles_dew_point0.002248
21roseville_temperature0.002050
22san_diego_relative_humidity0.001976
23san_jose_wind_speed0.001913
24los_angeles_relative_humidity0.001855
25lemoore_wind_speed0.001718
26sacramento_temperature0.001360
27petaluma_relative_humidity0.001332
28san_jose_relative_humidity0.001241
29petaluma_dew_point0.001182
30san_francisco_relative_humidity0.001125
31san_francisco_wind_speed0.001094
32lemoore_dew_point0.001045
33chico_dew_point0.001042
34petaluma_wind_speed0.001038
35roseville_dew_point0.001030
36sacramento_wind_speed0.000986
37san_jose_dew_point0.000977
38fresno_dew_point0.000958
39sacramento_dew_point0.000957
40roseville_wind_speed0.000861
41san_jose_temperature0.000822
42san_francisco_dew_point0.000719
43san_francisco_temperature0.000588
44petaluma_temperature0.000514
45los_angeles_cloud_type0.000450
46san_francisco_cloud_type0.000431
47chico_cloud_type0.000417
48sacramento_cloud_type0.000407
49san_diego_cloud_type0.000395
50roseville_cloud_type0.000379
51petaluma_cloud_type0.000360
52san_jose_cloud_type0.000345
53lemoore_cloud_type0.000324
54fresno_cloud_type0.000314
55Year0.000005
\n", 723 | "
" 724 | ], 725 | "text/plain": [ 726 | " Feature Importance\n", 727 | "0 fresno_relative_humidity 0.373411\n", 728 | "1 Hour 0.255585\n", 729 | "2 lemoore_temperature 0.111071\n", 730 | "3 fresno_temperature 0.063811\n", 731 | "4 Day_of_week 0.059408\n", 732 | "5 lemoore_relative_humidity 0.021034\n", 733 | "6 chico_relative_humidity 0.015598\n", 734 | "7 Week 0.012467\n", 735 | "8 Day 0.011358\n", 736 | "9 chico_temperature 0.006728\n", 737 | "10 san_diego_dew_point 0.005414\n", 738 | "11 san_diego_temperature 0.005129\n", 739 | "12 los_angeles_temperature 0.003860\n", 740 | "13 roseville_relative_humidity 0.003626\n", 741 | "14 Month 0.003414\n", 742 | "15 los_angeles_wind_speed 0.003391\n", 743 | "16 sacramento_relative_humidity 0.003040\n", 744 | "17 chico_wind_speed 0.002434\n", 745 | "18 san_diego_wind_speed 0.002428\n", 746 | "19 fresno_wind_speed 0.002338\n", 747 | "20 los_angeles_dew_point 0.002248\n", 748 | "21 roseville_temperature 0.002050\n", 749 | "22 san_diego_relative_humidity 0.001976\n", 750 | "23 san_jose_wind_speed 0.001913\n", 751 | "24 los_angeles_relative_humidity 0.001855\n", 752 | "25 lemoore_wind_speed 0.001718\n", 753 | "26 sacramento_temperature 0.001360\n", 754 | "27 petaluma_relative_humidity 0.001332\n", 755 | "28 san_jose_relative_humidity 0.001241\n", 756 | "29 petaluma_dew_point 0.001182\n", 757 | "30 san_francisco_relative_humidity 0.001125\n", 758 | "31 san_francisco_wind_speed 0.001094\n", 759 | "32 lemoore_dew_point 0.001045\n", 760 | "33 chico_dew_point 0.001042\n", 761 | "34 petaluma_wind_speed 0.001038\n", 762 | "35 roseville_dew_point 0.001030\n", 763 | "36 sacramento_wind_speed 0.000986\n", 764 | "37 san_jose_dew_point 0.000977\n", 765 | "38 fresno_dew_point 0.000958\n", 766 | "39 sacramento_dew_point 0.000957\n", 767 | "40 roseville_wind_speed 0.000861\n", 768 | "41 san_jose_temperature 0.000822\n", 769 | "42 san_francisco_dew_point 0.000719\n", 770 | "43 san_francisco_temperature 0.000588\n", 771 | "44 
petaluma_temperature 0.000514\n", 772 | "45 los_angeles_cloud_type 0.000450\n", 773 | "46 san_francisco_cloud_type 0.000431\n", 774 | "47 chico_cloud_type 0.000417\n", 775 | "48 sacramento_cloud_type 0.000407\n", 776 | "49 san_diego_cloud_type 0.000395\n", 777 | "50 roseville_cloud_type 0.000379\n", 778 | "51 petaluma_cloud_type 0.000360\n", 779 | "52 san_jose_cloud_type 0.000345\n", 780 | "53 lemoore_cloud_type 0.000324\n", 781 | "54 fresno_cloud_type 0.000314\n", 782 | "55 Year 0.000005" 783 | ] 784 | }, 785 | "execution_count": 46, 786 | "metadata": {}, 787 | "output_type": "execute_result" 788 | } 789 | ], 790 | "source": [ 791 | "# Random Forest tree importances\n", 792 | "pd.DataFrame({'Feature':final_data_X_train.columns, 'Importance': rf_tree.feature_importances_}).sort_values(by = 'Importance', ascending = False).reset_index(drop=True)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 47, 798 | "metadata": {}, 799 | "outputs": [ 800 | { 801 | "data": { 802 | "text/html": [ 803 | "
\n", 804 | "\n", 817 | "\n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " 
\n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | "
FeatureImportance
0fresno_relative_humidity0.481964
1Hour0.267807
2Day_of_week0.062052
3fresno_temperature0.055203
4lemoore_temperature0.050826
5lemoore_relative_humidity0.017357
6Day0.011032
7Week0.007940
8san_diego_dew_point0.007538
9los_angeles_temperature0.004585
10chico_relative_humidity0.003697
11san_diego_temperature0.003477
12san_jose_wind_speed0.003111
13chico_temperature0.002460
14Month0.002422
15sacramento_relative_humidity0.002137
16san_diego_wind_speed0.002021
17los_angeles_dew_point0.001585
18chico_wind_speed0.001463
19roseville_relative_humidity0.001314
20roseville_temperature0.001256
21los_angeles_wind_speed0.000866
22roseville_dew_point0.000857
23lemoore_wind_speed0.000835
24los_angeles_relative_humidity0.000792
25san_francisco_dew_point0.000719
26san_jose_dew_point0.000712
27petaluma_wind_speed0.000608
28petaluma_temperature0.000580
29san_diego_relative_humidity0.000421
30lemoore_dew_point0.000414
31san_jose_cloud_type0.000412
32san_jose_temperature0.000279
33san_francisco_relative_humidity0.000256
34chico_dew_point0.000240
35sacramento_dew_point0.000231
36petaluma_relative_humidity0.000183
37san_jose_relative_humidity0.000178
38roseville_cloud_type0.000172
39fresno_wind_speed0.000000
40petaluma_cloud_type0.000000
41petaluma_dew_point0.000000
42fresno_dew_point0.000000
43lemoore_cloud_type0.000000
44fresno_cloud_type0.000000
45san_francisco_cloud_type0.000000
46san_francisco_temperature0.000000
47chico_cloud_type0.000000
48roseville_wind_speed0.000000
49san_francisco_wind_speed0.000000
50sacramento_temperature0.000000
51sacramento_cloud_type0.000000
52san_diego_cloud_type0.000000
53sacramento_wind_speed0.000000
54los_angeles_cloud_type0.000000
55Year0.000000
\n", 1108 | "
" 1109 | ], 1110 | "text/plain": [ 1111 | " Feature Importance\n", 1112 | "0 fresno_relative_humidity 0.481964\n", 1113 | "1 Hour 0.267807\n", 1114 | "2 Day_of_week 0.062052\n", 1115 | "3 fresno_temperature 0.055203\n", 1116 | "4 lemoore_temperature 0.050826\n", 1117 | "5 lemoore_relative_humidity 0.017357\n", 1118 | "6 Day 0.011032\n", 1119 | "7 Week 0.007940\n", 1120 | "8 san_diego_dew_point 0.007538\n", 1121 | "9 los_angeles_temperature 0.004585\n", 1122 | "10 chico_relative_humidity 0.003697\n", 1123 | "11 san_diego_temperature 0.003477\n", 1124 | "12 san_jose_wind_speed 0.003111\n", 1125 | "13 chico_temperature 0.002460\n", 1126 | "14 Month 0.002422\n", 1127 | "15 sacramento_relative_humidity 0.002137\n", 1128 | "16 san_diego_wind_speed 0.002021\n", 1129 | "17 los_angeles_dew_point 0.001585\n", 1130 | "18 chico_wind_speed 0.001463\n", 1131 | "19 roseville_relative_humidity 0.001314\n", 1132 | "20 roseville_temperature 0.001256\n", 1133 | "21 los_angeles_wind_speed 0.000866\n", 1134 | "22 roseville_dew_point 0.000857\n", 1135 | "23 lemoore_wind_speed 0.000835\n", 1136 | "24 los_angeles_relative_humidity 0.000792\n", 1137 | "25 san_francisco_dew_point 0.000719\n", 1138 | "26 san_jose_dew_point 0.000712\n", 1139 | "27 petaluma_wind_speed 0.000608\n", 1140 | "28 petaluma_temperature 0.000580\n", 1141 | "29 san_diego_relative_humidity 0.000421\n", 1142 | "30 lemoore_dew_point 0.000414\n", 1143 | "31 san_jose_cloud_type 0.000412\n", 1144 | "32 san_jose_temperature 0.000279\n", 1145 | "33 san_francisco_relative_humidity 0.000256\n", 1146 | "34 chico_dew_point 0.000240\n", 1147 | "35 sacramento_dew_point 0.000231\n", 1148 | "36 petaluma_relative_humidity 0.000183\n", 1149 | "37 san_jose_relative_humidity 0.000178\n", 1150 | "38 roseville_cloud_type 0.000172\n", 1151 | "39 fresno_wind_speed 0.000000\n", 1152 | "40 petaluma_cloud_type 0.000000\n", 1153 | "41 petaluma_dew_point 0.000000\n", 1154 | "42 fresno_dew_point 0.000000\n", 1155 | "43 lemoore_cloud_type 
0.000000\n", 1156 | "44 fresno_cloud_type 0.000000\n", 1157 | "45 san_francisco_cloud_type 0.000000\n", 1158 | "46 san_francisco_temperature 0.000000\n", 1159 | "47 chico_cloud_type 0.000000\n", 1160 | "48 roseville_wind_speed 0.000000\n", 1161 | "49 san_francisco_wind_speed 0.000000\n", 1162 | "50 sacramento_temperature 0.000000\n", 1163 | "51 sacramento_cloud_type 0.000000\n", 1164 | "52 san_diego_cloud_type 0.000000\n", 1165 | "53 sacramento_wind_speed 0.000000\n", 1166 | "54 los_angeles_cloud_type 0.000000\n", 1167 | "55 Year 0.000000" 1168 | ] 1169 | }, 1170 | "execution_count": 47, 1171 | "metadata": {}, 1172 | "output_type": "execute_result" 1173 | } 1174 | ], 1175 | "source": [ 1176 | "#Decision tree importances\n", 1177 | "pd.DataFrame({'Feature':final_data_X_train.columns, 'Importance': tree.feature_importances_}).sort_values(by = 'Importance', ascending = False).reset_index(drop=True)" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": 64, 1183 | "metadata": {}, 1184 | "outputs": [ 1185 | { 1186 | "data": { 1187 | "image/png": "\n", 1188 | "text/plain": [ 1189 | "
" 1190 | ] 1191 | }, 1192 | "metadata": { 1193 | "needs_background": "light" 1194 | }, 1195 | "output_type": "display_data" 1196 | } 1197 | ], 1198 | "source": [ 1199 | "#Decision tree importances\n", 1200 | "feature_importance = tree.feature_importances_\n", 1201 | "mod_cols_x = [sub.replace('_', ' ') for sub in final_data_X_train.columns]\n", 1202 | "\n", 1203 | "gb_feat = pd.DataFrame({'feature':mod_cols_x, 'importance':feature_importance})\n", 1204 | "gb_feat = gb_feat.sort_values(by='importance', ascending=False).head(20)\n", 1205 | "\n", 1206 | "plt.figure(figsize=(8, 7.5))\n", 1207 | "plt.barh(width=gb_feat.importance, y=gb_feat.feature);" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": 21, 1213 | "metadata": {}, 1214 | "outputs": [ 1215 | { 1216 | "data": { 1217 | "text/plain": [ 1218 | "array([ 5, 3, 4, 54, 10, 9, 55, 2, 45, 24, 29, 19, 43, 1, 8, 32, 13,\n", 1219 | " 38, 44, 40, 14, 22, 30, 17, 25, 42, 50, 18, 35, 33, 48, 23, 28, 39,\n", 1220 | " 12, 7, 47, 52, 20, 15, 27, 37, 49, 34, 46, 11, 53, 36, 21, 16, 6,\n", 1221 | " 51, 41, 31, 26, 0], dtype=int64)" 1222 | ] 1223 | }, 1224 | "execution_count": 21, 1225 | "metadata": {}, 1226 | "output_type": "execute_result" 1227 | } 1228 | ], 1229 | "source": [ 1230 | "#Random forest importances\n", 1231 | "\n", 1232 | "importances = rf_tree.feature_importances_\n", 1233 | "std = np.std([tree.feature_importances_ for tree in rf_tree.estimators_],\n", 1234 | " axis=0)\n", 1235 | "indices = np.argsort(importances)[::-1]\n", 1236 | "\n", 1237 | "indices\n", 1238 | "\n", 1239 | "#Print the feature ranking\n", 1240 | "print(\"Feature ranking:\")\n", 1241 | "\n", 1242 | "for f in range(0,11):\n", 1243 | " print(\"%d. 
%s (%f)\" % (f, final_data_X_train.columns[indices[f]], importances[indices[f]]))\n", 1244 | "\n", 1245 | "# Plot the feature importances of the forest\n", 1246 | "plt.figure()\n", 1247 | "plt.title(\"Feature importances\")\n", 1248 | "plt.bar(range(0,11), importances[indices],\n", 1249 | " color=\"r\", yerr=std[indices], align=\"center\")\n", 1250 | "plt.show()" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": 52, 1256 | "metadata": {}, 1257 | "outputs": [], 1258 | "source": [ 1259 | "gb_feat = gb_feat.sort_values(by='importance', ascending= False).head(20)" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "code", 1264 | "execution_count": 53, 1265 | "metadata": {}, 1266 | "outputs": [ 1267 | { 1268 | "data": { 1269 | "image/png": "\n", 1270 | "text/plain": [ 1271 | "
" 1272 | ] 1273 | }, 1274 | "metadata": { 1275 | "needs_background": "light" 1276 | }, 1277 | "output_type": "display_data" 1278 | } 1279 | ], 1280 | "source": [ 1281 | "plt.figure(figsize=(8, 7.5))\n", 1282 | "plt.barh(width=gb_feat.importance, y=gb_feat.feature);" 1283 | ] 1284 | }, 1285 | { 1286 | "cell_type": "code", 1287 | "execution_count": 34, 1288 | "metadata": {}, 1289 | "outputs": [ 1290 | { 1291 | "data": { 1292 | "text/html": [ 1293 | "
\n", 1294 | "\n", 1307 | "\n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | "
featureimportance
5fresno_relative_humidity0.449245
3Hour0.260940
4fresno_temperature0.097121
54Day_of_week0.059365
10chico_relative_humidity0.017765
9chico_temperature0.017347
55Week0.012169
2Day0.011900
45redding_relative_humidity0.006338
24ontario_temperature0.004155
29moreno_valley_temperature0.003867
19anaheim_temperature0.003583
43concord_wind_speed0.003421
1Month0.003416
8fresno_wind_speed0.002980
32moreno_valley_dew_point0.002770
13chico_wind_speed0.002438
38san_francisco_wind_speed0.002303
44redding_temperature0.002288
40concord_relative_humidity0.002004
\n", 1418 | "
" 1419 | ], 1420 | "text/plain": [ 1421 | " feature importance\n", 1422 | "5 fresno_relative_humidity 0.449245\n", 1423 | "3 Hour 0.260940\n", 1424 | "4 fresno_temperature 0.097121\n", 1425 | "54 Day_of_week 0.059365\n", 1426 | "10 chico_relative_humidity 0.017765\n", 1427 | "9 chico_temperature 0.017347\n", 1428 | "55 Week 0.012169\n", 1429 | "2 Day 0.011900\n", 1430 | "45 redding_relative_humidity 0.006338\n", 1431 | "24 ontario_temperature 0.004155\n", 1432 | "29 moreno_valley_temperature 0.003867\n", 1433 | "19 anaheim_temperature 0.003583\n", 1434 | "43 concord_wind_speed 0.003421\n", 1435 | "1 Month 0.003416\n", 1436 | "8 fresno_wind_speed 0.002980\n", 1437 | "32 moreno_valley_dew_point 0.002770\n", 1438 | "13 chico_wind_speed 0.002438\n", 1439 | "38 san_francisco_wind_speed 0.002303\n", 1440 | "44 redding_temperature 0.002288\n", 1441 | "40 concord_relative_humidity 0.002004" 1442 | ] 1443 | }, 1444 | "execution_count": 34, 1445 | "metadata": {}, 1446 | "output_type": "execute_result" 1447 | } 1448 | ], 1449 | "source": [ 1450 | "gb_feat['feature'] = df['B'].str.slice_replace(1, 3, 'AAA')" 1451 | ] 1452 | }, 1453 | { 1454 | "cell_type": "code", 1455 | "execution_count": 82, 1456 | "metadata": {}, 1457 | "outputs": [ 1458 | { 1459 | "name": "stdout", 1460 | "output_type": "stream", 1461 | "text": [ 1462 | "Feature ranking:\n", 1463 | "0. fresno_relative_humidity (0.373411)\n", 1464 | "1. Hour (0.255585)\n", 1465 | "2. lemoore_temperature (0.111071)\n", 1466 | "3. fresno_temperature (0.063811)\n", 1467 | "4. Day_of_week (0.059408)\n", 1468 | "5. lemoore_relative_humidity (0.021034)\n", 1469 | "6. chico_relative_humidity (0.015598)\n", 1470 | "7. Week (0.012467)\n", 1471 | "8. Day (0.011358)\n", 1472 | "9. chico_temperature (0.006728)\n", 1473 | "10. san_diego_dew_point (0.005414)\n", 1474 | "11. san_diego_temperature (0.005129)\n", 1475 | "12. los_angeles_temperature (0.003860)\n", 1476 | "13. roseville_relative_humidity (0.003626)\n", 1477 | "14. 
Month (0.003414)\n", 1478 | "15. los_angeles_wind_speed (0.003391)\n", 1479 | "16. sacramento_relative_humidity (0.003040)\n", 1480 | "17. chico_wind_speed (0.002434)\n", 1481 | "18. san_diego_wind_speed (0.002428)\n", 1482 | "19. fresno_wind_speed (0.002338)\n", 1483 | "20. los_angeles_dew_point (0.002248)\n", 1484 | "21. roseville_temperature (0.002050)\n", 1485 | "22. san_diego_relative_humidity (0.001976)\n", 1486 | "23. san_jose_wind_speed (0.001913)\n", 1487 | "24. los_angeles_relative_humidity (0.001855)\n", 1488 | "25. lemoore_wind_speed (0.001718)\n", 1489 | "26. sacramento_temperature (0.001360)\n", 1490 | "27. petaluma_relative_humidity (0.001332)\n", 1491 | "28. san_jose_relative_humidity (0.001241)\n", 1492 | "29. petaluma_dew_point (0.001182)\n", 1493 | "30. san_francisco_relative_humidity (0.001125)\n", 1494 | "31. san_francisco_wind_speed (0.001094)\n", 1495 | "32. lemoore_dew_point (0.001045)\n", 1496 | "33. chico_dew_point (0.001042)\n", 1497 | "34. petaluma_wind_speed (0.001038)\n", 1498 | "35. roseville_dew_point (0.001030)\n", 1499 | "36. sacramento_wind_speed (0.000986)\n", 1500 | "37. san_jose_dew_point (0.000977)\n", 1501 | "38. fresno_dew_point (0.000958)\n", 1502 | "39. sacramento_dew_point (0.000957)\n", 1503 | "40. roseville_wind_speed (0.000861)\n", 1504 | "41. san_jose_temperature (0.000822)\n", 1505 | "42. san_francisco_dew_point (0.000719)\n", 1506 | "43. san_francisco_temperature (0.000588)\n", 1507 | "44. petaluma_temperature (0.000514)\n", 1508 | "45. los_angeles_cloud_type (0.000450)\n", 1509 | "46. san_francisco_cloud_type (0.000431)\n", 1510 | "47. chico_cloud_type (0.000417)\n", 1511 | "48. sacramento_cloud_type (0.000407)\n", 1512 | "49. san_diego_cloud_type (0.000395)\n", 1513 | "50. roseville_cloud_type (0.000379)\n", 1514 | "51. petaluma_cloud_type (0.000360)\n", 1515 | "52. san_jose_cloud_type (0.000345)\n", 1516 | "53. lemoore_cloud_type (0.000324)\n", 1517 | "54. fresno_cloud_type (0.000314)\n", 1518 | "55. 
Year (0.000005)\n" 1519 | ] 1520 | }, 1521 | { 1522 | "data": { 1523 | "image/png": "\n", 1524 | "text/plain": [ 1525 | "
" 1526 | ] 1527 | }, 1528 | "metadata": { 1529 | "needs_background": "light" 1530 | }, 1531 | "output_type": "display_data" 1532 | } 1533 | ], 1534 | "source": [ 1535 | "#Random Forest Importances\n", 1536 | "importances = rf_tree.feature_importances_\n", 1537 | "std = np.std([tree.feature_importances_ for tree in rf_tree.estimators_],\n", 1538 | " axis=0)\n", 1539 | "indices = np.argsort(importances)[::-1]\n", 1540 | "\n", 1541 | "# Print the feature ranking\n", 1542 | "print(\"Feature ranking:\")\n", 1543 | "\n", 1544 | "for f in range(final_data_X_train.shape[1]):\n", 1545 | " print(\"%d. %s (%f)\" % (f, final_data_X_train.columns[indices[f]], importances[indices[f]]))\n", 1546 | "\n", 1547 | "# Plot the feature importances of the forest\n", 1548 | "plt.figure()\n", 1549 | "plt.title(\"Feature Importances Random Forest (Two-Stage Clustering) with Error Bars\")\n", 1550 | "plt.bar(range(final_data_X_train.shape[1]), importances[indices],\n", 1551 | " color=\"r\", yerr=std[indices], align=\"center\")\n", 1552 | "plt.rcParams['figure.figsize'] = [20, 10]\n", 1553 | "plt.savefig(\"C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/features_importances.png\")" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "code", 1558 | "execution_count": null, 1559 | "metadata": {}, 1560 | "outputs": [], 1561 | "source": [] 1562 | }, 1563 | { 1564 | "cell_type": "code", 1565 | "execution_count": null, 1566 | "metadata": {}, 1567 | "outputs": [], 1568 | "source": [] 1569 | } 1570 | ], 1571 | "metadata": { 1572 | "kernelspec": { 1573 | "display_name": "Python 3", 1574 | "language": "python", 1575 | "name": "python3" 1576 | }, 1577 | "language_info": { 1578 | "codemirror_mode": { 1579 | "name": "ipython", 1580 | "version": 3 1581 | }, 1582 | "file_extension": ".py", 1583 | "mimetype": "text/x-python", 1584 | "name": "python", 1585 | "nbconvert_exporter": "python", 1586 | "pygments_lexer": "ipython3", 1587 | "version": "3.7.4" 1588 | } 1589 | }, 1590 | 
"nbformat": 4, 1591 | "nbformat_minor": 2 1592 | } 1593 | --------------------------------------------------------------------------------