├── CAISO Load.xlsx
├── Data_Workbook.xlsx
├── ElectricityByCounty.xlsx
├── CAISO Load Forecasting.pdf
├── City_Electricity_Data_1.xlsx
├── README.md
└── COMP152_FinalProject_TwoStagePerformance.ipynb
/CAISO Load.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/CAISO Load.xlsx
--------------------------------------------------------------------------------
/Data_Workbook.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/Data_Workbook.xlsx
--------------------------------------------------------------------------------
/ElectricityByCounty.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/ElectricityByCounty.xlsx
--------------------------------------------------------------------------------
/CAISO Load Forecasting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/CAISO Load Forecasting.pdf
--------------------------------------------------------------------------------
/City_Electricity_Data_1.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhgao/load-forecasting-CAISO/HEAD/City_Electricity_Data_1.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # load-forecasting-CAISO
2 | This novel model and its associated paper propose a two-stage K-means clustering for variable selection, followed by decision trees and support vector regressors for day-ahead load forecasting in the CAISO electricity market.
3 |
4 | An account of the results and methodology is given in the report: [CAISO Load Forecasting.pdf](CAISO%20Load%20Forecasting.pdf)
5 |
6 | The team for the project includes: Martin Gao, Daniel Lee, Paul Maina, and Zohaib Siddique \
7 | martin.gao@tufts.edu; daniel.c_lee@tufts.edu; paul.maina@tufts.edu; zohaib.siddique@tufts.edu \
8 | Department of Computer Science, Tufts University, 161 College Ave, 02155 Medford, MA, USA
9 |
--------------------------------------------------------------------------------
/COMP152_FinalProject_TwoStagePerformance.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#Call required libraries\n",
10 | "import time \n",
11 | "import warnings \n",
12 | "import numpy as np \n",
13 | "import pandas as pd \n",
14 | "import matplotlib.pyplot as plt \n",
15 | "import seaborn as sns\n",
16 | "from sklearn.preprocessing import StandardScaler \n",
17 | "from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation\n",
18 | "from sklearn.mixture import GaussianMixture \n",
19 | "\n",
20 | "#inputs\n",
21 | "# Every raw input sits in one folder. Keeping that location in a single variable means\n",
22 | "# only this line has to change when the notebook is run from another machine.\n",
23 | "data_dir = 'C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/'\n",
24 | "\n",
25 | "# A list of sheet names makes read_excel return a dict of DataFrames keyed by sheet name,\n",
26 | "# so each workbook is read with one call instead of one call per sheet.\n",
27 | "workbook_sheets = pd.read_excel(data_dir + 'Data_Workbook.xlsx',\n",
28 | "                                sheet_name = ['Holiday Dates', 'City-Lat-Long', 'Load By County 2018',\n",
29 | "                                              'GDP By County 2018', 'City Population'])\n",
30 | "holiday_pd = workbook_sheets['Holiday Dates']\n",
31 | "lat_long_data = workbook_sheets['City-Lat-Long']\n",
32 | "county_load = workbook_sheets['Load By County 2018']\n",
33 | "county_gdp = workbook_sheets['GDP By County 2018']\n",
34 | "city_pop = workbook_sheets['City Population']\n",
35 | "\n",
36 | "caiso_sheets = pd.read_excel(data_dir + 'CAISO Load.xlsx', sheet_name = ['2016', '2017', '2018'])\n",
37 | "caiso_2016 = caiso_sheets['2016']\n",
38 | "caiso_2017 = caiso_sheets['2017']\n",
39 | "caiso_2018 = caiso_sheets['2018']\n",
40 | "\n",
41 | "city_wx_data = pd.read_csv(data_dir + 'all_wx_data.csv')\n",
30 | "\n",
31 | "#select specific columns not all\n",
32 | "wx_columns = ['City', 'Year', 'Month', 'Day', 'Hour', 'Temperature',\n",
33 | "              'Relative Humidity',\n",
34 | "              'Cloud Type', 'Dew Point', 'Wind Speed']\n",
35 | "city_wx_data = city_wx_data[wx_columns].drop_duplicates()\n",
36 | "\n",
37 | "# Stack the three yearly CAISO sheets into one frame. pd.concat stands in for DataFrame.append,\n",
38 | "# which pandas deprecated in 1.4 and removed in 2.0; like append, it keeps each sheet's row labels.\n",
39 | "caiso_total = pd.concat([caiso_2016, caiso_2017, caiso_2018])\n",
40 | "caiso_total['Date'] = pd.to_datetime(caiso_total['Date'])\n",
41 | "\n",
42 | "# Calendar fields derived from the timestamp\n",
43 | "caiso_total['Month'] = caiso_total['Date'].dt.month\n",
44 | "caiso_total['Day'] = caiso_total['Date'].dt.day\n",
45 | "caiso_total['Year'] = caiso_total['Date'].dt.year\n",
46 | "caiso_total['Day_of_week'] = caiso_total['Date'].dt.dayofweek\n",
47 | "# NOTE(review): Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;\n",
48 | "# newer pandas needs caiso_total['Date'].dt.isocalendar().week here.\n",
49 | "caiso_total['Week'] = caiso_total['Date'].dt.week\n",
50 | "# HE is renamed to Hour, the name the weather data uses for its hour column\n",
51 | "caiso_total = caiso_total.rename(columns={\"HE\": \"Hour\"})\n",
44 | "\n",
45 | "#read-in\n",
46 | "city_elec = pd.read_excel('C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/City_Electricity_Data_1.xlsx')\n",
47 | "\n",
48 | "# Keep only complete, unique rows. The explicit copy gives an independent frame, so the column\n",
49 | "# assignments below do not write to a slice of city_elec (which raises SettingWithCopyWarning).\n",
50 | "city_elec_complete = city_elec.dropna().drop_duplicates().copy()\n",
51 | "city_elec_complete['total_elec_mwh'] = city_elec_complete['res_elec_mwh']+city_elec_complete['com_elec_mwh']+city_elec_complete['ind_elec_mwh']\n",
52 | "city_elec_complete = city_elec_complete[['City','housing_units', 'total_pop', 'res_elec_mwh', 'com_units',\n",
53 | "                                         'com_elec_mwh', 'ind_units', 'ind_elec_mwh','total_elec_mwh']].copy()\n",
54 | "\n",
55 | "#gather top10 commercial,industrial,residential\n",
56 | "def _top10_by_usage(usage_col, usage_label):\n",
57 | "    # Ten cities with the highest value in usage_col; the usage column is renamed to usage_label\n",
58 | "    top10 = city_elec_complete.sort_values(ascending = False,by = usage_col).head(10)[['City', 'total_pop', usage_col]]\n",
59 | "    top10.columns = ['City', 'Total Population', usage_label]\n",
60 | "    return top10\n",
61 | "\n",
62 | "top10_commercial = _top10_by_usage('com_elec_mwh', 'Commercial MWh Usage')\n",
63 | "top10_industrial = _top10_by_usage('ind_elec_mwh', 'Industrial MWh Usage')\n",
64 | "top10_residential = _top10_by_usage('res_elec_mwh', 'Residential MWh Usage')\n",
65 | "top10_consumption = _top10_by_usage('total_elec_mwh', 'Total MWh Usage')\n",
66 | "\n",
67 | "#create percentages\n",
68 | "# Share of each sector in the city's total consumption, stored as a fraction (not multiplied by 100)\n",
69 | "for sector in ('ind', 'res', 'com'):\n",
70 | "    city_elec_complete[sector + '_pct'] = city_elec_complete[sector + '_elec_mwh']/city_elec_complete['total_elec_mwh']\n",
71 | "\n",
72 | "#correlation charts\n",
73 | "correlations_2 = city_elec_complete[['total_pop','res_pct', 'com_pct', 'ind_pct']].corr()\n",
74 | "correlations_1 = city_elec_complete[['total_pop','res_elec_mwh','com_elec_mwh','ind_elec_mwh']].corr()"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "name": "stderr",
79 | "output_type": "stream",
80 | "text": [
81 | "C:\\Users\\zohai\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:144: FutureWarning: The sklearn.cluster.k_means_ module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.cluster. Anything that cannot be imported from sklearn.cluster is now part of the private API.\n",
82 | " warnings.warn(message, FutureWarning)\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "#Time series clustering\n",
88 | "from tslearn.clustering import TimeSeriesKMeans\n",
89 | "import warnings\n",
90 | "warnings.filterwarnings('ignore')\n",
91 | "\n",
92 | "#Scale the data as the data is in different units\n",
93 | "from sklearn.preprocessing import StandardScaler\n",
94 | "\n",
95 | "req_cols = ['Temperature',\n",
96 | "            'Relative Humidity',\n",
97 | "            'Cloud Type', 'Dew Point', 'Wind Speed']\n",
98 | "city_wx_cluster = city_wx_data[['City'] + req_cols]\n",
99 | "scaler = StandardScaler()\n",
100 | "\n",
101 | "#scale for clustering\n",
102 | "city_wx_cluster_scaled = scaler.fit_transform(city_wx_cluster[req_cols])\n",
103 | "# Reuse the source row labels: city_wx_data went through drop_duplicates, so its index can have\n",
104 | "# gaps, and a fresh 0..n-1 index would make the label-aligned City assignment below attach\n",
105 | "# cities to the wrong rows (or leave them missing).\n",
106 | "city_wx_cluster_scaled_df = pd.DataFrame(city_wx_cluster_scaled, index = city_wx_cluster.index)\n",
107 | "\n",
108 | "#Add back the city column\n",
109 | "city_wx_cluster_scaled_df['City'] = city_wx_cluster['City']\n",
110 | "\n",
111 | "# Sorted so the order of the series handed to the clusterer is the same on every run;\n",
112 | "# the order of list(set(...)) over strings can change between interpreter sessions,\n",
113 | "# which would defeat the fixed random_state below.\n",
114 | "list_of_cities = sorted(city_wx_cluster['City'].dropna().unique())\n",
115 | "\n",
116 | "# One array per city holding its five scaled weather variables (columns 0-4)\n",
117 | "wx_cluster = [np.array(city_wx_cluster_scaled_df[city_wx_cluster_scaled_df.City == city].iloc[:,0:5])\n",
118 | "              for city in list_of_cities]\n",
119 | "\n",
120 | "#Assign Labels\n",
121 | "ts_kmeans_optimal = TimeSeriesKMeans(n_clusters = 5, metric=\"euclidean\", max_iter = 3, max_iter_barycenter=3, random_state = 5).fit(wx_cluster)\n",
122 | "wx_cluster_label = list(ts_kmeans_optimal.labels_)\n",
123 | "city_wx_cluster = pd.DataFrame(list_of_cities)\n",
124 | "city_wx_cluster['Weather Cluster'] = wx_cluster_label\n",
125 | "city_wx_cluster.columns = ['City', 'Weather Cluster']\n",
126 | "\n",
127 | "#get all stats\n",
128 | "# One frame per weather cluster label 0-4, each joined to the city electricity data on City\n",
129 | "wx_cluster_0, wx_cluster_1, wx_cluster_2, wx_cluster_3, wx_cluster_4 = (\n",
130 | "    city_wx_cluster[city_wx_cluster['Weather Cluster'] == label].merge(city_elec_complete,on='City')\n",
131 | "    for label in range(5))"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 3,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "#create elbow function plot\n",
140 | "def plot_elbow_method(cluster_df, max_k = 4):\n",
141 | "    # Plot the within-cluster sum of squares of K-means fits for K = 1..max_k.\n",
142 | "    # cluster_df: the data passed to KMeans.fit\n",
143 | "    # max_k: largest K to try; defaults to 4, the upper end of the range that was fixed here before\n",
144 | "    k_values = range(1, max_k + 1)\n",
145 | "    # inertia_ of a fitted model is what gets plotted as the within-cluster sum of squares\n",
146 | "    wcss = [KMeans(n_clusters = k, init = 'k-means++',max_iter = 300, n_init = 10, random_state = 5).fit(cluster_df).inertia_\n",
147 | "            for k in k_values]\n",
148 | "\n",
149 | "    plt.plot(k_values,wcss)\n",
150 | "    plt.title('Optimal K/Elbow method visualization')\n",
151 | "    plt.ylabel('Within Cluster Sum-of-Squares')\n",
152 | "    plt.xlabel('Number of Clusters/K')\n",
153 | "    plt.show()\n",
152 | " \n",
153 | "# Columns that describe a city's electricity mix; used as the clustering features\n",
154 | "req_columns = ['res_pct', 'com_pct', 'ind_pct']\n",
155 | "\n",
156 | "#create kmeans labels\n",
157 | "def get_kmeans_labels(cluster_df, n_clusters = 2):\n",
158 | "    # Fit K-means on cluster_df and return its labels_ as a list, one entry per row.\n",
159 | "    # n_clusters defaults to 2, the value that was previously fixed inside the function;\n",
160 | "    # random_state is fixed at 5.\n",
161 | "    kmeans = KMeans(n_clusters = n_clusters, init = 'k-means++',max_iter = 300, n_init = 10, random_state = 5)\n",
162 | "    return list(kmeans.fit(cluster_df).labels_)\n",
160 | "\n",
161 | "#create centroid \n",
162 | "def get_closest_to_centroid(cluster_df, n_clusters = 2):\n",
163 | "    # Return, for each K-means centroid, the row position in cluster_df of the sample nearest to it.\n",
164 | "    # n_clusters defaults to 2, the value that was previously fixed inside the function.\n",
165 | "    from sklearn.metrics import pairwise_distances_argmin_min\n",
166 | "    kmeans = KMeans(n_clusters = n_clusters, init = 'k-means++',max_iter = 300, n_init = 10, random_state = 5).fit(cluster_df)\n",
167 | "    # pairwise_distances_argmin_min returns (indices, distances); only the indices are kept\n",
168 | "    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, cluster_df)\n",
169 | "    return closest\n",
167 | "\n",
168 | "# Electricity-mix columns (req_columns) of every weather cluster\n",
169 | "cluster_0_sub, cluster_1_sub, cluster_2_sub, cluster_3_sub, cluster_4_sub = (\n",
170 | "    weather_frame[req_columns]\n",
171 | "    for weather_frame in (wx_cluster_0, wx_cluster_1, wx_cluster_2, wx_cluster_3, wx_cluster_4))\n",
172 | "\n",
173 | "#scaling for clustering\n",
174 | "from sklearn.preprocessing import StandardScaler\n",
175 | "scaler = StandardScaler()\n",
176 | "\n",
177 | "# fit_transform re-fits the shared scaler on each call, so every cluster is standardized on its\n",
178 | "# own, and scaler is left fitted on cluster 4 once this statement has run.\n",
179 | "(cluster_0_sub_scaled, cluster_1_sub_scaled, cluster_2_sub_scaled,\n",
180 | " cluster_3_sub_scaled, cluster_4_sub_scaled) = (\n",
181 | "    scaler.fit_transform(share_frame)\n",
182 | "    for share_frame in (cluster_0_sub, cluster_1_sub, cluster_2_sub, cluster_3_sub, cluster_4_sub))"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 5,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "# Second stage: label every city of each weather cluster with an electricity-mix sub-cluster\n",
192 | "for weather_frame in (wx_cluster_0, wx_cluster_1, wx_cluster_2, wx_cluster_3, wx_cluster_4):\n",
193 | "    weather_frame['Electric Sub-Cluster'] = get_kmeans_labels(weather_frame[req_columns])\n",
194 | "\n",
195 | "# Columns kept for the combined export, listed once so the five selections cannot drift apart\n",
196 | "export_cols = ['City', 'Weather Cluster', 'Electric Sub-Cluster','ind_pct', 'res_pct', 'com_pct', 'total_pop' , 'total_elec_mwh']\n",
197 | "ex_wx_cluster_0 = wx_cluster_0[export_cols]\n",
198 | "ex_wx_cluster_1 = wx_cluster_1[export_cols]\n",
199 | "ex_wx_cluster_2 = wx_cluster_2[export_cols]\n",
200 | "ex_wx_cluster_3 = wx_cluster_3[export_cols]\n",
201 | "ex_wx_cluster_4 = wx_cluster_4[export_cols]\n",
202 | "\n",
203 | "# pd.concat stands in for DataFrame.append (removed in pandas 2.0) and likewise keeps each frame's row labels\n",
204 | "all_ex_wx_cluster = pd.concat([ex_wx_cluster_0, ex_wx_cluster_1, ex_wx_cluster_2, ex_wx_cluster_3, ex_wx_cluster_4])\n",
205 | "#arcgis data\n",
206 | "arcgis_data_all = all_ex_wx_cluster[['City', 'Weather Cluster', 'Electric Sub-Cluster', 'total_pop' , 'total_elec_mwh']]"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 38,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "def _sub_cluster(weather_frame, sub_label):\n",
215 | "    # Rows of one weather cluster that carry the given 'Electric Sub-Cluster' label\n",
216 | "    return weather_frame[weather_frame['Electric Sub-Cluster'] == sub_label]\n",
217 | "\n",
218 | "def _top_load_city(weather_frame, sub_label):\n",
219 | "    # City with the largest total_elec_mwh inside the sub-cluster. The lookup stays inside the\n",
220 | "    # sub-cluster: matching the max value against the whole weather cluster returned more than one\n",
221 | "    # row (and made .item() raise) whenever another row happened to hold the same value.\n",
222 | "    members = _sub_cluster(weather_frame, sub_label)\n",
223 | "    return members.loc[members['total_elec_mwh'].idxmax(), 'City']\n",
224 | "\n",
225 | "# Largest total load per (weather cluster, electric sub-cluster) pair\n",
226 | "wx00_max_load = _sub_cluster(wx_cluster_0, 0)['total_elec_mwh'].max()\n",
227 | "wx01_max_load = _sub_cluster(wx_cluster_0, 1)['total_elec_mwh'].max()\n",
228 | "wx10_max_load = _sub_cluster(wx_cluster_1, 0)['total_elec_mwh'].max()\n",
229 | "wx11_max_load = _sub_cluster(wx_cluster_1, 1)['total_elec_mwh'].max()\n",
230 | "wx20_max_load = _sub_cluster(wx_cluster_2, 0)['total_elec_mwh'].max()\n",
231 | "wx21_max_load = _sub_cluster(wx_cluster_2, 1)['total_elec_mwh'].max()\n",
232 | "wx30_max_load = _sub_cluster(wx_cluster_3, 0)['total_elec_mwh'].max()\n",
233 | "wx31_max_load = _sub_cluster(wx_cluster_3, 1)['total_elec_mwh'].max()\n",
234 | "wx40_max_load = _sub_cluster(wx_cluster_4, 0)['total_elec_mwh'].max()\n",
235 | "wx41_max_load = _sub_cluster(wx_cluster_4, 1)['total_elec_mwh'].max()\n",
236 | "\n",
237 | "#Cities List based on max load in each subcluster\n",
238 | "city_1 = _top_load_city(wx_cluster_0, 0)\n",
239 | "city_2 = _top_load_city(wx_cluster_0, 1)\n",
240 | "city_3 = _top_load_city(wx_cluster_1, 0)\n",
241 | "city_4 = _top_load_city(wx_cluster_1, 1)\n",
242 | "city_5 = _top_load_city(wx_cluster_2, 0)\n",
243 | "city_6 = _top_load_city(wx_cluster_2, 1)\n",
244 | "city_7 = _top_load_city(wx_cluster_3, 0)\n",
245 | "city_8 = _top_load_city(wx_cluster_3, 1)\n",
246 | "city_9 = _top_load_city(wx_cluster_4, 0)\n",
247 | "city_10 = _top_load_city(wx_cluster_4, 1)\n",
248 | "\n",
249 | "# NOTE(review): this list is typed in by hand and does not read city_1..city_10 computed above;\n",
250 | "# confirm it still matches them whenever the clustering is re-run.\n",
251 | "cities_list = ['Petaluma' , 'San Francisco', 'Sacramento', 'San Jose', 'Los Angeles', 'San Diego', 'Roseville', 'Chico', 'Fresno', 'Lemoore']\n",
252 | "cities_list_df = pd.DataFrame(cities_list)\n",
253 | "cities_list_df.columns = ['City']\n"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 40,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "{'max_depth': 7, 'max_features': 50, 'max_leaf_nodes': 192}\n",
252 | "0.9542729942738374\n",
253 | "0.7294860967589885\n"
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "# tuned tree\n",
259 | "from sklearn.model_selection import RandomizedSearchCV\n",
260 | "from sklearn.model_selection import GridSearchCV \n",
261 | "from sklearn.ensemble import RandomForestRegressor\n",
262 | "from sklearn.svm import SVR\n",
263 | "from scipy.stats import randint\n",
264 | "\n",
265 | "parameters = {'max_leaf_nodes': randint(3, 200),\n",
266 | "              'max_features': randint(2, 76),\n",
267 | "              'max_depth': randint(1, 10)}\n",
268 | "rnd_params = RandomizedSearchCV(tree, param_distributions = parameters, cv = 10, n_iter = 200)\n",
269 | "rnd_params.fit(final_data_X_train, final_data_Y_train)\n",
270 | "\n",
271 | "best_parameters = rnd_params.best_params_\n",
272 | "print(best_parameters)\n",
273 | "\n",
274 | "# Apply every searched setting. max_depth used to be left out here, so the refitted tree\n",
275 | "# was not the configuration the search had selected.\n",
276 | "tree.set_params(**best_parameters)\n",
277 | "tree.fit(final_data_X_train, final_data_Y_train)\n",
278 | "\n",
279 | "tuned_train_score = tree.score(final_data_X_train, final_data_Y_train)\n",
280 | "tuned_test_score = tree.score(final_data_X_test, final_data_Y_test)\n",
281 | "y_pred_tuned_tree = tree.predict(final_data_X_test)\n",
282 | "\n",
283 | "#Tuned random forest\n",
284 | "# The search works on clones of rf_tree, so the forest does not need fitting before the search\n",
285 | "rf_tree = RandomForestRegressor()\n",
286 | "\n",
287 | "param_dist = {'n_estimators': randint(10, 100),\n",
288 | "              'max_leaf_nodes': randint(3, 100),\n",
289 | "              'max_features': [\"auto\"],\n",
290 | "              'max_depth': randint(1, 10),\n",
291 | "              'min_samples_leaf': randint(1, 30),\n",
292 | "              'min_samples_split': randint(2, 20)}\n",
293 | "\n",
294 | "rnd_search_rf = RandomizedSearchCV(rf_tree, param_distributions=param_dist, \n",
295 | "                                   cv=10, n_iter=50)\n",
296 | "rnd_search_rf.fit(final_data_X_train, final_data_Y_train)\n",
297 | "\n",
298 | "# set_params only changes the configuration. The forest has to be fitted again afterwards;\n",
299 | "# without the refit, predict() and feature_importances_ still came from the forest that was\n",
300 | "# trained with default settings before the search.\n",
301 | "rf_tree.set_params(**rnd_search_rf.best_params_)\n",
302 | "rf_tree.fit(final_data_X_train, final_data_Y_train)\n",
303 | "\n",
304 | "\n",
305 | "#Tuned SVR\n",
306 | "# Fit the scaler on the training rows only and reuse it for the test rows, so both sets go\n",
307 | "# through the same transformation (the test rows used to get a scaler fitted on themselves).\n",
308 | "svr_scaler = StandardScaler().fit(final_data_X_train)\n",
309 | "svr_X_train = svr_scaler.transform(final_data_X_train)\n",
310 | "svr_X_test = svr_scaler.transform(final_data_X_test)\n",
311 | "\n",
312 | "svr_regressor = SVR()\n",
313 | "svr_regressor.fit(svr_X_train, final_data_Y_train) \n",
314 | "param_grid = {'C': [0.1, 1, 10, 100, 1000], \n",
315 | "              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], \n",
316 | "              'kernel': ['linear','rbf']} \n",
317 | " \n",
318 | "# RandomizedSearchCV samples settings from param_grid; it does not try the full grid\n",
319 | "svr_regressor_1 = RandomizedSearchCV(SVR(), param_distributions = param_grid) \n",
320 | "svr_regressor_1.fit(svr_X_train, final_data_Y_train) \n",
321 | "y_pred_svr = svr_regressor_1.predict(svr_X_test)"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 43,
323 | "metadata": {},
324 | "outputs": [
325 | {
326 | "name": "stdout",
327 | "output_type": "stream",
328 | "text": [
329 | "[744, 672, 744, 720, 744, 720, 744, 744, 720, 744, 720, 720]\n"
330 | ]
331 | }
332 | ],
333 | "source": [
334 | "# Actual load for the 2018 rows. The copy gives an independent frame, so the prediction and\n",
335 | "# error columns added below are not written to a slice of final_data.\n",
336 | "results_df = final_data.loc[final_data.Year == 2018, ['Day', 'Month','Hour', 'Year', 'CAISO Total']].copy()\n",
337 | "results_df['RF_Tuned_Prediction'] = rf_tree.predict(final_data_X_test)\n",
338 | "results_df['SVR_Prediction'] = y_pred_svr\n",
339 | "results_df['DTree_Tuned_Prediction'] = y_pred_tuned_tree\n",
340 | "\n",
341 | "# Absolute percentage error of every model for every row. The column order is kept as it was:\n",
342 | "# create_mape_table reads the error columns by position (index 8 onward).\n",
343 | "for error_col, prediction_col in [('Hourly_MAPE_RF_Tuned', 'RF_Tuned_Prediction'),\n",
344 | "                                  ('Hourly_MAPE_SVR', 'SVR_Prediction'),\n",
345 | "                                  ('Hourly_MAPE_DTree_Tuned', 'DTree_Tuned_Prediction')]:\n",
346 | "    results_df[error_col] = abs((results_df['CAISO Total'] - results_df[prediction_col])/results_df['CAISO Total'])*100\n",
344 | "\n",
345 | "#mape functions\n",
346 | "\n",
347 | "def get_month_count(df):\n",
348 | "    # Return a 12-item list with the number of rows of df per calendar month, read from the\n",
349 | "    # 'Month' column: position 0 counts month 1, position 11 counts month 12.\n",
350 | "    # Months that never occur are reported as 0; values outside 1-12 are ignored.\n",
351 | "    rows_per_month = df['Month'].value_counts()\n",
352 | "    return [int(rows_per_month.get(month, 0)) for month in range(1, 13)]\n",
353 | "\n",
354 | "d = get_month_count(results_df)\n",
355 | "print(d)\n",
378 | "\n",
379 | "import pandas as pd \n",
380 | "\n",
381 | "def get_mape_by_month(df, column_name):\n",
382 | "    # Total of column_name for every value of 'Month', as a one-column DataFrame indexed by Month.\n",
383 | "    # Despite the name this is a sum; mean() below turns it into an average.\n",
384 | "    final = df.pivot_table(column_name, index = 'Month', aggfunc='sum')\n",
385 | "    return final \n",
386 | "\n",
387 | "def mean(dataframe, d):\n",
388 | "    # Divide the row at position i of dataframe by d[i], in place, and return the same object.\n",
389 | "    # NOTE(review): rows and d are paired by position, which assumes row i belongs to the month\n",
390 | "    # counted in d[i]; confirm this holds if a month can be absent from the data.\n",
391 | "    for position in range(len(dataframe)):\n",
392 | "        dataframe.iloc[position] = dataframe.iloc[position] / d[position]\n",
393 | "    return dataframe\n",
394 | "\n",
395 | "\n",
396 | "def create_mape_table(df, d, first_error_col = 8):\n",
397 | "    # Build a table with one row per month and one column per error column of df.\n",
398 | "    # df: frame with a 'Month' column; its columns from position first_error_col onward are averaged\n",
399 | "    # d: number of rows per month, as returned by get_month_count\n",
400 | "    # first_error_col: defaults to 8, the position that was previously fixed inside the function\n",
401 | "    monthly_frames = [mean(get_mape_by_month(df, error_col), d)\n",
402 | "                      for error_col in df.columns[first_error_col:]]\n",
403 | "    if not monthly_frames:\n",
404 | "        # pd.concat rejects an empty list; an empty frame is what the function returned before\n",
405 | "        return pd.DataFrame()\n",
406 | "    # A single concat at the end instead of growing the frame on every iteration\n",
407 | "    return pd.concat(monthly_frames, axis = 1)\n",
408 | "\n",
409 | "final_frame = create_mape_table(results_df, d)\n",
410 | "final_frame = final_frame.reset_index(drop = True)"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 46,
413 | "metadata": {},
414 | "outputs": [
415 | {
416 | "data": {
417 | "text/html": [
418 | "
\n",
419 | "\n",
432 | "
\n",
433 | " \n",
434 | " \n",
435 | " | \n",
436 | " Feature | \n",
437 | " Importance | \n",
438 | "
\n",
439 | " \n",
440 | " \n",
441 | " \n",
442 | " | 0 | \n",
443 | " fresno_relative_humidity | \n",
444 | " 0.373411 | \n",
445 | "
\n",
446 | " \n",
447 | " | 1 | \n",
448 | " Hour | \n",
449 | " 0.255585 | \n",
450 | "
\n",
451 | " \n",
452 | " | 2 | \n",
453 | " lemoore_temperature | \n",
454 | " 0.111071 | \n",
455 | "
\n",
456 | " \n",
457 | " | 3 | \n",
458 | " fresno_temperature | \n",
459 | " 0.063811 | \n",
460 | "
\n",
461 | " \n",
462 | " | 4 | \n",
463 | " Day_of_week | \n",
464 | " 0.059408 | \n",
465 | "
\n",
466 | " \n",
467 | " | 5 | \n",
468 | " lemoore_relative_humidity | \n",
469 | " 0.021034 | \n",
470 | "
\n",
471 | " \n",
472 | " | 6 | \n",
473 | " chico_relative_humidity | \n",
474 | " 0.015598 | \n",
475 | "
\n",
476 | " \n",
477 | " | 7 | \n",
478 | " Week | \n",
479 | " 0.012467 | \n",
480 | "
\n",
481 | " \n",
482 | " | 8 | \n",
483 | " Day | \n",
484 | " 0.011358 | \n",
485 | "
\n",
486 | " \n",
487 | " | 9 | \n",
488 | " chico_temperature | \n",
489 | " 0.006728 | \n",
490 | "
\n",
491 | " \n",
492 | " | 10 | \n",
493 | " san_diego_dew_point | \n",
494 | " 0.005414 | \n",
495 | "
\n",
496 | " \n",
497 | " | 11 | \n",
498 | " san_diego_temperature | \n",
499 | " 0.005129 | \n",
500 | "
\n",
501 | " \n",
502 | " | 12 | \n",
503 | " los_angeles_temperature | \n",
504 | " 0.003860 | \n",
505 | "
\n",
506 | " \n",
507 | " | 13 | \n",
508 | " roseville_relative_humidity | \n",
509 | " 0.003626 | \n",
510 | "
\n",
511 | " \n",
512 | " | 14 | \n",
513 | " Month | \n",
514 | " 0.003414 | \n",
515 | "
\n",
516 | " \n",
517 | " | 15 | \n",
518 | " los_angeles_wind_speed | \n",
519 | " 0.003391 | \n",
520 | "
\n",
521 | " \n",
522 | " | 16 | \n",
523 | " sacramento_relative_humidity | \n",
524 | " 0.003040 | \n",
525 | "
\n",
526 | " \n",
527 | " | 17 | \n",
528 | " chico_wind_speed | \n",
529 | " 0.002434 | \n",
530 | "
\n",
531 | " \n",
532 | " | 18 | \n",
533 | " san_diego_wind_speed | \n",
534 | " 0.002428 | \n",
535 | "
\n",
536 | " \n",
537 | " | 19 | \n",
538 | " fresno_wind_speed | \n",
539 | " 0.002338 | \n",
540 | "
\n",
541 | " \n",
542 | " | 20 | \n",
543 | " los_angeles_dew_point | \n",
544 | " 0.002248 | \n",
545 | "
\n",
546 | " \n",
547 | " | 21 | \n",
548 | " roseville_temperature | \n",
549 | " 0.002050 | \n",
550 | "
\n",
551 | " \n",
552 | " | 22 | \n",
553 | " san_diego_relative_humidity | \n",
554 | " 0.001976 | \n",
555 | "
\n",
556 | " \n",
557 | " | 23 | \n",
558 | " san_jose_wind_speed | \n",
559 | " 0.001913 | \n",
560 | "
\n",
561 | " \n",
562 | " | 24 | \n",
563 | " los_angeles_relative_humidity | \n",
564 | " 0.001855 | \n",
565 | "
\n",
566 | " \n",
567 | " | 25 | \n",
568 | " lemoore_wind_speed | \n",
569 | " 0.001718 | \n",
570 | "
\n",
571 | " \n",
572 | " | 26 | \n",
573 | " sacramento_temperature | \n",
574 | " 0.001360 | \n",
575 | "
\n",
576 | " \n",
577 | " | 27 | \n",
578 | " petaluma_relative_humidity | \n",
579 | " 0.001332 | \n",
580 | "
\n",
581 | " \n",
582 | " | 28 | \n",
583 | " san_jose_relative_humidity | \n",
584 | " 0.001241 | \n",
585 | "
\n",
586 | " \n",
587 | " | 29 | \n",
588 | " petaluma_dew_point | \n",
589 | " 0.001182 | \n",
590 | "
\n",
591 | " \n",
592 | " | 30 | \n",
593 | " san_francisco_relative_humidity | \n",
594 | " 0.001125 | \n",
595 | "
\n",
596 | " \n",
597 | " | 31 | \n",
598 | " san_francisco_wind_speed | \n",
599 | " 0.001094 | \n",
600 | "
\n",
601 | " \n",
602 | " | 32 | \n",
603 | " lemoore_dew_point | \n",
604 | " 0.001045 | \n",
605 | "
\n",
606 | " \n",
607 | " | 33 | \n",
608 | " chico_dew_point | \n",
609 | " 0.001042 | \n",
610 | "
\n",
611 | " \n",
612 | " | 34 | \n",
613 | " petaluma_wind_speed | \n",
614 | " 0.001038 | \n",
615 | "
\n",
616 | " \n",
617 | " | 35 | \n",
618 | " roseville_dew_point | \n",
619 | " 0.001030 | \n",
620 | "
\n",
621 | " \n",
622 | " | 36 | \n",
623 | " sacramento_wind_speed | \n",
624 | " 0.000986 | \n",
625 | "
\n",
626 | " \n",
627 | " | 37 | \n",
628 | " san_jose_dew_point | \n",
629 | " 0.000977 | \n",
630 | "
\n",
631 | " \n",
632 | " | 38 | \n",
633 | " fresno_dew_point | \n",
634 | " 0.000958 | \n",
635 | "
\n",
636 | " \n",
637 | " | 39 | \n",
638 | " sacramento_dew_point | \n",
639 | " 0.000957 | \n",
640 | "
\n",
641 | " \n",
642 | " | 40 | \n",
643 | " roseville_wind_speed | \n",
644 | " 0.000861 | \n",
645 | "
\n",
646 | " \n",
647 | " | 41 | \n",
648 | " san_jose_temperature | \n",
649 | " 0.000822 | \n",
650 | "
\n",
651 | " \n",
652 | " | 42 | \n",
653 | " san_francisco_dew_point | \n",
654 | " 0.000719 | \n",
655 | "
\n",
656 | " \n",
657 | " | 43 | \n",
658 | " san_francisco_temperature | \n",
659 | " 0.000588 | \n",
660 | "
\n",
661 | " \n",
662 | " | 44 | \n",
663 | " petaluma_temperature | \n",
664 | " 0.000514 | \n",
665 | "
\n",
666 | " \n",
667 | " | 45 | \n",
668 | " los_angeles_cloud_type | \n",
669 | " 0.000450 | \n",
670 | "
\n",
671 | " \n",
672 | " | 46 | \n",
673 | " san_francisco_cloud_type | \n",
674 | " 0.000431 | \n",
675 | "
\n",
676 | " \n",
677 | " | 47 | \n",
678 | " chico_cloud_type | \n",
679 | " 0.000417 | \n",
680 | "
\n",
681 | " \n",
682 | " | 48 | \n",
683 | " sacramento_cloud_type | \n",
684 | " 0.000407 | \n",
685 | "
\n",
686 | " \n",
687 | " | 49 | \n",
688 | " san_diego_cloud_type | \n",
689 | " 0.000395 | \n",
690 | "
\n",
691 | " \n",
692 | " | 50 | \n",
693 | " roseville_cloud_type | \n",
694 | " 0.000379 | \n",
695 | "
\n",
696 | " \n",
697 | " | 51 | \n",
698 | " petaluma_cloud_type | \n",
699 | " 0.000360 | \n",
700 | "
\n",
701 | " \n",
702 | " | 52 | \n",
703 | " san_jose_cloud_type | \n",
704 | " 0.000345 | \n",
705 | "
\n",
706 | " \n",
707 | " | 53 | \n",
708 | " lemoore_cloud_type | \n",
709 | " 0.000324 | \n",
710 | "
\n",
711 | " \n",
712 | " | 54 | \n",
713 | " fresno_cloud_type | \n",
714 | " 0.000314 | \n",
715 | "
\n",
716 | " \n",
717 | " | 55 | \n",
718 | " Year | \n",
719 | " 0.000005 | \n",
720 | "
\n",
721 | " \n",
722 | "
\n",
723 | "
"
724 | ],
725 | "text/plain": [
726 | " Feature Importance\n",
727 | "0 fresno_relative_humidity 0.373411\n",
728 | "1 Hour 0.255585\n",
729 | "2 lemoore_temperature 0.111071\n",
730 | "3 fresno_temperature 0.063811\n",
731 | "4 Day_of_week 0.059408\n",
732 | "5 lemoore_relative_humidity 0.021034\n",
733 | "6 chico_relative_humidity 0.015598\n",
734 | "7 Week 0.012467\n",
735 | "8 Day 0.011358\n",
736 | "9 chico_temperature 0.006728\n",
737 | "10 san_diego_dew_point 0.005414\n",
738 | "11 san_diego_temperature 0.005129\n",
739 | "12 los_angeles_temperature 0.003860\n",
740 | "13 roseville_relative_humidity 0.003626\n",
741 | "14 Month 0.003414\n",
742 | "15 los_angeles_wind_speed 0.003391\n",
743 | "16 sacramento_relative_humidity 0.003040\n",
744 | "17 chico_wind_speed 0.002434\n",
745 | "18 san_diego_wind_speed 0.002428\n",
746 | "19 fresno_wind_speed 0.002338\n",
747 | "20 los_angeles_dew_point 0.002248\n",
748 | "21 roseville_temperature 0.002050\n",
749 | "22 san_diego_relative_humidity 0.001976\n",
750 | "23 san_jose_wind_speed 0.001913\n",
751 | "24 los_angeles_relative_humidity 0.001855\n",
752 | "25 lemoore_wind_speed 0.001718\n",
753 | "26 sacramento_temperature 0.001360\n",
754 | "27 petaluma_relative_humidity 0.001332\n",
755 | "28 san_jose_relative_humidity 0.001241\n",
756 | "29 petaluma_dew_point 0.001182\n",
757 | "30 san_francisco_relative_humidity 0.001125\n",
758 | "31 san_francisco_wind_speed 0.001094\n",
759 | "32 lemoore_dew_point 0.001045\n",
760 | "33 chico_dew_point 0.001042\n",
761 | "34 petaluma_wind_speed 0.001038\n",
762 | "35 roseville_dew_point 0.001030\n",
763 | "36 sacramento_wind_speed 0.000986\n",
764 | "37 san_jose_dew_point 0.000977\n",
765 | "38 fresno_dew_point 0.000958\n",
766 | "39 sacramento_dew_point 0.000957\n",
767 | "40 roseville_wind_speed 0.000861\n",
768 | "41 san_jose_temperature 0.000822\n",
769 | "42 san_francisco_dew_point 0.000719\n",
770 | "43 san_francisco_temperature 0.000588\n",
771 | "44 petaluma_temperature 0.000514\n",
772 | "45 los_angeles_cloud_type 0.000450\n",
773 | "46 san_francisco_cloud_type 0.000431\n",
774 | "47 chico_cloud_type 0.000417\n",
775 | "48 sacramento_cloud_type 0.000407\n",
776 | "49 san_diego_cloud_type 0.000395\n",
777 | "50 roseville_cloud_type 0.000379\n",
778 | "51 petaluma_cloud_type 0.000360\n",
779 | "52 san_jose_cloud_type 0.000345\n",
780 | "53 lemoore_cloud_type 0.000324\n",
781 | "54 fresno_cloud_type 0.000314\n",
782 | "55 Year 0.000005"
783 | ]
784 | },
785 | "execution_count": 46,
786 | "metadata": {},
787 | "output_type": "execute_result"
788 | }
789 | ],
790 | "source": [
791 | "# Random Forest tree importances\n",
792 | "pd.DataFrame({'Feature':final_data_X_train.columns, 'Importance': rf_tree.feature_importances_}).sort_values(by = 'Importance', ascending = False).reset_index(drop=True)"
793 | ]
794 | },
795 | {
796 | "cell_type": "code",
797 | "execution_count": 47,
798 | "metadata": {},
799 | "outputs": [
800 | {
801 | "data": {
802 | "text/html": [
803 | "\n",
804 | "\n",
817 | "
\n",
818 | " \n",
819 | " \n",
820 | " | \n",
821 | " Feature | \n",
822 | " Importance | \n",
823 | "
\n",
824 | " \n",
825 | " \n",
826 | " \n",
827 | " | 0 | \n",
828 | " fresno_relative_humidity | \n",
829 | " 0.481964 | \n",
830 | "
\n",
831 | " \n",
832 | " | 1 | \n",
833 | " Hour | \n",
834 | " 0.267807 | \n",
835 | "
\n",
836 | " \n",
837 | " | 2 | \n",
838 | " Day_of_week | \n",
839 | " 0.062052 | \n",
840 | "
\n",
841 | " \n",
842 | " | 3 | \n",
843 | " fresno_temperature | \n",
844 | " 0.055203 | \n",
845 | "
\n",
846 | " \n",
847 | " | 4 | \n",
848 | " lemoore_temperature | \n",
849 | " 0.050826 | \n",
850 | "
\n",
851 | " \n",
852 | " | 5 | \n",
853 | " lemoore_relative_humidity | \n",
854 | " 0.017357 | \n",
855 | "
\n",
856 | " \n",
857 | " | 6 | \n",
858 | " Day | \n",
859 | " 0.011032 | \n",
860 | "
\n",
861 | " \n",
862 | " | 7 | \n",
863 | " Week | \n",
864 | " 0.007940 | \n",
865 | "
\n",
866 | " \n",
867 | " | 8 | \n",
868 | " san_diego_dew_point | \n",
869 | " 0.007538 | \n",
870 | "
\n",
871 | " \n",
872 | " | 9 | \n",
873 | " los_angeles_temperature | \n",
874 | " 0.004585 | \n",
875 | "
\n",
876 | " \n",
877 | " | 10 | \n",
878 | " chico_relative_humidity | \n",
879 | " 0.003697 | \n",
880 | "
\n",
881 | " \n",
882 | " | 11 | \n",
883 | " san_diego_temperature | \n",
884 | " 0.003477 | \n",
885 | "
\n",
886 | " \n",
887 | " | 12 | \n",
888 | " san_jose_wind_speed | \n",
889 | " 0.003111 | \n",
890 | "
\n",
891 | " \n",
892 | " | 13 | \n",
893 | " chico_temperature | \n",
894 | " 0.002460 | \n",
895 | "
\n",
896 | " \n",
897 | " | 14 | \n",
898 | " Month | \n",
899 | " 0.002422 | \n",
900 | "
\n",
901 | " \n",
902 | " | 15 | \n",
903 | " sacramento_relative_humidity | \n",
904 | " 0.002137 | \n",
905 | "
\n",
906 | " \n",
907 | " | 16 | \n",
908 | " san_diego_wind_speed | \n",
909 | " 0.002021 | \n",
910 | "
\n",
911 | " \n",
912 | " | 17 | \n",
913 | " los_angeles_dew_point | \n",
914 | " 0.001585 | \n",
915 | "
\n",
916 | " \n",
917 | " | 18 | \n",
918 | " chico_wind_speed | \n",
919 | " 0.001463 | \n",
920 | "
\n",
921 | " \n",
922 | " | 19 | \n",
923 | " roseville_relative_humidity | \n",
924 | " 0.001314 | \n",
925 | "
\n",
926 | " \n",
927 | " | 20 | \n",
928 | " roseville_temperature | \n",
929 | " 0.001256 | \n",
930 | "
\n",
931 | " \n",
932 | " | 21 | \n",
933 | " los_angeles_wind_speed | \n",
934 | " 0.000866 | \n",
935 | "
\n",
936 | " \n",
937 | " | 22 | \n",
938 | " roseville_dew_point | \n",
939 | " 0.000857 | \n",
940 | "
\n",
941 | " \n",
942 | " | 23 | \n",
943 | " lemoore_wind_speed | \n",
944 | " 0.000835 | \n",
945 | "
\n",
946 | " \n",
947 | " | 24 | \n",
948 | " los_angeles_relative_humidity | \n",
949 | " 0.000792 | \n",
950 | "
\n",
951 | " \n",
952 | " | 25 | \n",
953 | " san_francisco_dew_point | \n",
954 | " 0.000719 | \n",
955 | "
\n",
956 | " \n",
957 | " | 26 | \n",
958 | " san_jose_dew_point | \n",
959 | " 0.000712 | \n",
960 | "
\n",
961 | " \n",
962 | " | 27 | \n",
963 | " petaluma_wind_speed | \n",
964 | " 0.000608 | \n",
965 | "
\n",
966 | " \n",
967 | " | 28 | \n",
968 | " petaluma_temperature | \n",
969 | " 0.000580 | \n",
970 | "
\n",
971 | " \n",
972 | " | 29 | \n",
973 | " san_diego_relative_humidity | \n",
974 | " 0.000421 | \n",
975 | "
\n",
976 | " \n",
977 | " | 30 | \n",
978 | " lemoore_dew_point | \n",
979 | " 0.000414 | \n",
980 | "
\n",
981 | " \n",
982 | " | 31 | \n",
983 | " san_jose_cloud_type | \n",
984 | " 0.000412 | \n",
985 | "
\n",
986 | " \n",
987 | " | 32 | \n",
988 | " san_jose_temperature | \n",
989 | " 0.000279 | \n",
990 | "
\n",
991 | " \n",
992 | " | 33 | \n",
993 | " san_francisco_relative_humidity | \n",
994 | " 0.000256 | \n",
995 | "
\n",
996 | " \n",
997 | " | 34 | \n",
998 | " chico_dew_point | \n",
999 | " 0.000240 | \n",
1000 | "
\n",
1001 | " \n",
1002 | " | 35 | \n",
1003 | " sacramento_dew_point | \n",
1004 | " 0.000231 | \n",
1005 | "
\n",
1006 | " \n",
1007 | " | 36 | \n",
1008 | " petaluma_relative_humidity | \n",
1009 | " 0.000183 | \n",
1010 | "
\n",
1011 | " \n",
1012 | " | 37 | \n",
1013 | " san_jose_relative_humidity | \n",
1014 | " 0.000178 | \n",
1015 | "
\n",
1016 | " \n",
1017 | " | 38 | \n",
1018 | " roseville_cloud_type | \n",
1019 | " 0.000172 | \n",
1020 | "
\n",
1021 | " \n",
1022 | " | 39 | \n",
1023 | " fresno_wind_speed | \n",
1024 | " 0.000000 | \n",
1025 | "
\n",
1026 | " \n",
1027 | " | 40 | \n",
1028 | " petaluma_cloud_type | \n",
1029 | " 0.000000 | \n",
1030 | "
\n",
1031 | " \n",
1032 | " | 41 | \n",
1033 | " petaluma_dew_point | \n",
1034 | " 0.000000 | \n",
1035 | "
\n",
1036 | " \n",
1037 | " | 42 | \n",
1038 | " fresno_dew_point | \n",
1039 | " 0.000000 | \n",
1040 | "
\n",
1041 | " \n",
1042 | " | 43 | \n",
1043 | " lemoore_cloud_type | \n",
1044 | " 0.000000 | \n",
1045 | "
\n",
1046 | " \n",
1047 | " | 44 | \n",
1048 | " fresno_cloud_type | \n",
1049 | " 0.000000 | \n",
1050 | "
\n",
1051 | " \n",
1052 | " | 45 | \n",
1053 | " san_francisco_cloud_type | \n",
1054 | " 0.000000 | \n",
1055 | "
\n",
1056 | " \n",
1057 | " | 46 | \n",
1058 | " san_francisco_temperature | \n",
1059 | " 0.000000 | \n",
1060 | "
\n",
1061 | " \n",
1062 | " | 47 | \n",
1063 | " chico_cloud_type | \n",
1064 | " 0.000000 | \n",
1065 | "
\n",
1066 | " \n",
1067 | " | 48 | \n",
1068 | " roseville_wind_speed | \n",
1069 | " 0.000000 | \n",
1070 | "
\n",
1071 | " \n",
1072 | " | 49 | \n",
1073 | " san_francisco_wind_speed | \n",
1074 | " 0.000000 | \n",
1075 | "
\n",
1076 | " \n",
1077 | " | 50 | \n",
1078 | " sacramento_temperature | \n",
1079 | " 0.000000 | \n",
1080 | "
\n",
1081 | " \n",
1082 | " | 51 | \n",
1083 | " sacramento_cloud_type | \n",
1084 | " 0.000000 | \n",
1085 | "
\n",
1086 | " \n",
1087 | " | 52 | \n",
1088 | " san_diego_cloud_type | \n",
1089 | " 0.000000 | \n",
1090 | "
\n",
1091 | " \n",
1092 | " | 53 | \n",
1093 | " sacramento_wind_speed | \n",
1094 | " 0.000000 | \n",
1095 | "
\n",
1096 | " \n",
1097 | " | 54 | \n",
1098 | " los_angeles_cloud_type | \n",
1099 | " 0.000000 | \n",
1100 | "
\n",
1101 | " \n",
1102 | " | 55 | \n",
1103 | " Year | \n",
1104 | " 0.000000 | \n",
1105 | "
\n",
1106 | " \n",
1107 | "
\n",
1108 | "
"
1109 | ],
1110 | "text/plain": [
1111 | " Feature Importance\n",
1112 | "0 fresno_relative_humidity 0.481964\n",
1113 | "1 Hour 0.267807\n",
1114 | "2 Day_of_week 0.062052\n",
1115 | "3 fresno_temperature 0.055203\n",
1116 | "4 lemoore_temperature 0.050826\n",
1117 | "5 lemoore_relative_humidity 0.017357\n",
1118 | "6 Day 0.011032\n",
1119 | "7 Week 0.007940\n",
1120 | "8 san_diego_dew_point 0.007538\n",
1121 | "9 los_angeles_temperature 0.004585\n",
1122 | "10 chico_relative_humidity 0.003697\n",
1123 | "11 san_diego_temperature 0.003477\n",
1124 | "12 san_jose_wind_speed 0.003111\n",
1125 | "13 chico_temperature 0.002460\n",
1126 | "14 Month 0.002422\n",
1127 | "15 sacramento_relative_humidity 0.002137\n",
1128 | "16 san_diego_wind_speed 0.002021\n",
1129 | "17 los_angeles_dew_point 0.001585\n",
1130 | "18 chico_wind_speed 0.001463\n",
1131 | "19 roseville_relative_humidity 0.001314\n",
1132 | "20 roseville_temperature 0.001256\n",
1133 | "21 los_angeles_wind_speed 0.000866\n",
1134 | "22 roseville_dew_point 0.000857\n",
1135 | "23 lemoore_wind_speed 0.000835\n",
1136 | "24 los_angeles_relative_humidity 0.000792\n",
1137 | "25 san_francisco_dew_point 0.000719\n",
1138 | "26 san_jose_dew_point 0.000712\n",
1139 | "27 petaluma_wind_speed 0.000608\n",
1140 | "28 petaluma_temperature 0.000580\n",
1141 | "29 san_diego_relative_humidity 0.000421\n",
1142 | "30 lemoore_dew_point 0.000414\n",
1143 | "31 san_jose_cloud_type 0.000412\n",
1144 | "32 san_jose_temperature 0.000279\n",
1145 | "33 san_francisco_relative_humidity 0.000256\n",
1146 | "34 chico_dew_point 0.000240\n",
1147 | "35 sacramento_dew_point 0.000231\n",
1148 | "36 petaluma_relative_humidity 0.000183\n",
1149 | "37 san_jose_relative_humidity 0.000178\n",
1150 | "38 roseville_cloud_type 0.000172\n",
1151 | "39 fresno_wind_speed 0.000000\n",
1152 | "40 petaluma_cloud_type 0.000000\n",
1153 | "41 petaluma_dew_point 0.000000\n",
1154 | "42 fresno_dew_point 0.000000\n",
1155 | "43 lemoore_cloud_type 0.000000\n",
1156 | "44 fresno_cloud_type 0.000000\n",
1157 | "45 san_francisco_cloud_type 0.000000\n",
1158 | "46 san_francisco_temperature 0.000000\n",
1159 | "47 chico_cloud_type 0.000000\n",
1160 | "48 roseville_wind_speed 0.000000\n",
1161 | "49 san_francisco_wind_speed 0.000000\n",
1162 | "50 sacramento_temperature 0.000000\n",
1163 | "51 sacramento_cloud_type 0.000000\n",
1164 | "52 san_diego_cloud_type 0.000000\n",
1165 | "53 sacramento_wind_speed 0.000000\n",
1166 | "54 los_angeles_cloud_type 0.000000\n",
1167 | "55 Year 0.000000"
1168 | ]
1169 | },
1170 | "execution_count": 47,
1171 | "metadata": {},
1172 | "output_type": "execute_result"
1173 | }
1174 | ],
1175 | "source": [
1176 | "#Decision tree importances\n",
1177 | "pd.DataFrame({'Feature':final_data_X_train.columns, 'Importance': tree.feature_importances_}).sort_values(by = 'Importance', ascending = False).reset_index(drop=True)"
1178 | ]
1179 | },
1180 | {
1181 | "cell_type": "code",
1182 | "execution_count": 64,
1183 | "metadata": {},
1184 | "outputs": [
1185 | {
1186 | "data": {
1187 | "image/png": "\n",
1188 | "text/plain": [
1189 | ""
1190 | ]
1191 | },
1192 | "metadata": {
1193 | "needs_background": "light"
1194 | },
1195 | "output_type": "display_data"
1196 | }
1197 | ],
1198 | "source": [
1199 | "#Decision tree importances\n",
1200 | "feature_importance = tree.feature_importances_\n",
1201 | "mod_cols_x = [sub.replace('_', ' ') for sub in final_data_X_train.columns]\n",
1202 | "\n",
1203 | "gb_feat = pd.DataFrame({'feature':mod_cols_x, 'importance':feature_importance})\n",
1204 | "gb_feat = gb_feat.sort_values(by='importance', ascending=False).head(20)\n",
1205 | "\n",
1206 | "plt.figure(figsize=(8, 7.5))\n",
1207 | "plt.barh(width=gb_feat.importance, y=gb_feat.feature);"
1208 | ]
1209 | },
1210 | {
1211 | "cell_type": "code",
1212 | "execution_count": 21,
1213 | "metadata": {},
1214 | "outputs": [
1215 | {
1216 | "data": {
1217 | "text/plain": [
1218 | "array([ 5, 3, 4, 54, 10, 9, 55, 2, 45, 24, 29, 19, 43, 1, 8, 32, 13,\n",
1219 | " 38, 44, 40, 14, 22, 30, 17, 25, 42, 50, 18, 35, 33, 48, 23, 28, 39,\n",
1220 | " 12, 7, 47, 52, 20, 15, 27, 37, 49, 34, 46, 11, 53, 36, 21, 16, 6,\n",
1221 | " 51, 41, 31, 26, 0], dtype=int64)"
1222 | ]
1223 | },
1224 | "execution_count": 21,
1225 | "metadata": {},
1226 | "output_type": "execute_result"
1227 | }
1228 | ],
1229 | "source": [
1230 | "# Random forest importances: rank the features, then report and plot the top ones\n",
1231 | "\n",
1232 | "importances = rf_tree.feature_importances_\n",
1233 | "std = np.std([est.feature_importances_ for est in rf_tree.estimators_],\n",
1234 | "             axis=0)\n",
1235 | "indices = np.argsort(importances)[::-1]\n",
1236 | "\n",
1237 | "top_n = min(11, len(indices))  # number of top-ranked features to print and plot\n",
1238 | "\n",
1239 | "# Print the feature ranking\n",
1240 | "print(\"Feature ranking:\")\n",
1241 | "\n",
1242 | "for f in range(top_n):\n",
1243 | "    print(\"%d. %s (%f)\" % (f, final_data_X_train.columns[indices[f]], importances[indices[f]]))\n",
1244 | "\n",
1245 | "# Plot only the top-ranked features so heights and error bars match the x positions\n",
1246 | "plt.figure()\n",
1247 | "plt.title(\"Feature importances\")\n",
1248 | "plt.bar(range(top_n), importances[indices[:top_n]],\n",
1249 | "        color=\"r\", yerr=std[indices[:top_n]], align=\"center\")\n",
1250 | "plt.show()"
1251 | ]
1252 | },
1253 | {
1254 | "cell_type": "code",
1255 | "execution_count": 52,
1256 | "metadata": {},
1257 | "outputs": [],
1258 | "source": [
1259 | "gb_feat = gb_feat.sort_values(by='importance', ascending= False).head(20)"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 53,
1265 | "metadata": {},
1266 | "outputs": [
1267 | {
1268 | "data": {
1269 | "image/png": "\n",
1270 | "text/plain": [
1271 | ""
1272 | ]
1273 | },
1274 | "metadata": {
1275 | "needs_background": "light"
1276 | },
1277 | "output_type": "display_data"
1278 | }
1279 | ],
1280 | "source": [
1281 | "plt.figure(figsize=(8, 7.5))\n",
1282 | "plt.barh(width=gb_feat.importance, y=gb_feat.feature);"
1283 | ]
1284 | },
1285 | {
1286 | "cell_type": "code",
1287 | "execution_count": 34,
1288 | "metadata": {},
1289 | "outputs": [
1290 | {
1291 | "data": {
1292 | "text/html": [
1293 | "\n",
1294 | "\n",
1307 | "
\n",
1308 | " \n",
1309 | " \n",
1310 | " | \n",
1311 | " feature | \n",
1312 | " importance | \n",
1313 | "
\n",
1314 | " \n",
1315 | " \n",
1316 | " \n",
1317 | " | 5 | \n",
1318 | " fresno_relative_humidity | \n",
1319 | " 0.449245 | \n",
1320 | "
\n",
1321 | " \n",
1322 | " | 3 | \n",
1323 | " Hour | \n",
1324 | " 0.260940 | \n",
1325 | "
\n",
1326 | " \n",
1327 | " | 4 | \n",
1328 | " fresno_temperature | \n",
1329 | " 0.097121 | \n",
1330 | "
\n",
1331 | " \n",
1332 | " | 54 | \n",
1333 | " Day_of_week | \n",
1334 | " 0.059365 | \n",
1335 | "
\n",
1336 | " \n",
1337 | " | 10 | \n",
1338 | " chico_relative_humidity | \n",
1339 | " 0.017765 | \n",
1340 | "
\n",
1341 | " \n",
1342 | " | 9 | \n",
1343 | " chico_temperature | \n",
1344 | " 0.017347 | \n",
1345 | "
\n",
1346 | " \n",
1347 | " | 55 | \n",
1348 | " Week | \n",
1349 | " 0.012169 | \n",
1350 | "
\n",
1351 | " \n",
1352 | " | 2 | \n",
1353 | " Day | \n",
1354 | " 0.011900 | \n",
1355 | "
\n",
1356 | " \n",
1357 | " | 45 | \n",
1358 | " redding_relative_humidity | \n",
1359 | " 0.006338 | \n",
1360 | "
\n",
1361 | " \n",
1362 | " | 24 | \n",
1363 | " ontario_temperature | \n",
1364 | " 0.004155 | \n",
1365 | "
\n",
1366 | " \n",
1367 | " | 29 | \n",
1368 | " moreno_valley_temperature | \n",
1369 | " 0.003867 | \n",
1370 | "
\n",
1371 | " \n",
1372 | " | 19 | \n",
1373 | " anaheim_temperature | \n",
1374 | " 0.003583 | \n",
1375 | "
\n",
1376 | " \n",
1377 | " | 43 | \n",
1378 | " concord_wind_speed | \n",
1379 | " 0.003421 | \n",
1380 | "
\n",
1381 | " \n",
1382 | " | 1 | \n",
1383 | " Month | \n",
1384 | " 0.003416 | \n",
1385 | "
\n",
1386 | " \n",
1387 | " | 8 | \n",
1388 | " fresno_wind_speed | \n",
1389 | " 0.002980 | \n",
1390 | "
\n",
1391 | " \n",
1392 | " | 32 | \n",
1393 | " moreno_valley_dew_point | \n",
1394 | " 0.002770 | \n",
1395 | "
\n",
1396 | " \n",
1397 | " | 13 | \n",
1398 | " chico_wind_speed | \n",
1399 | " 0.002438 | \n",
1400 | "
\n",
1401 | " \n",
1402 | " | 38 | \n",
1403 | " san_francisco_wind_speed | \n",
1404 | " 0.002303 | \n",
1405 | "
\n",
1406 | " \n",
1407 | " | 44 | \n",
1408 | " redding_temperature | \n",
1409 | " 0.002288 | \n",
1410 | "
\n",
1411 | " \n",
1412 | " | 40 | \n",
1413 | " concord_relative_humidity | \n",
1414 | " 0.002004 | \n",
1415 | "
\n",
1416 | " \n",
1417 | "
\n",
1418 | "
"
1419 | ],
1420 | "text/plain": [
1421 | " feature importance\n",
1422 | "5 fresno_relative_humidity 0.449245\n",
1423 | "3 Hour 0.260940\n",
1424 | "4 fresno_temperature 0.097121\n",
1425 | "54 Day_of_week 0.059365\n",
1426 | "10 chico_relative_humidity 0.017765\n",
1427 | "9 chico_temperature 0.017347\n",
1428 | "55 Week 0.012169\n",
1429 | "2 Day 0.011900\n",
1430 | "45 redding_relative_humidity 0.006338\n",
1431 | "24 ontario_temperature 0.004155\n",
1432 | "29 moreno_valley_temperature 0.003867\n",
1433 | "19 anaheim_temperature 0.003583\n",
1434 | "43 concord_wind_speed 0.003421\n",
1435 | "1 Month 0.003416\n",
1436 | "8 fresno_wind_speed 0.002980\n",
1437 | "32 moreno_valley_dew_point 0.002770\n",
1438 | "13 chico_wind_speed 0.002438\n",
1439 | "38 san_francisco_wind_speed 0.002303\n",
1440 | "44 redding_temperature 0.002288\n",
1441 | "40 concord_relative_humidity 0.002004"
1442 | ]
1443 | },
1444 | "execution_count": 34,
1445 | "metadata": {},
1446 | "output_type": "execute_result"
1447 | }
1448 | ],
1449 | "source": [
1450 | "gb_feat  # display the top-20 feature-importance table"
1451 | ]
1452 | },
1453 | {
1454 | "cell_type": "code",
1455 | "execution_count": 82,
1456 | "metadata": {},
1457 | "outputs": [
1458 | {
1459 | "name": "stdout",
1460 | "output_type": "stream",
1461 | "text": [
1462 | "Feature ranking:\n",
1463 | "0. fresno_relative_humidity (0.373411)\n",
1464 | "1. Hour (0.255585)\n",
1465 | "2. lemoore_temperature (0.111071)\n",
1466 | "3. fresno_temperature (0.063811)\n",
1467 | "4. Day_of_week (0.059408)\n",
1468 | "5. lemoore_relative_humidity (0.021034)\n",
1469 | "6. chico_relative_humidity (0.015598)\n",
1470 | "7. Week (0.012467)\n",
1471 | "8. Day (0.011358)\n",
1472 | "9. chico_temperature (0.006728)\n",
1473 | "10. san_diego_dew_point (0.005414)\n",
1474 | "11. san_diego_temperature (0.005129)\n",
1475 | "12. los_angeles_temperature (0.003860)\n",
1476 | "13. roseville_relative_humidity (0.003626)\n",
1477 | "14. Month (0.003414)\n",
1478 | "15. los_angeles_wind_speed (0.003391)\n",
1479 | "16. sacramento_relative_humidity (0.003040)\n",
1480 | "17. chico_wind_speed (0.002434)\n",
1481 | "18. san_diego_wind_speed (0.002428)\n",
1482 | "19. fresno_wind_speed (0.002338)\n",
1483 | "20. los_angeles_dew_point (0.002248)\n",
1484 | "21. roseville_temperature (0.002050)\n",
1485 | "22. san_diego_relative_humidity (0.001976)\n",
1486 | "23. san_jose_wind_speed (0.001913)\n",
1487 | "24. los_angeles_relative_humidity (0.001855)\n",
1488 | "25. lemoore_wind_speed (0.001718)\n",
1489 | "26. sacramento_temperature (0.001360)\n",
1490 | "27. petaluma_relative_humidity (0.001332)\n",
1491 | "28. san_jose_relative_humidity (0.001241)\n",
1492 | "29. petaluma_dew_point (0.001182)\n",
1493 | "30. san_francisco_relative_humidity (0.001125)\n",
1494 | "31. san_francisco_wind_speed (0.001094)\n",
1495 | "32. lemoore_dew_point (0.001045)\n",
1496 | "33. chico_dew_point (0.001042)\n",
1497 | "34. petaluma_wind_speed (0.001038)\n",
1498 | "35. roseville_dew_point (0.001030)\n",
1499 | "36. sacramento_wind_speed (0.000986)\n",
1500 | "37. san_jose_dew_point (0.000977)\n",
1501 | "38. fresno_dew_point (0.000958)\n",
1502 | "39. sacramento_dew_point (0.000957)\n",
1503 | "40. roseville_wind_speed (0.000861)\n",
1504 | "41. san_jose_temperature (0.000822)\n",
1505 | "42. san_francisco_dew_point (0.000719)\n",
1506 | "43. san_francisco_temperature (0.000588)\n",
1507 | "44. petaluma_temperature (0.000514)\n",
1508 | "45. los_angeles_cloud_type (0.000450)\n",
1509 | "46. san_francisco_cloud_type (0.000431)\n",
1510 | "47. chico_cloud_type (0.000417)\n",
1511 | "48. sacramento_cloud_type (0.000407)\n",
1512 | "49. san_diego_cloud_type (0.000395)\n",
1513 | "50. roseville_cloud_type (0.000379)\n",
1514 | "51. petaluma_cloud_type (0.000360)\n",
1515 | "52. san_jose_cloud_type (0.000345)\n",
1516 | "53. lemoore_cloud_type (0.000324)\n",
1517 | "54. fresno_cloud_type (0.000314)\n",
1518 | "55. Year (0.000005)\n"
1519 | ]
1520 | },
1521 | {
1522 | "data": {
1523 | "image/png": "\n",
1524 | "text/plain": [
1525 | ""
1526 | ]
1527 | },
1528 | "metadata": {
1529 | "needs_background": "light"
1530 | },
1531 | "output_type": "display_data"
1532 | }
1533 | ],
1534 | "source": [
1535 | "# Random forest importances: full ranking with per-tree standard deviation as error bars\n",
1536 | "importances = rf_tree.feature_importances_\n",
1537 | "std = np.std([est.feature_importances_ for est in rf_tree.estimators_],\n",
1538 | "             axis=0)\n",
1539 | "indices = np.argsort(importances)[::-1]\n",
1540 | "\n",
1541 | "# Print the feature ranking\n",
1542 | "print(\"Feature ranking:\")\n",
1543 | "\n",
1544 | "for f in range(final_data_X_train.shape[1]):\n",
1545 | "    print(\"%d. %s (%f)\" % (f, final_data_X_train.columns[indices[f]], importances[indices[f]]))\n",
1546 | "\n",
1547 | "# Plot the forest's importances; set the default size first so it applies to this figure\n",
1548 | "plt.rcParams['figure.figsize'] = [20, 10]\n",
1549 | "plt.figure()\n",
1550 | "plt.title(\"Feature Importances Random Forest (Two-Stage Clustering) with Error Bars\")\n",
1551 | "plt.bar(range(final_data_X_train.shape[1]), importances[indices],\n",
1552 | "        color=\"r\", yerr=std[indices], align=\"center\")\n",
1553 | "plt.savefig(\"C:/Users/zohai/OneDrive/桌面/Comp 152/Final Project Material/Code/Data Folder/features_importances.png\")"
1554 | ]
1555 | },
1556 | {
1557 | "cell_type": "code",
1558 | "execution_count": null,
1559 | "metadata": {},
1560 | "outputs": [],
1561 | "source": []
1562 | },
1563 | {
1564 | "cell_type": "code",
1565 | "execution_count": null,
1566 | "metadata": {},
1567 | "outputs": [],
1568 | "source": []
1569 | }
1570 | ],
1571 | "metadata": {
1572 | "kernelspec": {
1573 | "display_name": "Python 3",
1574 | "language": "python",
1575 | "name": "python3"
1576 | },
1577 | "language_info": {
1578 | "codemirror_mode": {
1579 | "name": "ipython",
1580 | "version": 3
1581 | },
1582 | "file_extension": ".py",
1583 | "mimetype": "text/x-python",
1584 | "name": "python",
1585 | "nbconvert_exporter": "python",
1586 | "pygments_lexer": "ipython3",
1587 | "version": "3.7.4"
1588 | }
1589 | },
1590 | "nbformat": 4,
1591 | "nbformat_minor": 2
1592 | }
1593 |
--------------------------------------------------------------------------------