├── .gitattributes
├── code
├── csv
└── readme

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------

/code:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
# GridSearchCV moved from the removed sklearn.grid_search into sklearn.model_selection
from sklearn.model_selection import GridSearchCV
import seaborn as sns

################################### Data loading and preprocessing ###############################################
# `sheet_name` replaces the long-deprecated `sheetname` argument; index_col=0 assumes
# the first column of the weather sheet holds the YMD dates used as the index below.
data = pd.read_excel('modeling_data.xlsx', sheet_name='Area1_Load')
features = pd.read_excel('modeling_data.xlsx', sheet_name='Area1_Weather', index_col=0)
## Forward-fill missing feature values along the time axis
features = features.ffill()
print(features.describe())
## Standard scalers for the features and the load
scaler_fea = StandardScaler()
scaler_load = StandardScaler()
## Start date of the historical load
start = features.index[0]
## Keep only load records that have weather features
data = data[data['YMD'] >= start]
## Drop the empty rows padded in for the forecast horizon
data = data.dropna(how='any', axis=0)
## End date of the historical load
end = data.iloc[-1, 0]
## Build a date / daily-load DataFrame
load = pd.DataFrame()
info = np.array(data.iloc[:, 1:])
## Sum the intraday readings into a daily load value
load_data = info.sum(axis=1)
load_data = np.reshape(load_data, (-1, 1))
load['time'] = data['YMD']
load['daily_load'] = load_data
# Daily load trend plot
# x_axis=[i for i in range(len(load['time']))]
# plt.scatter(x_axis,load['daily_load'])
# plt.show()

######################################## Analysis of data relationships ##############################################
# trainset=features[features.index<=end].copy()
# trainset['d_load']=load_data
# trainset.columns=['year','month','week','H_tem','L_tem','Avg_tem','Hum','Pre','d_load']
# year
# Trend within each year
# year2012=trainset.loc[trainset['year']==2012,['d_load']]
# year2013=trainset.loc[trainset['year']==2013,['d_load']]
# year2014=trainset.loc[trainset['year']==2014,['d_load']]
# plt.subplot(311)
# plt.plot([i for i in range(len(year2012))],year2012,'-*g',label='year 2012')
# plt.subplot(312)
# plt.plot([i for i in range(len(year2013))],year2013,'-*b',label='year 2013')
# plt.subplot(313)
# plt.plot([i for i in range(len(year2014))],year2014,'-*k',label='year 2014')
# # Average daily load per year
# year_avg_load=trainset[['year','d_load']].groupby('year').mean()
# year_avg_load.plot.bar()
# plt.title('year-avg_d_load')
# plt.xlabel('year')
# plt.ylabel('load')

# month
# year2012=trainset.loc[trainset['year']==2012,['d_load','month']]
# for i in range(1,13):
#     month=year2012.loc[year2012['month']==i,['d_load']]
#     plt.figure()
#     plt.plot([j for j in range(len(month))],month)
#     plt.show()
# # Average daily load per month
# month_avg_load=trainset[['month','d_load','year']].groupby(['year','month']).mean()
# month_avg_load.plot.bar()
# plt.title('month-avg_d_load')
# plt.xlabel('month')
# plt.ylabel('load')

# week
# trainset[['week','d_load']].groupby('week').mean().plot.bar()

# Average temperature (clear positive correlation with load)
# trainset['Avg_tem_cut']=pd.qcut(trainset['Avg_tem'],3)
# trainset['Avg_tem_cut']=pd.factorize(trainset['Avg_tem_cut'])[0]
# trainset[['Avg_tem_cut','d_load']].groupby('Avg_tem_cut').mean().plot.bar()

# Relative humidity (little effect)
# trainset.describe()
# trainset['Hum_cut']=pd.qcut(trainset['Hum'],6)
# trainset['Hum_cut']=pd.factorize(trainset['Hum_cut'])[0]
# trainset[['Hum_cut','d_load']].groupby('Hum_cut').mean().plot.bar()

# Precipitation (little effect)
# print(trainset.describe())
# pre_cut=[]
# for pre in trainset['Pre']:
#     if pre<=1:
#         pre_cut.append(0)
#     elif pre<=50:
#         pre_cut.append(1)
#     else:
#         pre_cut.append(2)
# trainset['Pre_cut']=pre_cut
# trainset[['Pre_cut','d_load']].groupby('Pre_cut').mean().plot.bar()
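
# --- Illustrative aside (not part of the original script) -----------------------
# The commented analysis above leans on pd.qcut + pd.factorize to turn a
# continuous series into ordinal bin labels. A minimal sketch on toy data:
demo_s = pd.Series([1.0, 2.5, 7.0, 9.0, 3.3, 8.1])
demo_bins = pd.qcut(demo_s, 3)            # three equal-frequency intervals
demo_labels = pd.factorize(demo_bins)[0]  # -> array([0, 0, 1, 2, 1, 2])
# ---------------------------------------------------------------------------------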

############################################# Feature engineering and processing ###########################################
# Add year / month / day-of-week attributes
year = []; month = []; week = []; time = list(features.index)
for i in range(len(time)):
    t = str(time[i])
    t_year = t[:4]
    t_month = t[-4:-2]
    ## Encode the day of week, upweighting weekends with larger codes
    t_week = datetime.strptime(t, '%Y%m%d').weekday() + 1
    if t_week == 6:
        t_week = 10
    if t_week == 7:
        t_week = 15
    year.append(int(t_year))
    month.append(int(t_month))
    week.append(t_week)
features.insert(0, 'year', year)
features.insert(1, 'month', month)
features.insert(2, 'week', week)
features.columns = ['year', 'month', 'week', 'H_tem', 'L_tem', 'Avg_tem', 'Hum', 'Pre']
# Following the analysis above, bucket precipitation into 3 classes
for i in range(len(features['Pre'])):
    if features.iloc[i, -1] <= 10:
        features.iloc[i, -1] = 0
    elif features.iloc[i, -1] >= 50:
        features.iloc[i, -1] = 5
    else:
        features.iloc[i, -1] = 2
# Standardize only the continuous columns, i.e. skip year/month/week and the
# bucketed precipitation in the last column
features = np.array(features)
features[:, 3:-1] = scaler_fea.fit_transform(features[:, 3:-1])
all_load = np.reshape(np.array(load['daily_load']), (-1, 1))
all_load = scaler_load.fit_transform(all_load)
# Standardize the forecast inputs with the fitted scaler
# features[:,2:-1]=(features[:,2:-1]-scaler_fea.mean_)/np.sqrt(scaler_fea.var_)
# Split historical and forecast inputs/outputs
pre_day = 10
begin = len(load_data) - pre_day
over = len(load_data)
his_x = features[:begin, :]
his_load = all_load[:begin, :].ravel()  # 1-D target avoids sklearn shape warnings
print(his_x.shape)
pre_x = features[begin:over, :]
# pre_load=np.zeros((pre_x.shape[0],1))
actual_load = load_data[-pre_day:, :]
print(pre_x.shape)
# Save the preprocessed data to a local csv file
csv = np.concatenate((features[:over, :], load_data), axis=1)
csv = pd.DataFrame(csv)
csv.columns = ['year', 'month', 'week', 'H_tem', 'L_tem', 'Avg_tem', 'Hum', 'Pre', 'Daily_load']
csv.to_csv('modeling_load.csv', index=None)

##################################### Feature attribute analysis #######################################
# Mutual-information weight analysis
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
# k equals the number of inputs, so nothing is dropped here; the fit is only
# used to read off the mutual-information scores
model = SelectKBest(mutual_info_regression, k=8)
his_x = model.fit_transform(his_x, his_load)
weight = list(model.scores_)
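
# --- Illustrative aside (not part of the original script) -----------------------
# A quick way to read the mutual-information weights computed above: rank them
# against the input column names (the first eight columns of `csv`).
mi_ranking = sorted(zip(weight, csv.columns[:-1]), reverse=True)
for score, name in mi_ranking:
    print(f'MI {name}: {score:.4f}')
# ---------------------------------------------------------------------------------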

# Pearson correlation analysis
correlation = csv.copy()
correlation = correlation.astype(float)
corr = correlation.corr()
# sns.heatmap(corr,annot=True)


############################################ Model fusion and testing ########################################
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

feature_names = corr.columns[:-1]
print(type(feature_names))
# RFR (random forest)
rf_param = {'n_estimators': [50, 60, 70, 80, 90, 100], 'min_samples_split': [3, 4, 5],
            'min_samples_leaf': [2, 3, 4], 'max_depth': [4]}
rf = RandomForestRegressor()
rf_grid = GridSearchCV(rf, param_grid=rf_param, scoring='neg_mean_squared_error')
rf_grid.fit(his_x, his_load)
rf_importance = rf_grid.best_estimator_.feature_importances_
rf_importance = list(sorted(zip(rf_importance, feature_names), reverse=True))[:-1]  # drop the least important feature
rf_feature = [i[1] for i in rf_importance]
rf_best_param = rf_grid.best_params_

# ADB (AdaBoost)
adb_param = {'n_estimators': [i for i in range(80, 200, 10)], 'learning_rate': list(np.linspace(0.01, 1, 10))}
adb = AdaBoostRegressor(loss='square')
adb_grid = GridSearchCV(adb, param_grid=adb_param, scoring='neg_mean_squared_error')
adb_grid.fit(his_x, his_load)
adb_importance = adb_grid.best_estimator_.feature_importances_
adb_importance = list(sorted(zip(adb_importance, feature_names), reverse=True))[:-1]
adb_feature = [i[1] for i in adb_importance]
adb_best_param = adb_grid.best_params_

# GBR (gradient boosting; 'squared_error' is the current name of the old 'ls' loss)
gbr = GradientBoostingRegressor(loss='squared_error')
gbr_param = {'n_estimators': [i for i in range(100, 200, 10)], 'max_features': [3, 4, 5, 6],
             'learning_rate': list(np.linspace(0.01, 1, 10)), 'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [2, 3, 4, 5]}
gbr_grid = GridSearchCV(gbr, param_grid=gbr_param, scoring='neg_mean_squared_error')
gbr_grid.fit(his_x, his_load)
gbr_importance = gbr_grid.best_estimator_.feature_importances_
gbr_importance = list(sorted(zip(gbr_importance, feature_names), reverse=True))[:-1]
gbr_feature = [i[1] for i in gbr_importance]
gbr_best_param = gbr_grid.best_params_

# Feature fusion and selection: keep the union of the three models' top features
final_features = list(set(rf_feature + adb_feature + gbr_feature))
his_x, pre_x = pd.DataFrame(his_x), pd.DataFrame(pre_x)
his_x.columns = ['year', 'month', 'week', 'H_tem', 'L_tem', 'Avg_tem', 'Hum', 'Pre']
pre_x.columns = ['year', 'month', 'week', 'H_tem', 'L_tem', 'Avg_tem', 'Hum', 'Pre']
out_features = [item for item in list(his_x.columns) if item not in final_features]
for col in out_features:  # a loop also covers the case where no feature is excluded
    del his_x[col]
    del pre_x[col]
his_x, pre_x = np.array(his_x), np.array(pre_x)



############################################ GSA-CV-SVR #############################################
svr = SVR(kernel='rbf')
C_value = list(np.linspace(0.1, 10, 20))
gamma_value = list(np.linspace(0.1, 5, 20))
svr_param = {'C': C_value, 'gamma': gamma_value}
svr_grid = GridSearchCV(svr, param_grid=svr_param, cv=10)
svr_grid.fit(his_x, his_load)
# Grid-search diagnostics (`cv_results_` replaces the removed `grid_scores_`)
grid_scores = list(svr_grid.cv_results_['mean_test_score'])
best_param = svr_grid.best_params_
best_score = svr_grid.best_score_
print(best_param, best_score)
best_C = best_param['C']
best_gamma = best_param['gamma']
# Refit a regressor with the globally best parameters
# Advanced_svr=SVR(kernel='rbf',C=best_C,gamma=best_gamma)
# Advanced_svr.fit(his_x,his_load)
# Advanced_svr_load=Advanced_svr.predict(pre_x)
# Advanced_svr_load=np.reshape(Advanced_svr_load,(-1,1))  # 2-D for inverse_transform
# Advanced_svr_load=scaler_load.inverse_transform(Advanced_svr_load)
# print(Advanced_svr_load,actual_load)
# pre_result=np.concatenate((Advanced_svr_load,actual_load),axis=1)
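
# --- Illustrative aside (not part of the original script) -----------------------
# Once the commented block above is enabled, `pre_result` holds predicted and
# actual daily loads side by side. A common score for such a 10-day-ahead
# forecast is MAPE; a minimal sketch, assuming aligned arrays in original units:
def mape(actual, pred):
    actual = np.asarray(actual, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    return np.mean(np.abs((actual - pred) / actual)) * 100.0
# e.g. print(mape(pre_result[:, 1], pre_result[:, 0]))
# ---------------------------------------------------------------------------------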

## Contour map to locate the region of the global optimum
def contourmap(C, gamma, grid_scores):
    x = np.array(C)
    y = np.array(gamma)
    gs = [k for k in grid_scores]
    scores = []; score = []
    # Chop the flat score list into rows of len(x) values
    for index, value in enumerate(gs):
        if index > 0 and index % (len(x)) == 0:
            scores.append(score)
            score = []
        score.append(value)
        if index == (len(gs) - 1):
            scores.append(score)
    X, Y = np.meshgrid(x, y)
    plt.contourf(X, Y, scores, 10, alpha=0.5, cmap=plt.cm.hot)
    cs = plt.contour(X, Y, scores, 10, colors='blue', linewidths=0.5)
    plt.clabel(cs, fontsize=20)
    plt.xlabel('C')
    plt.ylabel('gamma')
    plt.title('The contour map of SVR parameters')
    plt.show()
# Draw the contour map
# contourmap(C_value,gamma_value,grid_scores)

# PSO parameter optimization
class PSO(object):
    def __init__(self, population_size, max_steps):
        self.w = 0.6                             # inertia weight
        self.c1 = self.c2 = 2                    # acceleration coefficients
        self.population_size = population_size   # swarm size
        self.dim = 2                             # dimension of the search space (C, gamma)
        self.max_steps = max_steps               # number of iterations
        self.x_bound = [0.1, 5]                  # bounds of the solution space
        self.x = np.random.uniform(self.x_bound[0], self.x_bound[1],
                                   (self.population_size, self.dim))  # initial particle positions
        self.v = np.random.rand(self.population_size, self.dim)       # initial particle velocities
        fitness, best_param = self.calculate_fitness(self.x)
        self.p = self.x                          # best positions seen so far
        self.param = best_param                  # best parameters seen so far
        self.individual_best_fitness = fitness   # best fitness seen so far
        self.fitness_score = []

    ### Objective (fitness) function: cross-validated SVR score over the
    ### C x gamma grid spanned by the particles' coordinates
    def calculate_fitness(self, x):
        param_grid = {'C': list(abs(x[:, 0])), 'gamma': list(abs(x[:, 1]))}
        svr = GridSearchCV(SVR(kernel='rbf'), param_grid, cv=5)
        svr.fit(his_x, his_load)
        grid_scores = svr.best_score_
        best_param = svr.best_params_
        return grid_scores, best_param

    def evolve(self):
        fig = plt.figure()
        for step in range(self.max_steps):
            r1 = np.random.rand(self.population_size, self.dim)
            r2 = np.random.rand(self.population_size, self.dim)
            # Update velocities, then positions
            self.v = self.w * self.v + self.c1 * r1 * (self.p - self.x)
            self.x = self.v + self.x
            # Scatter plot of the particle motion
            plt.clf()
            plt.scatter(self.x[:, 0], self.x[:, 1], s=30, color='k')
            plt.xlabel('Parameter C')
            plt.ylabel('Parameter gamma')
            plt.title('Particle motion scatter plot')
            plt.xlim(self.x_bound[0], self.x_bound[1])
            plt.ylim(self.x_bound[0], self.x_bound[1])
            plt.show()
            plt.pause(0.01)
            fitness, best_param = self.calculate_fitness(self.x)
            # Update the individuals that improved. The source file truncates
            # here; the lines below are a minimal completion. Since
            # calculate_fitness returns the best CV score over the whole swarm,
            # a scalar comparison stands in for the usual per-particle update.
            if fitness > self.individual_best_fitness:
                self.individual_best_fitness = fitness
                self.param = best_param
                self.p = self.x
            self.fitness_score.append(fitness)
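
# --- Illustrative aside (not part of the original script) -----------------------
# A minimal sketch of how the PSO class above might be driven; the population
# size and step count are assumptions, not values from the original script.
# Left commented out because evolve() opens interactive plots and every fitness
# call runs a cross-validated grid search.
# pso = PSO(population_size=20, max_steps=30)
# pso.evolve()
# print('best params:', pso.param, 'best CV score:', pso.individual_best_fitness)
# ---------------------------------------------------------------------------------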