# Repository layout (from the dump header):
#   ├── Get_data.py
#   ├── PM2.5-Prediction-Based-on-Random-Forest-Algorithm-CN.ipynb
#   ├── README.md
#   ├── air.csv
#   ├── air_reg_anlysis.py
#   ├── air_train&test.csv
#   ├── 图片0.png
#   ├── 图片1.png
#   └── 图片8.png
#
# --------------------------------------------------------------------------
# /Get_data.py
# --------------------------------------------------------------------------
"""Scrape the 2018 monthly air-quality tables for Mianyang into a CSV file.

For each month 01..12 the page
``http://www.tianqihoubao.com/aqi/mianyang-2018MM.html`` is downloaded, every
table row after the header row is read, and its first ten cells are appended
to ``air_mianyang_2018.csv`` (one CSV line per table row).
"""
import csv
import time

import requests
from bs4 import BeautifulSoup

headers = {
    # Set your own User-Agent before running.
    'User-Agent': ''
}

URL_TEMPLATE = 'http://www.tianqihoubao.com/aqi/mianyang-2018{month:02d}.html'
OUTPUT_FILE = 'air_mianyang_2018.csv'
# Cells read per row, in order: date, quality grade, AQI, AQI rank,
# PM, PM10, SO2, NO2, CO, O3.
FIELD_COUNT = 10
REQUEST_DELAY_SECONDS = 5
REQUEST_TIMEOUT_SECONDS = 30


def fetch_month_rows(month):
    """Download one month's page and return its data rows.

    Args:
        month: Month number, 1..12; it is zero-padded to two digits in the URL.

    Returns:
        A list of rows, each a list of ``FIELD_COUNT`` whitespace-stripped
        cell strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    url = URL_TEMPLATE.format(month=month)
    response = requests.get(
        url=url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
    )
    # Fail loudly instead of parsing an error page into an empty month.
    response.raise_for_status()
    # NOTE(review): response.text relies on the encoding requests detects;
    # confirm the Chinese quality-grade text decodes correctly for this site.
    soup = BeautifulSoup(response.text, 'html.parser')

    rows = []
    # The first <tr> is the table header, so it is skipped.
    for table_row in soup.find_all('tr')[1:]:
        cells = table_row.find_all('td')
        if len(cells) < FIELD_COUNT:
            # Not a data row (too few cells); indexing it would raise.
            continue
        # Strip every cell, so the pollutant columns are cleaned the same
        # way as the date / grade / AQI / rank columns.
        rows.append([cell.get_text().strip() for cell in cells[:FIELD_COUNT]])
    return rows


def main():
    """Fetch all twelve months and append their rows to ``OUTPUT_FILE``."""
    # Append mode is kept, so rows are added to any existing file.
    with open(OUTPUT_FILE, 'a', encoding='utf-8-sig') as csv_file:
        # lineterminator='\n' keeps the same line endings the hand-built
        # "...\n" writes produced; csv.writer adds quoting when needed.
        writer = csv.writer(csv_file, lineterminator='\n')
        for month in range(1, 13):
            # Pause before every request to avoid hammering the site.
            time.sleep(REQUEST_DELAY_SECONDS)
            writer.writerows(fetch_month_rows(month))


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------
# /README.md
# --------------------------------------------------------------------------
# # PM2.5-Prediction-Based-on-Random-Forest-Algorithm
# Based on Python 3.7.4 and Anaconda 3.0
# Modules needed:
# |-pandas
# |-numpy as np
# |-sklearn
# |-matplotlib
# |-pyecharts
#
# It is important to note that the pyecharts version should be below 1.0.0.
# (README.md, continued)
# Zhihu Link: 基于随机森林算法的PM2.5预测
# Any questions? Please send an e-mail to: zyjy0315@hotmail.com
#
# --------------------------------------------------------------------------
# /air.csv
# --------------------------------------------------------------------------
# https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/air.csv
#
# --------------------------------------------------------------------------
# /air_reg_anlysis.py
# --------------------------------------------------------------------------
"""Random-forest regression on air-quality data.

Pipeline, in order: load and summarise the training table, split it 90/10,
print correlations and draw a scatter-plot matrix, run a randomised
hyper-parameter search, fit a random forest with fixed parameters, report
metrics and feature importances, predict for the cities in ``air.csv``, and
render the predictions on a map.

The ``from pyecharts import Bar/Geo`` imports are the pre-1.0 pyecharts API.
"""
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyecharts import Bar
from pyecharts import Geo
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

DATA_DIR = 'datasets'
TRAIN_FILE = 'air_train&test.csv'
PREDICT_FILE = 'air.csv'
RESULT_FILE = 'result_reg_city.txt'
CSV_ENCODING = 'gb2312'
GRADE_COLUMN = u'质量等级'
# Seed for the forests so repeated runs give the same model.
RANDOM_STATE = 0


def _input_path(filename):
    """Return where to read *filename* from.

    ``datasets/<filename>`` is preferred. If that does not exist but the file
    is present in the working directory (the repository keeps the CSVs at its
    root), that copy is used instead. When neither exists the ``datasets/``
    path is returned, so the resulting error names the expected location.
    """
    preferred = os.path.join(DATA_DIR, filename)
    if os.path.exists(preferred) or not os.path.exists(filename):
        return preferred
    return filename


def _output_path(filename):
    """Return ``datasets/<filename>``, creating the directory if needed."""
    os.makedirs(DATA_DIR, exist_ok=True)
    return os.path.join(DATA_DIR, filename)


def describe_data(data):
    """Print the head, shape and distinct last-column values of *data*."""
    print(data.head())
    print(data.shape)
    # presumably the last column is the quality-grade label -- confirm
    class_names = np.unique(data.iloc[:, -1])
    print(class_names)


def split_data(data):
    """Split *data* 90/10 and print summaries of both parts.

    Returns:
        ``(data_train, data_test)`` DataFrames.
    """
    data_train, data_test = train_test_split(data, test_size=0.1, random_state=0)
    print("训练集统计描述:\n", data_train.describe().round(2))
    print("验证集统计描述:\n", data_test.describe().round(2))
    print("训练集信息:\n", data_train.iloc[:, -1].value_counts())
    print("验证集信息:\n", data_test.iloc[:, -1].value_counts())
    return data_train, data_test


def plot_relationships(data):
    """Print the correlation matrix and show a scatter-plot matrix."""
    numeric = data.drop([GRADE_COLUMN], axis=1)
    # The correlation matrix used to be computed and thrown away; in a
    # script it has to be printed to be seen.
    print(numeric.corr())

    sns.set(style="ticks", color_codes=True)
    sns.pairplot(numeric, diag_kind='kde', plot_kws=dict(alpha=0.7))
    plt.show()


def search_hyperparameters(X_train, y_train):
    """Run a randomised search over forest settings and print the best ones.

    Returns:
        The ``best_params_`` dict. The final model below still uses fixed
        parameters; this result is informational.
    """
    # NOTE(review): 'mse'/'mae' and max_features='auto' are the spellings of
    # older scikit-learn releases; confirm against the installed version.
    random_grid = {
        'criterion': ['mse', 'mae'],
        'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
        'max_features': ['auto', 'sqrt'],
        'max_depth': [int(x) for x in np.linspace(10, 100, num=10)] + [None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }
    clf = RandomForestRegressor(random_state=RANDOM_STATE)
    clf_random = RandomizedSearchCV(
        estimator=clf,
        param_distributions=random_grid,
        n_iter=10,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=1,
    )
    clf_random.fit(X_train, y_train)
    print(clf_random.best_params_)
    return clf_random.best_params_


def _print_metrics(title, r2_label, model, X, y, y_pred):
    """Print R^2, MSE, MAE and explained variance for one data split."""
    print(title)
    print(r2_label, model.score(X, y))
    print('均方差', mean_squared_error(y, y_pred))
    print('绝对差', mean_absolute_error(y, y_pred))
    print('解释度', explained_variance_score(y, y_pred))


def train_and_evaluate(X_train, y_train, X_test, y_test, feature):
    """Fit the final forest, chart feature importances, print metrics.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    rf = RandomForestRegressor(
        criterion='mse',
        bootstrap=False,
        max_features='sqrt',
        max_depth=20,
        min_samples_split=10,
        n_estimators=1200,
        min_samples_leaf=2,
        random_state=RANDOM_STATE,
    )
    rf.fit(X_train, y_train)
    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)

    # Feature importances: printed, and rendered as an HTML bar chart.
    print(rf.feature_importances_)
    bar = Bar()
    bar.add('指标重要性', feature, rf.feature_importances_.round(2),
            is_label_show=True, label_text_color='#000')
    bar.render('指标重要性.html')

    # The headers used to say "decision tree"; the model is a random forest.
    _print_metrics("随机森林模型评估--训练集:", '训练r^2:',
                   rf, X_train, y_train, y_train_pred)
    _print_metrics("随机森林模型评估--验证集:", '验证r^2:',
                   rf, X_test, y_test, y_test_pred)
    return rf


def predict_and_save(rf):
    """Predict for every row of the prediction CSV and save the result.

    Returns:
        The path of the written result file.
    """
    data_pred = pd.read_csv(_input_path(PREDICT_FILE), index_col=0,
                            encoding=CSV_ENCODING)
    y_pred = rf.predict(data_pred.values)

    # One row per index entry of the prediction table, plus the prediction.
    result_reg = pd.DataFrame(data_pred.index)
    result_reg['AQI'] = y_pred
    result_path = _output_path(RESULT_FILE)
    result_reg.to_csv(result_path, encoding=CSV_ENCODING)
    print(result_reg)
    return result_path


def render_map(result_path):
    """Read the saved predictions back and render them on a Geo chart."""
    df = pd.read_csv(result_path, index_col=0, encoding=CSV_ENCODING)
    print(df.head())
    geo = Geo(
        "全国主要城市空气质量",
        "",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    geo.add(
        "",
        df.iloc[:, 0],
        df.iloc[:, 1],
        visual_range=[0, 300],
        visual_text_color="#111",
        symbol_size=15,
        is_visualmap=True,
        is_piecewise=True,
        pieces=[
            {"max": 50, "min": 0, "label": "优:0-50"},
            {"max": 100, "min": 50, "label": "良:51-100"},
            {"max": 150, "min": 100, "label": "轻度污染:101-150"},
            {"max": 200, "min": 150, "label": "中度污染:151-200"},
            {"max": 300, "min": 200, "label": "重度污染:201-300"},
            {"max": 1000, "min": 300, "label": "严重污染:>300"},
        ],
    )
    geo.render('全国重点城市AQI预测结果的可视化.html')


def main():
    """Run the whole analysis from loading to the rendered map."""
    data = pd.read_csv(_input_path(TRAIN_FILE), index_col=0,
                       encoding=CSV_ENCODING)
    describe_data(data)
    data_train, data_test = split_data(data)
    plot_relationships(data)

    # Features are every column except the last two; the target is the
    # second-to-last column (presumably AQI -- confirm against the CSV).
    X_train = data_train.iloc[:, 0:-2]
    X_test = data_test.iloc[:, 0:-2]
    feature = X_train.columns
    print(feature)
    y_train = data_train.iloc[:, -2]
    y_test = data_test.iloc[:, -2]

    search_hyperparameters(X_train, y_train)
    rf = train_and_evaluate(X_train, y_train, X_test, y_test, feature)
    result_path = predict_and_save(rf)
    render_map(result_path)


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------
# /air_train&test.csv
# --------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/air_train&test.csv -------------------------------------------------------------------------------- /图片0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片0.png -------------------------------------------------------------------------------- /图片1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片1.png -------------------------------------------------------------------------------- /图片8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片8.png --------------------------------------------------------------------------------