├── Get_data.py
├── PM2.5-Prediction-Based-on-Random-Forest-Algorithm-CN.ipynb
├── README.md
├── air.csv
├── air_reg_anlysis.py
├── air_train&test.csv
├── 图片0.png
├── 图片1.png
└── 图片8.png


/Get_data.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import requests
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | headers = {
 6 |         #自行更改User-Agent
 7 |         'User-Agent':''
 8 | }
 9 | for i in range(1, 13):
10 |     time.sleep(5)
11 |     # 把1转换为01
12 |     # 获取2018年空气质量数据
13 |     url = 'http://www.tianqihoubao.com/aqi/mianyang-2018' + str("%02d" % i) + '.html'  
14 |     response = requests.get(url=url, headers=headers)
15 |     soup = BeautifulSoup(response.text, 'html.parser')
16 |     tr = soup.find_all('tr')
17 |     # 去除标签栏
18 |     for j in tr[1:]:
19 |         td = j.find_all('td')
20 |         Date = td[0].get_text().strip()
21 |         Quality_grade = td[1].get_text().strip()
22 |         AQI = td[2].get_text().strip()
23 |         AQI_rank = td[3].get_text().strip()
24 |         PM = td[4].get_text()
25 |         PM10=td[5].get_text()
26 |         So2=td[6].get_text()
27 |         No2=td[7].get_text()
28 |         Co=td[8].get_text()
29 |         O3=td[9].get_text()
30 |         with open('air_mianyang_2018.csv', 'a+', encoding='utf-8-sig') as f:
31 |             f.write(Date + ',' + Quality_grade + ',' + AQI + ',' + AQI_rank + ',' + PM + ','+PM10+','+So2+','+No2+','+Co+','+O3+'\n')


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PM2.5-Prediction-Based-on-Random-Forest-Algorithm
 2 | Based on Python 3.7.4 and Anaconda 3.0  
 3 | Modules needed:  
 4 | |-pandas  
 5 | |-numpy as np   
 6 | |-sklearn  
 7 | |-matplotlib  
 8 | |-pyecharts  
 9 | 
10 | Important: the pyecharts version must be below 1.0.0, because the code uses the pre-1.0 `from pyecharts import Bar` / `from pyecharts import Geo` API.  
11 | <a href = "https://www.zhihu.com/">Zhihu Link</a>:<a href="https://zhuanlan.zhihu.com/p/83220850">基于随机森林算法的PM2.5预测</a>  
12 | If you have any questions, please e-mail me: zyjy0315@hotmail.com
13 | 


--------------------------------------------------------------------------------
/air.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/air.csv


--------------------------------------------------------------------------------
/air_reg_anlysis.py:
--------------------------------------------------------------------------------
  1 | # 4510002988676@hgcxyka
  2 | #0xpxtupj
  3 | 
  4 | #-*- coding: utf-8 -*-
  5 | ##导入包
  6 | import pandas as pd
  7 | import numpy as np
  8 | from sklearn.ensemble import RandomForestRegressor
  9 | import matplotlib.pyplot as plt
 10 | import matplotlib
 11 | from sklearn.model_selection import train_test_split
 12 | 
 13 | 
 14 | '''对数据进行统计分析，查看数据的分布情况'''
 15 | data=pd.read_csv('datasets/air_train&test.csv',index_col=0,encoding='gb2312')
 16 | print (data.head())
 17 | print (data.shape)
 18 | index=data.index
 19 | col=data.columns
 20 | class_names=np.unique(data.iloc[:,-1])
 21 | #print (type(data))
 22 | print (class_names)
 23 | #print (data.describe())
 24 | 
 25 | 
 26 | '''划分训练集和验证集'''
 27 | data_train, data_test= train_test_split(data,test_size=0.1, random_state=0)
 28 | print ("训练集统计描述：\n",data_train.describe().round(2))
 29 | print ("验证集统计描述：\n",data_test.describe().round(2))
 30 | print ("训练集信息：\n",data_train.iloc[:,-1].value_counts())
 31 | print ("验证集信息：\n",data_test.iloc[:,-1].value_counts())
 32 | 
 33 | 
 34 | '''查看各变量间的相关系数'''
 35 | data.drop([u'质量等级'],axis = 1).corr()
 36 | 
 37 | ##绘制散点图矩阵
 38 | import seaborn as sns
 39 | sns.set(style="ticks", color_codes=True);
 40 | # 创建自定义颜色调色板
 41 | palette = sns.xkcd_palette(['dark blue', 'dark green', 'gold', 'orange'])
 42 | # 画散点图矩阵
 43 | sns.pairplot(data.drop([u'质量等级'],axis = 1), diag_kind = 'kde', plot_kws=dict(alpha = 0.7))
 44 | plt.show()
 45 | 
 46 | 
 47 | '''构建随机森林回归模型预测AQI'''
 48 | #获取训练集和验证集
 49 | X_train=data_train.iloc[:,0:-2]
 50 | X_test=data_test.iloc[:,0:-2]
 51 | feature=data_train.iloc[:,0:-2].columns
 52 | print (feature)
 53 | y_train=data_train.iloc[:,-2]
 54 | y_test=data_test.iloc[:,-2]
 55 | #print (y_test_reg)
 56 | 
 57 | 
 58 | '''模型调参'''
 59 | ##参数选择
 60 | from sklearn.model_selection import RandomizedSearchCV
 61 | criterion=['mse','mae']
 62 | n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
 63 | max_features = ['auto', 'sqrt']
 64 | max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
 65 | max_depth.append(None)
 66 | min_samples_split = [2, 5, 10]
 67 | min_samples_leaf = [1, 2, 4]
 68 | bootstrap = [True, False]
 69 | random_grid = {'criterion':criterion,
 70 |                 'n_estimators': n_estimators,
 71 |                'max_features': max_features,
 72 |                'max_depth': max_depth,
 73 |                'min_samples_split': min_samples_split,
 74 |                'min_samples_leaf': min_samples_leaf,
 75 |                'bootstrap': bootstrap}
 76 | #构建模型
 77 | clf= RandomForestRegressor()
 78 | clf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid,
 79 |                               n_iter = 10,  
 80 |                               cv = 3, verbose=2, random_state=42, n_jobs=1)
 81 | #回归
 82 | clf_random.fit(X_train, y_train)
 83 | print (clf_random.best_params_)
 84 | 
 85 | 
 86 | '''模型训练、验证、评估'''
 87 | from pyecharts import Bar
 88 | rf=RandomForestRegressor(criterion='mse',bootstrap=False,max_features='sqrt', max_depth=20,min_samples_split=10, n_estimators=1200,min_samples_leaf=2)
 89 | 
 90 | rf.fit(X_train, y_train) 
 91 | y_train_pred=rf.predict(X_train)
 92 | y_test_pred=rf.predict(X_test)
 93 | 
 94 | #指标重要性
 95 | print (rf.feature_importances_)
 96 | bar=Bar()
 97 | bar.add('指标重要性',feature, rf.feature_importances_.round(2),is_label_show=True,label_text_color='#000')
 98 | bar.render('指标重要性.html')
 99 | 
100 | from sklearn.metrics import mean_squared_error,explained_variance_score,mean_absolute_error,r2_score
101 | print ("决策树模型评估--训练集：")
102 | print ('训练r^2:',rf.score(X_train,y_train))
103 | print ('均方差',mean_squared_error(y_train,y_train_pred))
104 | print ('绝对差',mean_absolute_error(y_train,y_train_pred))
105 | print ('解释度',explained_variance_score(y_train,y_train_pred))
106 | 
107 | print ("决策树模型评估--验证集：")
108 | print ('验证r^2:',rf.score(X_test,y_test))
109 | print ('均方差',mean_squared_error(y_test,y_test_pred))
110 | print ('绝对差',mean_absolute_error(y_test,y_test_pred))
111 | print ('解释度',explained_variance_score(y_test,y_test_pred))
112 | 
# Predict with the trained model on a separate data file.
data_pred = pd.read_csv(
    'datasets/air.csv',
    index_col=0,
    encoding='gb2312',
)
index = data_pred.index
# NOTE(review): assumes the columns of air.csv match the training feature
# columns in the same order -- confirm against the data file.
y_pred = rf.predict(data_pred.values)

# Save the predictions: one row per index label plus the predicted AQI.
result_reg = pd.DataFrame(index).assign(AQI=y_pred)
result_reg.to_csv('datasets/result_reg_city.txt', encoding='gb2312')
print(result_reg)
123 | 
124 | 
# Visualize the prediction results on a map.
from pyecharts import Geo

# Read the predictions back; after index_col=0 the first remaining column is
# passed to the chart as names and the second as values.
df = pd.read_csv('datasets/result_reg_city.txt', index_col=0, encoding='gb2312')
print(df.head())

geo = Geo(
    "全国主要城市空气质量",
    "",
    title_color="#fff",
    title_pos="center",
    width=1200,
    height=600,
    background_color="#404a59",
)

# (lower bound, upper bound, legend label) for each piece of the visual map.
_aqi_bands = [
    (0, 50, "优:0-50"),
    (50, 100, "良:51-100"),
    (100, 150, "轻度污染:101-150"),
    (150, 200, "中度污染:151-200"),
    (200, 300, "重度污染:201-300"),
    (300, 1000, "严重污染:>300"),
]

geo.add(
    "",
    df.iloc[:, 0],
    df.iloc[:, 1],
    visual_range=[0, 300],
    visual_text_color="#111",
    symbol_size=15,
    is_visualmap=True,
    is_piecewise=True,
    pieces=[
        {"max": _hi, "min": _lo, "label": _text}
        for _lo, _hi, _text in _aqi_bands
    ],
)
geo.render('全国重点城市AQI预测结果的可视化.html')


--------------------------------------------------------------------------------
/air_train&test.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/air_train&test.csv


--------------------------------------------------------------------------------
/图片0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片0.png


--------------------------------------------------------------------------------
/图片1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片1.png


--------------------------------------------------------------------------------
/图片8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片8.png


--------------------------------------------------------------------------------