├── Get_data.py
├── PM2.5-Prediction-Based-on-Random-Forest-Algorithm-CN.ipynb
├── README.md
├── air.csv
├── air_reg_anlysis.py
├── air_train&test.csv
├── 图片0.png
├── 图片1.png
└── 图片8.png
/Get_data.py:
--------------------------------------------------------------------------------
import csv
import time

import requests
from bs4 import BeautifulSoup

# Scrape the 2018 daily air-quality tables for Mianyang from tianqihoubao.com
# (one page per month) and append every table row to air_mianyang_2018.csv.
# Column order written per row: date, quality grade, AQI, AQI rank, PM,
# PM10, SO2, NO2, CO, O3 (the first ten <td> cells of each table row).

headers = {
    # Replace with your own User-Agent string
    'User-Agent': ''
}

# Number of <td> cells a complete data row must provide.
CELLS_PER_ROW = 10

# Open the output once instead of once per row. newline='' plus an explicit
# '\n' terminator lets the csv module control line endings itself.
with open('air_mianyang_2018.csv', 'a+', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    for i in range(1, 13):
        # Pause between requests so the site is not hammered.
        time.sleep(5)
        # Month is zero-padded to two digits (1 -> "01") in the page name.
        url = 'http://www.tianqihoubao.com/aqi/mianyang-2018' + '%02d' % i + '.html'
        # A timeout prevents the script from hanging forever on a stalled connection.
        response = requests.get(url=url, headers=headers, timeout=30)
        if response.status_code != 200:
            # Report the failed month explicitly rather than parsing an error page.
            print('Skipping %s: HTTP %s' % (url, response.status_code))
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        tr = soup.find_all('tr')
        # Skip the first <tr>, which is the table header.
        for j in tr[1:]:
            td = j.find_all('td')
            if len(td) < CELLS_PER_ROW:
                # Incomplete row (e.g. layout/spacer row): nothing usable to record.
                continue
            # Strip every cell, not just the first four, so stray whitespace or
            # line breaks inside a cell cannot break the CSV layout.
            writer.writerow([cell.get_text().strip() for cell in td[:CELLS_PER_ROW]])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PM2.5-Prediction-Based-on-Random-Forest-Algorithm
2 | Based on Python 3.7.4 and Anaconda 3.0
3 | Modules needed:
4 | |-pandas
5 | |-numpy as np
6 | |-sklearn
7 | |-matplotlib
8 | |-pyecharts
9 |
10 | Important: the code uses the old pyecharts API (`from pyecharts import Bar` / `Geo`), so install a pyecharts version below 1.0.0.
11 | Zhihu Link:基于随机森林算法的PM2.5预测
12 | If you have any questions, please send an e-mail to: zyjy0315@hotmail.com
13 |
--------------------------------------------------------------------------------
/air.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/air.csv
--------------------------------------------------------------------------------
/air_reg_anlysis.py:
--------------------------------------------------------------------------------
1 | # 4510002988676@hgcxyka
2 | #0xpxtupj
3 |
#-*- coding: utf-8 -*-
# Random-forest regression on air-quality data.
#
# Steps: load and inspect the data set, split it into training and validation
# sets, look at correlations, tune and train a RandomForestRegressor, report
# its metrics, then predict AQI for the rows of a second file and render the
# result on a map.

# Import packages
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE: top-level Bar/Geo exist only in the pre-1.0 pyecharts API.
from pyecharts import Bar
from pyecharts import Geo
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,explained_variance_score,mean_absolute_error,r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split


# --- Descriptive look at the data and its distribution ---
data = pd.read_csv('datasets/air_train&test.csv', index_col=0, encoding='gb2312')
print(data.head())
print(data.shape)
# Distinct values found in the last column.
class_names = np.unique(data.iloc[:, -1])
print(class_names)


# --- Split into training and validation sets (10% held out) ---
data_train, data_test = train_test_split(data, test_size=0.1, random_state=0)
print("训练集统计描述:\n", data_train.describe().round(2))
print("验证集统计描述:\n", data_test.describe().round(2))
print("训练集信息:\n", data_train.iloc[:, -1].value_counts())
print("验证集信息:\n", data_test.iloc[:, -1].value_counts())


# --- Correlation coefficients between the variables ---
# The quality-grade column is dropped before computing correlations.
numeric_data = data.drop([u'质量等级'], axis=1)
# Print the matrix: as a bare expression its value is discarded when this
# file runs as a script, so nothing was ever shown.
print(numeric_data.corr())

# Scatter-plot matrix
sns.set(style="ticks", color_codes=True)
sns.pairplot(numeric_data, diag_kind='kde', plot_kws=dict(alpha=0.7))
plt.show()


# --- Build the random-forest regression model to predict AQI ---
# Features: every column except the last two; target: the second-to-last column.
X_train = data_train.iloc[:, 0:-2]
X_test = data_test.iloc[:, 0:-2]
feature = X_train.columns
print(feature)
y_train = data_train.iloc[:, -2]
y_test = data_test.iloc[:, -2]


# --- Hyper-parameter search ---
# NOTE(review): 'mse'/'mae' and max_features='auto' are the names used by older
# scikit-learn releases; newer releases renamed or removed them - confirm
# against the installed version before upgrading.
criterion = ['mse', 'mae']
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {
    'criterion': criterion,
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
# Randomized search: 10 sampled parameter sets, 3-fold cross-validation.
clf = RandomForestRegressor()
clf_random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1,
)
clf_random.fit(X_train, y_train)
print(clf_random.best_params_)


# --- Train, validate and evaluate the model ---
# The parameters below are hard-coded; they are not read from clf_random.
rf = RandomForestRegressor(
    criterion='mse',
    bootstrap=False,
    max_features='sqrt',
    max_depth=20,
    min_samples_split=10,
    n_estimators=1200,
    min_samples_leaf=2,
)

rf.fit(X_train, y_train)
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Feature importances, printed and rendered as an HTML bar chart.
print(rf.feature_importances_)
bar = Bar()
bar.add('指标重要性', feature, rf.feature_importances_.round(2), is_label_show=True, label_text_color='#000')
bar.render('指标重要性.html')

print("决策树模型评估--训练集:")
print('训练r^2:', rf.score(X_train, y_train))
print('均方差', mean_squared_error(y_train, y_train_pred))
print('绝对差', mean_absolute_error(y_train, y_train_pred))
print('解释度', explained_variance_score(y_train, y_train_pred))

print("决策树模型评估--验证集:")
print('验证r^2:', rf.score(X_test, y_test))
print('均方差', mean_squared_error(y_test, y_test_pred))
print('绝对差', mean_absolute_error(y_test, y_test_pred))
print('解释度', explained_variance_score(y_test, y_test_pred))

# --- Prediction on the second data file ---
# All columns of air.csv are passed to the model as features.
data_pred = pd.read_csv('datasets/air.csv', index_col=0, encoding='gb2312')
index = data_pred.index
y_pred = rf.predict(data_pred.values)

# Save the predictions: the original index values plus an 'AQI' column.
result_reg = pd.DataFrame(index)
result_reg['AQI'] = y_pred
result_reg.to_csv('datasets/result_reg_city.txt', encoding='gb2312')
print(result_reg)


# --- Visualize the predictions on a map ---
# Re-read the saved file; after index_col=0 the first remaining column holds
# the saved index values and the second the predicted AQI.
df = pd.read_csv('datasets/result_reg_city.txt', index_col=0, encoding='gb2312')
print(df.head())
geo = Geo(
    "全国主要城市空气质量",
    "",
    title_color="#fff",
    title_pos="center",
    width=1200,
    height=600,
    background_color="#404a59",
)
geo.add(
    "",
    df.iloc[:, 0],
    df.iloc[:, 1],
    visual_range=[0, 300],
    visual_text_color="#111",
    symbol_size=15,
    is_visualmap=True,
    is_piecewise=True,
    # Piecewise legend: one entry per AQI band.
    pieces=[
        {"max": 50, "min": 0, "label": "优:0-50"},
        {"max": 100, "min": 50, "label": "良:51-100"},
        {"max": 150, "min": 100, "label": "轻度污染:101-150"},
        {"max": 200, "min": 150, "label": "中度污染:151-200"},
        {"max": 300, "min": 200, "label": "重度污染:201-300"},
        {"max": 1000, "min": 300, "label": "严重污染:>300"},
    ],
)
geo.render('全国重点城市AQI预测结果的可视化.html')
--------------------------------------------------------------------------------
/air_train&test.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/air_train&test.csv
--------------------------------------------------------------------------------
/图片0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片0.png
--------------------------------------------------------------------------------
/图片1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片1.png
--------------------------------------------------------------------------------
/图片8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephenZheng0315/PM2.5-Prediction-Based-on-Random-Forest-Algorithm/dfe8d5bc4085c7c085edc6ed0931ea83aa5acf0e/图片8.png
--------------------------------------------------------------------------------