├── README.md
├── seriesShow.py
├── dataPreprocessing.py
└── main.py


/README.md:
--------------------------------------------------------------------------------
 1 | # 基于LSTM模型预测PM2.5
 2 | 数据集：UCI 数据集 Beijing PM2.5 Data Set
 3 | 数据集详细说明请见：https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data
 4 | 
 5 | ## 环境说明
 6 | 本程序在 Windows 10 系统的 PyCharm 中、使用以下环境运行成功，不保证在其他环境中能够运行。
 7 | 
 8 | ```
 9 | Python == 3.8
10 | Tensorflow == 2.5.0
11 | scikit-learn == 0.24.2
12 | ```
13 | 
14 | ## 各程序说明
15 | ```dataPreprocessing.py```: 数据预处理，原始数据为 ```raw.csv```，预处理后的数据为 ```pollution.csv```。
16 | 
17 | ```seriesShow.py```: 时间序列绘制程序
18 | 
19 | ```main.py```: 主程序
20 | 


--------------------------------------------------------------------------------
/seriesShow.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author  : Hugo Wang
 3 | # @Time    : 2021/8/19 10:29
 4 | # @IDE     : PyCharm
 5 | # @Function: Show Time Series
 6 | 
 7 | from pandas import read_csv
 8 | from matplotlib import pyplot
 9 | 
# Read the preprocessed table; the first CSV column becomes the index.
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values
# Column positions to draw, one stacked subplot per entry. Position 4 is
# deliberately absent from the list.
groups = [0, 1, 2, 3, 5, 6, 7]
pyplot.figure()
# Subplot slots are 1-based, so start the enumeration at 1.
for i, group in enumerate(groups, start=1):
    pyplot.subplot(len(groups), 1, i)
    pyplot.plot(values[:, group])
    # Label each panel with its column name, placed mid-height on the right.
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
pyplot.show()
24 | 


--------------------------------------------------------------------------------
/dataPreprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author  : Hugo Wang
 3 | # @Time    : 2021/8/19 10:26
 4 | # @IDE     : PyCharm
 5 | # @Function: 
 6 | 
 7 | from pandas import read_csv
 8 | from datetime import datetime
 9 | 
10 | 
11 | # load data
def parse(x):
    """Convert a 'year month day hour' string into a ``datetime``.

    :param x: space-separated date parts, e.g. ``'2010 1 2 0'``.
    :return: the corresponding naive ``datetime``.
    :raises ValueError: if ``x`` does not match the expected layout.
    """
    timestamp_layout = '%Y %m %d %H'
    return datetime.strptime(x, timestamp_layout)
14 | 
15 | 
# Load the raw records. The 'year', 'month', 'day' and 'hour' columns are
# combined and handed to `parse`, and the resulting datetime column becomes
# the index.
# NOTE(review): `date_parser` and nested `parse_dates` lists are deprecated in
# pandas 2.x - confirm the pinned pandas version before upgrading.
dataset = read_csv('raw.csv', parse_dates=[['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
# Discard the 'No' column; it is not part of the feature set named below.
dataset = dataset.drop('No', axis=1)
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# Mark all NA pollution values with 0. The result is assigned back instead of
# calling fillna(inplace=True) on the column selection: that chained form
# works on an intermediate object and is not guaranteed to update `dataset`
# (recent pandas versions warn about it).
dataset['pollution'] = dataset['pollution'].fillna(0)
# drop the first 24 rows (the first 24 hours) by position
dataset = dataset.iloc[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('pollution.csv')
29 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author  : Hugo Wang
  3 | # @Time    : 2021/8/19 10:29
  4 | # @IDE     : PyCharm
  5 | # @Function:
  6 | 
  7 | 
  8 | from math import sqrt
  9 | from numpy import concatenate
 10 | from matplotlib import pyplot
 11 | from pandas import read_csv
 12 | from pandas import DataFrame
 13 | from pandas import concat
 14 | from sklearn.preprocessing import MinMaxScaler
 15 | from sklearn.preprocessing import LabelEncoder
 16 | from sklearn.metrics import mean_squared_error
 17 | from tensorflow.keras.models import Sequential
 18 | from tensorflow.keras.layers import Dense
 19 | from tensorflow.keras.layers import LSTM
 20 | 
 21 | 
 22 | # convert series to supervised learning
 23 | def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
 24 |     n_vars = 1 if type(data) is list else data.shape[1]
 25 |     df = DataFrame(data)
 26 |     cols, names = list(), list()
 27 |     # input sequence (t-n, ... t-1)
 28 |     for i in range(n_in, 0, -1):
 29 |         cols.append(df.shift(i))
 30 |         names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
 31 |     # forecast sequence (t, t+1, ... t+n)
 32 |     for i in range(0, n_out):
 33 |         cols.append(df.shift(-i))
 34 |         if i == 0:
 35 |             names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
 36 |         else:
 37 |             names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
 38 |     # put it all together
 39 |     agg = concat(cols, axis=1)
 40 |     agg.columns = names
 41 |     # drop rows with NaN values
 42 |     if dropnan:
 43 |         agg.dropna(inplace=True)
 44 |     return agg
 45 | 
 46 | 
 47 | # load dataset
 48 | dataset = read_csv('pollution.csv', header=0, index_col=0)
 49 | values = dataset.values
 50 | # integer encode direction
 51 | encoder = LabelEncoder()
 52 | values[:, 4] = encoder.fit_transform(values[:, 4])
 53 | # ensure all data is float
 54 | values = values.astype('float32')
 55 | # normalize features
 56 | scaler = MinMaxScaler(feature_range=(0, 1))
 57 | scaled = scaler.fit_transform(values)
 58 | # frame as supervised learning
 59 | reframed = series_to_supervised(scaled, 1, 1)
 60 | # drop columns we don't want to predict
 61 | reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]], axis=1, inplace=True)
 62 | print(reframed.head())
 63 | 
 64 | # split into train and test sets
 65 | values = reframed.values
 66 | n_train_hours = 365 * 24
 67 | train = values[:n_train_hours, :]
 68 | test = values[n_train_hours:, :]
 69 | # split into input and outputs
 70 | train_X, train_y = train[:, :-1], train[:, -1]
 71 | test_X, test_y = test[:, :-1], test[:, -1]
 72 | # reshape input to be 3D [samples, timesteps, features]
 73 | train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
 74 | test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
 75 | print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
 76 | 
 77 | # design network
 78 | model = Sequential()
 79 | model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
 80 | model.add(Dense(1))
 81 | model.compile(loss='mae', optimizer='adam')
 82 | # fit network
 83 | history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)
 84 | pyplot.plot(history.history['loss'], label='train')
 85 | pyplot.plot(history.history['val_loss'], label='test')
 86 | pyplot.legend()
 87 | pyplot.show()
 88 | # make a prediction
 89 | yhat = model.predict(test_X)
 90 | test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
 91 | # invert scaling for forecast
 92 | inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
 93 | inv_yhat = scaler.inverse_transform(inv_yhat)
 94 | inv_yhat = inv_yhat[:, 0]
 95 | # invert scaling for actual
 96 | test_y = test_y.reshape((len(test_y), 1))
 97 | inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
 98 | inv_y = scaler.inverse_transform(inv_y)
 99 | inv_y = inv_y[:, 0]
100 | # calculate RMSE
101 | rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
102 | print('Test RMSE: %.3f' % rmse)
103 | 


--------------------------------------------------------------------------------