├── README.md ├── seriesShow.py ├── dataPreprocessing.py └── main.py /README.md: -------------------------------------------------------------------------------- 1 | # 基于LSTM模型预测PM2.5 2 | 数据集:UCI数据集 BeiJing PM2.5 Data Data Set 3 | 数据集详细说明请见:https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data 4 | 5 | ## 环境说明 6 | 本程序在win10系统pycharm的以下环境中运行成功,不保证在其他环境能够运行 7 | 8 | ``` 9 | Python == 3.8 10 | Tensorflow == 2.5.0 11 | scikit-learn == 0.24.2 12 | ``` 13 | 14 | ## 各程序说明 15 | ```dataPreprocessing.py```: 数据预处理,原始数据为 ```raw.csv```,预处理后的数据为 ```pollution.csv```; 16 | 17 | ```seriesShow.py```: 时间序列绘制程序 18 | 19 | ```main.py```: 主程序 20 | -------------------------------------------------------------------------------- /seriesShow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Hugo Wang 3 | # @Time : 2021/8/19 10:29 4 | # @IDE : PyCharm 5 | # @Function: Show Time Series 6 | 7 | from pandas import read_csv 8 | from matplotlib import pyplot 9 | 10 | # load dataset 11 | dataset = read_csv('pollution.csv', header=0, index_col=0) 12 | values = dataset.values 13 | # specify columns to plot 14 | groups = [0, 1, 2, 3, 5, 6, 7] 15 | i = 1 16 | # plot each column 17 | pyplot.figure() 18 | for group in groups: 19 | pyplot.subplot(len(groups), 1, i) 20 | pyplot.plot(values[:, group]) 21 | pyplot.title(dataset.columns[group], y=0.5, loc='right') 22 | i += 1 23 | pyplot.show() 24 | -------------------------------------------------------------------------------- /dataPreprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Hugo Wang 3 | # @Time : 2021/8/19 10:26 4 | # @IDE : PyCharm 5 | # @Function: 6 | 7 | from pandas import read_csv 8 | from datetime import datetime 9 | 10 | 11 | # load data 12 | def parse(x): 13 | return datetime.strptime(x, '%Y %m %d %H') 14 | 15 | 16 | dataset = read_csv('raw.csv', parse_dates=[['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse) 17 | dataset.drop('No', axis=1, inplace=True) 18 | # manually specify column names 19 | dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain'] 20 | dataset.index.name = 'date' 21 | # mark all NA values with 0 22 | dataset['pollution'].fillna(0, inplace=True) 23 | # drop the first 24 hours 24 | dataset = dataset[24:] 25 | # summarize first 5 rows 26 | print(dataset.head(5)) 27 | # save to file 28 | dataset.to_csv('pollution.csv') 29 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Hugo Wang 3 | # @Time : 2021/8/19 10:29 4 | # @IDE : PyCharm 5 | # @Function: 6 | 7 | 8 | from math import sqrt 9 | from numpy import concatenate 10 | from matplotlib import pyplot 11 | from pandas import read_csv 12 | from pandas import DataFrame 13 | from pandas import concat 14 | from sklearn.preprocessing import MinMaxScaler 15 | from sklearn.preprocessing import LabelEncoder 16 | from sklearn.metrics import mean_squared_error 17 | from tensorflow.keras.models import Sequential 18 | from tensorflow.keras.layers import Dense 19 | from tensorflow.keras.layers import LSTM 20 | 21 | 22 | # convert series to supervised learning 23 | def series_to_supervised(data, n_in=1, n_out=1, dropnan=True): 24 | n_vars = 1 if type(data) is list else data.shape[1] 25 | df = DataFrame(data) 26 | cols, names = list(), list() 27 | # input sequence (t-n, ... t-1) 28 | for i in range(n_in, 0, -1): 29 | cols.append(df.shift(i)) 30 | names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)] 31 | # forecast sequence (t, t+1, ... t+n) 32 | for i in range(0, n_out): 33 | cols.append(df.shift(-i)) 34 | if i == 0: 35 | names += [('var%d(t)' % (j + 1)) for j in range(n_vars)] 36 | else: 37 | names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)] 38 | # put it all together 39 | agg = concat(cols, axis=1) 40 | agg.columns = names 41 | # drop rows with NaN values 42 | if dropnan: 43 | agg.dropna(inplace=True) 44 | return agg 45 | 46 | 47 | # load dataset 48 | dataset = read_csv('pollution.csv', header=0, index_col=0) 49 | values = dataset.values 50 | # integer encode direction 51 | encoder = LabelEncoder() 52 | values[:, 4] = encoder.fit_transform(values[:, 4]) 53 | # ensure all data is float 54 | values = values.astype('float32') 55 | # normalize features 56 | scaler = MinMaxScaler(feature_range=(0, 1)) 57 | scaled = scaler.fit_transform(values) 58 | # frame as supervised learning 59 | reframed = series_to_supervised(scaled, 1, 1) 60 | # drop columns we don't want to predict 61 | reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]], axis=1, inplace=True) 62 | print(reframed.head()) 63 | 64 | # split into train and test sets 65 | values = reframed.values 66 | n_train_hours = 365 * 24 67 | train = values[:n_train_hours, :] 68 | test = values[n_train_hours:, :] 69 | # split into input and outputs 70 | train_X, train_y = train[:, :-1], train[:, -1] 71 | test_X, test_y = test[:, :-1], test[:, -1] 72 | # reshape input to be 3D [samples, timesteps, features] 73 | train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1])) 74 | test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1])) 75 | print(train_X.shape, train_y.shape, test_X.shape, test_y.shape) 76 | 77 | # design network 78 | model = Sequential() 79 | model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))) 80 | model.add(Dense(1)) 81 | model.compile(loss='mae', optimizer='adam') 82 | # fit network 83 | history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False) 84 | pyplot.plot(history.history['loss'], label='train') 85 | pyplot.plot(history.history['val_loss'], label='test') 86 | pyplot.legend() 87 | pyplot.show() 88 | # make a prediction 89 | yhat = model.predict(test_X) 90 | test_X = test_X.reshape((test_X.shape[0], test_X.shape[2])) 91 | # invert scaling for forecast 92 | inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1) 93 | inv_yhat = scaler.inverse_transform(inv_yhat) 94 | inv_yhat = inv_yhat[:, 0] 95 | # invert scaling for actual 96 | test_y = test_y.reshape((len(test_y), 1)) 97 | inv_y = concatenate((test_y, test_X[:, 1:]), axis=1) 98 | inv_y = scaler.inverse_transform(inv_y) 99 | inv_y = inv_y[:, 0] 100 | # calculate RMSE 101 | rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) 102 | print('Test RMSE: %.3f' % rmse) 103 | --------------------------------------------------------------------------------