├── README.md ├── data └── api_access_fix.csv ├── decomp_model.py └── test_stationarity.py /README.md: -------------------------------------------------------------------------------- 1 | # cyclical_series_predict 2 | function: 3 | 对周期性时间序列做预测 4 | 5 | usage: 6 | python decomp_model.py 7 | 8 | requirments: 9 | python3,statsmodels,pandas,numpy,matplotlib,os,datetime 10 | 11 | tip: 12 | 画图的部分注释掉了,如果想看图,要把注释去掉 13 | -------------------------------------------------------------------------------- /decomp_model.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ''' 3 | 周期性时间序列预测 4 | ''' 5 | import os 6 | import numpy as np 7 | from test_stationarity import * 8 | from statsmodels.tsa.seasonal import seasonal_decompose 9 | from statsmodels.tsa.arima_model import ARIMA 10 | from datetime import timedelta 11 | os.chdir(os.getcwd()+'/data') 12 | 13 | 14 | class ModelDecomp(object): 15 | def __init__(self, file, test_size=1440): 16 | self.ts = self.read_data(file) 17 | self.test_size = test_size 18 | self.train_size = len(self.ts) - self.test_size 19 | self.train = self.ts[:len(self.ts)-test_size] 20 | self.train = self._diff_smooth(self.train) 21 | # draw_ts(self.train) 22 | self.test = self.ts[-test_size:] 23 | 24 | def read_data(self, f): 25 | data = pd.read_csv(f) 26 | data = data.set_index('date') 27 | data.index = pd.to_datetime(data.index) 28 | ts = data['count'] 29 | draw_ts(ts) 30 | return ts 31 | 32 | def _diff_smooth(self, ts): 33 | dif = ts.diff().dropna() 34 | td = dif.describe() 35 | high = td['75%'] + 1.5 * (td['75%'] - td['25%']) 36 | low = td['25%'] - 1.5 * (td['75%'] - td['25%']) 37 | 38 | forbid_index = dif[(dif > high) | (dif < low)].index 39 | i = 0 40 | while i < len(forbid_index) - 1: 41 | n = 1 42 | start = forbid_index[i] 43 | while forbid_index[i+n] == start + timedelta(minutes=n): 44 | n += 1 45 | i += n - 1 46 | 47 | end = forbid_index[i] 48 | value = np.linspace(ts[start - timedelta(minutes=1)], ts[end + timedelta(minutes=1)], n) 49 | ts[start: end] = value 50 | i += 1 51 | 52 | return ts 53 | 54 | def decomp(self, freq): 55 | ''' 56 | 对时间序列进行分解 57 | :param freq: 周期 58 | ''' 59 | decomposition = seasonal_decompose(self.train, freq=freq, two_sided=False) 60 | self.trend = decomposition.trend 61 | self.seasonal = decomposition.seasonal 62 | self.residual = decomposition.resid 63 | # decomposition.plot() 64 | # plt.show() 65 | 66 | d = self.residual.describe() 67 | delta = d['75%'] - d['25%'] 68 | 69 | self.low_error, self.high_error = (d['25%'] - 1 * delta, d['75%'] + 1 * delta) 70 | 71 | def trend_model(self, order): 72 | ''' 73 | 为分解出来的趋势数据单独建模 74 | ''' 75 | self.trend.dropna(inplace=True) 76 | self.trend_model = ARIMA(self.trend, order).fit(disp=-1, method='css') 77 | 78 | return self.trend_model 79 | 80 | def add_season(self): 81 | ''' 82 | 为预测出的趋势数据添加周期数据和残差数据 83 | ''' 84 | self.train_season = self.seasonal 85 | values = [] 86 | low_conf_values = [] 87 | high_conf_values = [] 88 | 89 | for i, t in enumerate(self.pred_time_index): 90 | trend_part = self.trend_pred[i] 91 | 92 | # 相同时间的数据均值 93 | season_part = self.train_season[ 94 | self.train_season.index.time == t.time() 95 | ].mean() 96 | 97 | # 趋势+周期+误差界限 98 | predict = trend_part + season_part 99 | low_bound = trend_part + season_part + self.low_error 100 | high_bound = trend_part + season_part + self.high_error 101 | 102 | values.append(predict) 103 | low_conf_values.append(low_bound) 104 | high_conf_values.append(high_bound) 105 | 106 | self.final_pred = pd.Series(values, index=self.pred_time_index, name='predict') 107 | self.low_conf = pd.Series(low_conf_values, index=self.pred_time_index, name='low_conf') 108 | self.high_conf = pd.Series(high_conf_values, index=self.pred_time_index, name='high_conf') 109 | 110 | def predict_new(self): 111 | ''' 112 | 预测新数据 113 | ''' 114 | #续接train,生成长度为n的时间索引,赋给预测序列 115 | n = self.test_size 116 | self.pred_time_index= pd.date_range(start=self.train.index[-1], periods=n+1, freq='1min')[1:] 117 | self.trend_pred= self.trend_model.forecast(n)[0] 118 | 119 | self.add_season() 120 | 121 | 122 | def evaluate(filename): 123 | md = ModelDecomp(file=filename, test_size=1440) 124 | md.decomp(freq=1440) 125 | md.trend_model(order=(1, 1, 3)) 126 | md.predict_new() 127 | pred = md.final_pred 128 | test = md.test 129 | 130 | plt.subplot(211) 131 | plt.plot(md.ts) 132 | plt.title(filename.split('.')[0]) 133 | plt.subplot(212) 134 | pred.plot(color='salmon', label='Predict') 135 | test.plot(color='steelblue', label='Original') 136 | md.low_conf.plot(color='grey', label='low') 137 | md.high_conf.plot(color='grey', label='high') 138 | 139 | plt.legend(loc='best') 140 | plt.title('RMSE: %.4f' % np.sqrt(sum((pred.values - test.values) ** 2) / test.size)) 141 | plt.tight_layout() 142 | plt.show() 143 | 144 | 145 | if __name__ == '__main__': 146 | filename = 'api_access_fix.csv' 147 | evaluate(filename) 148 | -------------------------------------------------------------------------------- /test_stationarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | 一些时间序列的画图工具 4 | ''' 5 | from statsmodels.tsa.stattools import adfuller 6 | import pandas as pd 7 | import seaborn as sns 8 | import matplotlib.pyplot as plt 9 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 10 | 11 | 12 | # 时间序列图 13 | def draw_ts(timeseries): 14 | timeseries.plot() 15 | plt.show() 16 | 17 | 18 | # 移动平均图 19 | def draw_trend(timeseries, size): 20 | f = plt.figure(facecolor='white') 21 | # 对size个数据进行移动平均 22 | rol_mean = timeseries.rolling(window=size).mean() 23 | # 对size个数据进行加权移动平均 24 | rol_weighted_mean = pd.ewma(timeseries, span=size) 25 | 26 | timeseries.plot(color='blue', label='Original') 27 | rol_mean.plot(color='red', label='Rolling Mean') 28 | rol_weighted_mean.plot(color='black', label='Weighted Rolling Mean') 29 | plt.legend(loc='best') 30 | plt.title('Rolling Mean') 31 | plt.show() 32 | 33 | 34 | # 语义描述 35 | def testStationarity(ts): 36 | dftest = adfuller(ts) 37 | # 对上述函数求得的值进行语义描述 38 | dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']) 39 | for key, value in dftest[4].items(): 40 | dfoutput['Critical Value (%s)' % key] = value 41 | print(dfoutput) 42 | return dfoutput 43 | 44 | 45 | # 自相关和偏相关图,默认阶数为31阶 46 | def draw_acf_pacf(ts, lags=31): 47 | f = plt.figure(facecolor='white') 48 | ax1 = f.add_subplot(211) 49 | plot_acf(ts, lags, ax=ax1) 50 | ax2 = f.add_subplot(212) 51 | plot_pacf(ts, lags=31, ax=ax2) 52 | plt.show() 53 | --------------------------------------------------------------------------------