├── README.md
├── data
    └── api_access_fix.csv
├── decomp_model.py
└── test_stationarity.py


/README.md:
--------------------------------------------------------------------------------
 1 | # cyclical_series_predict
 2 | function：
 3 | 对周期性时间序列做预测
 4 | 
 5 | usage：
 6 | python decomp_model.py
 7 | 
 8 | requirements:
 9 | python3, statsmodels, pandas, numpy, matplotlib, seaborn (os and datetime ship with the Python standard library)
10 | 
11 | tip:
12 | 画图的部分注释掉了，如果想看图，要把注释去掉
13 | 


--------------------------------------------------------------------------------
/decomp_model.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | '''
  3 | 周期性时间序列预测
  4 | '''
  5 | import os
  6 | import numpy as np
  7 | from test_stationarity import *
  8 | from statsmodels.tsa.seasonal import seasonal_decompose
  9 | from statsmodels.tsa.arima_model import ARIMA
 10 | from datetime import timedelta
 11 | os.chdir(os.getcwd()+'/data')
 12 | 
 13 | 
 14 | class ModelDecomp(object):
 15 |     def __init__(self, file, test_size=1440):
 16 |         self.ts = self.read_data(file)
 17 |         self.test_size = test_size
 18 |         self.train_size = len(self.ts) - self.test_size
 19 |         self.train = self.ts[:len(self.ts)-test_size]
 20 |         self.train = self._diff_smooth(self.train)
 21 |         # draw_ts(self.train)
 22 |         self.test = self.ts[-test_size:]
 23 | 
 24 |     def read_data(self, f):
 25 |         data = pd.read_csv(f)
 26 |         data = data.set_index('date')
 27 |         data.index = pd.to_datetime(data.index)
 28 |         ts = data['count']
 29 |         draw_ts(ts)
 30 |         return ts
 31 | 
 32 |     def _diff_smooth(self, ts):
 33 |         dif = ts.diff().dropna()
 34 |         td = dif.describe()
 35 |         high = td['75%'] + 1.5 * (td['75%'] - td['25%'])
 36 |         low = td['25%'] - 1.5 * (td['75%'] - td['25%'])
 37 | 
 38 |         forbid_index = dif[(dif > high) | (dif < low)].index
 39 |         i = 0
 40 |         while i < len(forbid_index) - 1:
 41 |             n = 1
 42 |             start = forbid_index[i]
 43 |             while forbid_index[i+n] == start + timedelta(minutes=n):
 44 |                 n += 1
 45 |             i += n - 1
 46 | 
 47 |             end = forbid_index[i]
 48 |             value = np.linspace(ts[start - timedelta(minutes=1)], ts[end + timedelta(minutes=1)], n)
 49 |             ts[start: end] = value
 50 |             i += 1
 51 | 
 52 |         return ts
 53 | 
 54 |     def decomp(self, freq):
 55 |         '''
 56 |         对时间序列进行分解
 57 |         :param freq: 周期
 58 |         '''
 59 |         decomposition = seasonal_decompose(self.train, freq=freq, two_sided=False)
 60 |         self.trend = decomposition.trend
 61 |         self.seasonal = decomposition.seasonal
 62 |         self.residual = decomposition.resid
 63 |         # decomposition.plot()
 64 |         # plt.show()
 65 | 
 66 |         d = self.residual.describe()
 67 |         delta = d['75%'] - d['25%']
 68 | 
 69 |         self.low_error, self.high_error = (d['25%'] - 1 * delta, d['75%'] + 1 * delta)
 70 | 
 71 |     def trend_model(self, order):
 72 |         '''
 73 |         为分解出来的趋势数据单独建模
 74 |         '''
 75 |         self.trend.dropna(inplace=True)
 76 |         self.trend_model = ARIMA(self.trend, order).fit(disp=-1, method='css')
 77 | 
 78 |         return self.trend_model
 79 | 
 80 |     def add_season(self):
 81 |         '''
 82 |         为预测出的趋势数据添加周期数据和残差数据
 83 |         '''
 84 |         self.train_season = self.seasonal
 85 |         values = []
 86 |         low_conf_values = []
 87 |         high_conf_values = []
 88 | 
 89 |         for i, t in enumerate(self.pred_time_index):
 90 |             trend_part = self.trend_pred[i]
 91 | 
 92 |             # 相同时间的数据均值
 93 |             season_part = self.train_season[
 94 |                 self.train_season.index.time == t.time()
 95 |                 ].mean()
 96 | 
 97 |             # 趋势+周期+误差界限
 98 |             predict = trend_part + season_part
 99 |             low_bound = trend_part + season_part + self.low_error
100 |             high_bound = trend_part + season_part + self.high_error
101 | 
102 |             values.append(predict)
103 |             low_conf_values.append(low_bound)
104 |             high_conf_values.append(high_bound)
105 | 
106 |         self.final_pred = pd.Series(values, index=self.pred_time_index, name='predict')
107 |         self.low_conf = pd.Series(low_conf_values, index=self.pred_time_index, name='low_conf')
108 |         self.high_conf = pd.Series(high_conf_values, index=self.pred_time_index, name='high_conf')
109 | 
110 |     def predict_new(self):
111 |         '''
112 |         预测新数据
113 |         '''
114 |         #续接train，生成长度为n的时间索引，赋给预测序列
115 |         n = self.test_size
116 |         self.pred_time_index= pd.date_range(start=self.train.index[-1], periods=n+1, freq='1min')[1:]
117 |         self.trend_pred= self.trend_model.forecast(n)[0]
118 | 
119 |         self.add_season()
120 | 
121 | 
def evaluate(filename):
    """Fit the decomposition model on ``filename`` and plot the forecast.

    The last 1440 points are held out as test data.  The upper panel shows
    the model's full series; the lower panel shows the prediction, the
    held-out data and the error band, titled with the RMSE between
    prediction and held-out data.
    """
    model = ModelDecomp(file=filename, test_size=1440)
    model.decomp(freq=1440)
    model.trend_model(order=(1, 1, 3))
    model.predict_new()
    predicted, actual = model.final_pred, model.test

    # Upper panel: the full series, titled with the file name stem.
    plt.subplot(211)
    plt.plot(model.ts)
    plt.title(filename.split('.')[0])

    # Lower panel: forecast against the held-out data and the error band.
    plt.subplot(212)
    curves = (
        (predicted, 'salmon', 'Predict'),
        (actual, 'steelblue', 'Original'),
        (model.low_conf, 'grey', 'low'),
        (model.high_conf, 'grey', 'high'),
    )
    for series, colour, tag in curves:
        series.plot(color=colour, label=tag)

    plt.legend(loc='best')
    squared_error = (predicted.values - actual.values) ** 2
    plt.title('RMSE: %.4f' % np.sqrt(sum(squared_error) / actual.size))
    plt.tight_layout()
    plt.show()
143 | 
144 | 
if __name__ == '__main__':
    # Script entry point: evaluate the bundled sample data set.
    evaluate('api_access_fix.csv')
148 | 


--------------------------------------------------------------------------------
/test_stationarity.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | '''
 3 | 一些时间序列的画图工具
 4 | '''
 5 | from statsmodels.tsa.stattools import adfuller
 6 | import pandas as pd
 7 | import seaborn as sns
 8 | import matplotlib.pyplot as plt
 9 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
10 | 
11 | 
12 | # 时间序列图
def draw_ts(timeseries):
    """Plot ``timeseries`` through its own ``plot`` method and show the figure."""
    timeseries.plot()
    plt.show()
16 | 
17 | 
18 | # 移动平均图
def draw_trend(timeseries, size):
    """Plot a series together with its rolling and exponentially weighted means.

    :param timeseries: the series to plot
    :param size: window length of the rolling mean, also used as the
        ``span`` of the exponentially weighted mean
    """
    plt.figure(facecolor='white')
    # Simple moving average over ``size`` points.
    rol_mean = timeseries.rolling(window=size).mean()
    # Exponentially weighted moving average.  ``pd.ewma`` no longer exists
    # in current pandas, so use the ``ewm`` accessor, consistent with the
    # ``rolling`` call above.
    rol_weighted_mean = timeseries.ewm(span=size).mean()

    timeseries.plot(color='blue', label='Original')
    rol_mean.plot(color='red', label='Rolling Mean')
    rol_weighted_mean.plot(color='black', label='Weighted Rolling Mean')
    plt.legend(loc='best')
    plt.title('Rolling Mean')
    plt.show()
32 | 
33 | 
34 | # 语义描述
def testStationarity(ts):
    """Run ``adfuller`` on ``ts``, then print and return a labelled summary.

    The first four entries of the result are labelled as test statistic,
    p-value, lags used and number of observations; each entry of the fifth
    element is appended as ``Critical Value (<key>)``.
    """
    adf_result = adfuller(ts)
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    for level, critical in adf_result[4].items():
        summary['Critical Value (%s)' % level] = critical
    print(summary)
    return summary
43 | 
44 | 
45 | # 自相关和偏相关图，默认阶数为31阶
def draw_acf_pacf(ts, lags=31):
    """Plot the autocorrelation and partial autocorrelation of ``ts``.

    :param ts: the series to analyse
    :param lags: number of lags shown in both plots (31 by default)
    """
    fig = plt.figure(facecolor='white')
    ax1 = fig.add_subplot(211)
    # ``lags`` must be passed by keyword: the second positional parameter
    # of ``plot_acf`` is ``ax``, which is already supplied here.
    plot_acf(ts, lags=lags, ax=ax1)
    ax2 = fig.add_subplot(212)
    # Honour the caller's ``lags`` instead of a hard-coded 31.
    plot_pacf(ts, lags=lags, ax=ax2)
    plt.show()
53 | 


--------------------------------------------------------------------------------