├── .gitignore
├── test.py
├── dataset.py
├── SMAPE.ipynb
└── tools.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Project
 2 | .idea
 3 | .cache
 4 | .ipynb_checkpoints
 5 | 
 6 | # Python
 7 | *.pyc
 8 | 
 9 | # Dataset
10 | dataset
11 | 
12 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | from dataset2 import load_wiki_traffic_dataset
 2 | 
 3 | 
 4 | def test_load_dataset():
 5 |     train_path = '/dataset/web-traffic-forecast/train_1.csv'
 6 |     test_path = '/dataset/web-traffic-forecast/key_1.csv'
 7 |     train = load_wiki_traffic_dataset(train_path, test_path)
 8 | 
 9 |     assert train[train.isnull().any(axis=1)].shape[0] == 0
10 | 


--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | 
 4 | import pandas as pd
 5 | 
 6 | 
 7 | def load_wiki_traffic_dataset(train_path: str, test_path: str):
 8 |     if not os.path.exists(train_path):
 9 |         raise FileNotFoundError('Train dataset not found')
10 |     if not os.path.exists(test_path):
11 |         raise FileNotFoundError('Test dataset not found')
12 | 
13 |     train_dataset = pd.read_csv(train_path).fillna(0.)
14 |     test_dataset = pd.read_csv(test_path)
15 | 
16 |     # Preprocessing Train Dataset
17 |     # train_dataset.columns = list(train_dataset.columns[:1]) + list(range(1, len(train_dataset.columns[1:]) + 1))
18 | 
19 |     train_regex = re.compile('(?P<page>.+)_(?P<country>[a-z]+)\.'
20 |                              '(?P<project>[a-z]+)\.org_'
21 |                              '(?P<access>\w*-?\w*)_(?P<agent>\w+)')
22 |     train_dataset = pd.concat((train_dataset['Page'].str.extract(train_regex, expand=True),
23 |                                train_dataset.ix[:, train_dataset.columns != 'Page']), axis=1)
24 | 
25 |     # Preprocessing Test Dataset
26 |     test_regex = re.compile('(?P<name>.+)_(?P<date>\d{4}-\d{2}-\d{2})')
27 |     test_dataset = pd.concat([test_dataset['Page'].str.extract(test_regex, expand=True), test_dataset['Id']], axis=1)
28 |     return train_dataset, test_dataset
29 | 


--------------------------------------------------------------------------------
/SMAPE.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Populating the interactive namespace from numpy and matplotlib\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "%pylab inline\n",
18 |     "import numpy as np\n",
19 |     "import pandas as pd"
20 |    ]
21 |   },
22 |   {
23 |    "cell_type": "markdown",
24 |    "metadata": {},
25 |    "source": [
26 |     "# Symmetric Mean Absolute Percentage Error\n",
27 |     "\n",
28 |     "$$ \\text{SMAPE} =  \\frac{100\\%}{n} \\sum^n_{t=1} \n",
29 |     "\\frac{\\left| \\hat{y}_t - y_t \\right|}{(\\left|y_t \\right| + \\left|\\hat{y}_t \\right|) /2} $$\n"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": 40,
35 |    "metadata": {},
36 |    "outputs": [
37 |     {
38 |      "name": "stdout",
39 |      "output_type": "stream",
40 |      "text": [
41 |       "smape(a, a): 0.0\n",
42 |       "smape(a, b): 13.1774891775\n",
43 |       "smape(a, c): 181.818181818\n",
44 |       "smape(a, d): 277.813431224\n",
45 |       "smape(a, e): 400.0\n"
46 |      ]
47 |     }
48 |    ],
49 |    "source": [
50 |     "def smape(y_true, y_pred):\n",
51 |     "    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.\n",
52 |     "    nominator = np.abs(y_pred - y_true)\n",
53 |     "    diff = nominator/denominator\n",
54 |     "    return 200 * np.mean(diff)\n",
55 |     "\n",
56 |     "a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\n",
57 |     "b = np.array([1.2, 2, 3, 4, 5.5, 6, 7, 8, 11, 12])\n",
58 |     "c = np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])\n",
59 |     "d = np.array([100, 0, 20, 10, 1, 2, 3, 5, 0, -1])\n",
60 |     "e = np.array([0, 0, 0, 0,  0, 0, 0, 0, 0, 0])\n",
61 |     "\n",
62 |     "print('smape(a, a):', smape(a, a))\n",
63 |     "print('smape(a, b):', smape(a, b))\n",
64 |     "print('smape(a, c):', smape(a, c))\n",
65 |     "print('smape(a, d):', smape(a, d))\n",
66 |     "print('smape(a, e):', smape(a, e))"
67 |    ]
68 |   }
69 |  ],
70 |  "metadata": {
71 |   "kernelspec": {
72 |    "display_name": "Python 3",
73 |    "language": "python",
74 |    "name": "python3"
75 |   },
76 |   "language_info": {
77 |    "codemirror_mode": {
78 |     "name": "ipython",
79 |     "version": 3
80 |    },
81 |    "file_extension": ".py",
82 |    "mimetype": "text/x-python",
83 |    "name": "python",
84 |    "nbconvert_exporter": "python",
85 |    "pygments_lexer": "ipython3",
86 |    "version": "3.6.1"
87 |   }
88 |  },
89 |  "nbformat": 4,
90 |  "nbformat_minor": 2
91 | }
92 | 


--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import queue
  4 | import urllib.request
  5 | import zipfile
  6 | from io import BytesIO, StringIO
  7 | from typing import Union
  8 | 
  9 | import numpy as np
 10 | import pandas as pd
 11 | import pylab
 12 | from sklearn.metrics import r2_score
 13 | 
 14 | logger = logging.getLogger('hybrid-lstm.tool')
 15 | logger.setLevel(logging.DEBUG)
 16 | 
 17 | formatter = logging.Formatter('%(asctime)s [%(name)s | %(levelname)s] %(message)s')
 18 | 
 19 | _cl = logging.StreamHandler()
 20 | _cl.setLevel(logging.DEBUG)
 21 | _cl.setFormatter(formatter)
 22 | logger.addHandler(_cl)
 23 | 
 24 | 
 25 | def preprocess(data: pd.DataFrame):
 26 |     # Lower column names
 27 |     COLUMNS = ['date', 'time', 'active_power', 'reactive_power', 'voltage', 'intensity', 'sub1', 'sub2', 'sub3']
 28 |     # data.columns = map(str.lower, data.columns)
 29 |     data.columns = COLUMNS
 30 | 
 31 |     # Datetime Index (it takes a while)
 32 |     data['datetime'] = pd.to_datetime(data['date'] + ' ' + data['time'])
 33 |     data.set_index('datetime', inplace=True)
 34 |     del data['date']
 35 |     del data['time']
 36 | 
 37 |     # Diff: 그 다음 데이터와 시간적 차이 (초단위)
 38 |     # 예를 들어서 현재 00시 00분 이고, 다음이 00시 5분이라면 5분이라는 차이가 생기고,
 39 |     # 5 * 60 = 300초 시간만큼 00분 row에 넣는다.
 40 |     # data['diff_next'] = pd.to_datetime(data.index)
 41 |     # data['diff_next'] = data['diff_next'].diff(1).dt.total_seconds().shift(-1)
 42 | 
 43 |     # Filter only numeric Data
 44 |     data = data[data.applymap(np.isreal)].dropna()
 45 | 
 46 |     return data
 47 | 
 48 | 
 49 | def augment(data):
 50 |     n = len(data)
 51 |     shape = data.shape[1:]
 52 | 
 53 |     # Create Augmented Dataset
 54 |     aug = np.zeros((n * 3, *shape), dtype='float32')
 55 |     aug[:n] = data
 56 | 
 57 |     # Plus
 58 |     aug[n:n * 2] += 3  # np.std(data, axis=0)
 59 | 
 60 |     # Minus
 61 |     aug[n * 2:n * 3] -= 2  # np.std(data, axis=0)
 62 | 
 63 |     return aug
 64 | 
 65 | 
 66 | def load_household_power_consumption(dest='dataset', hour_one_hot=True):
 67 |     """
 68 |     https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption
 69 | 
 70 |     1.date: Date in format dd/mm/yyyy
 71 |     2.time: time in format hh:mm:ss
 72 |     3.global_active_power: household global minute-averaged active power (in kilowatt)
 73 |     4.global_reactive_power: household global minute-averaged reactive power (in kilowatt)
 74 |     5.voltage: minute-averaged voltage (in volt)
 75 |     6.global_intensity: household global minute-averaged current intensity (in ampere)
 76 |     7.sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
 77 |     8.sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
 78 |     9.sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.
 79 |     """
 80 |     URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip'
 81 |     ZIP_FILE_NAME = 'household_power_consumption.txt'
 82 |     ORIGIN_PATH = os.path.join('dataset', 'household_power_consumption_original.csv')
 83 |     CSV_PATH = os.path.join('dataset', 'household_power_consumption.csv')
 84 | 
 85 |     ##################################
 86 |     # Check existing file
 87 |     ##################################
 88 |     if not os.path.exists(dest):
 89 |         os.mkdir(dest)
 90 | 
 91 |     ##################################
 92 |     # Download and Unzip file
 93 |     ##################################
 94 |     if not os.path.exists(ORIGIN_PATH) and not os.path.exists(CSV_PATH):
 95 |         logger.info('Started downloading dataset. It may take several minutes.')
 96 |         with urllib.request.urlopen(URL) as res:
 97 |             f = BytesIO(res.read())
 98 |             zip_ref = zipfile.ZipFile(f)
 99 |             data_txt = zip_ref.read(ZIP_FILE_NAME).decode('utf-8')
100 |             zip_ref.close()
101 |         data = pd.read_csv(StringIO(data_txt), sep=';')
102 |         data.to_csv(ORIGIN_PATH)
103 | 
104 |     if os.path.exists(ORIGIN_PATH) and not os.path.exists(CSV_PATH):
105 |         logger.info('Preprocessing...')
106 |         data = pd.read_csv(ORIGIN_PATH, index_col=0)
107 |         data = preprocess(data)
108 |         logger.info(f'Saved the dataset in "{CSV_PATH}"')
109 |         data.to_csv(CSV_PATH)
110 |     else:
111 |         logger.info('Load existing dataset')
112 |         data = pd.read_csv(CSV_PATH, index_col=0)
113 |         data.index = pd.to_datetime(data.index)
114 | 
115 |     if hour_one_hot:
116 |         # Add Hour one-hot vector
117 |         data['hour'] = data.index.hour
118 |         data = pd.get_dummies(data, columns=['hour'], prefix='h')
119 | 
120 |     dataset = data[['active_power', 'reactive_power', 'voltage', 'intensity', 'sub1',
121 |                     'sub2', 'sub3', 'h_0', 'h_1', 'h_2', 'h_3', 'h_4', 'h_5',
122 |                     'h_6', 'h_7', 'h_8', 'h_9', 'h_10', 'h_11', 'h_12', 'h_13', 'h_14',
123 |                     'h_15', 'h_16', 'h_17', 'h_18', 'h_19', 'h_20', 'h_21', 'h_22', 'h_23']]
124 |     return dataset  # , data[['diff_next']].as_matrix()
125 | 
126 | 
127 | def calculate_datetime_diffs(dataset):
128 |     times = pd.Series(pd.to_datetime(dataset.index))
129 |     diffs = times.diff(1).dt.total_seconds().shift(-1)
130 |     return diffs.as_matrix().reshape(-1, 1)
131 | 
132 | 
133 | def to_timeseries(data, diffs, t=30):
134 |     if isinstance(data, pd.DataFrame):
135 |         data = data.as_matrix()
136 | 
137 |     deque = queue.deque(maxlen=t)
138 |     timeseries = list()
139 |     for i in range(len(data)):
140 |         diff = diffs[i]
141 |         if diff >= 120:
142 |             deque.clear()
143 | 
144 |         deque.append(data[i])
145 |         if len(deque) == t:
146 |             timeseries.append(deque.copy())
147 | 
148 |     return np.array(timeseries, dtype=np.float64)
149 | 
150 | 
151 | def split_x_y(dataset, seq_n):
152 |     if isinstance(dataset, pd.DataFrame):
153 |         dataset = dataset.as_matrix()
154 | 
155 |     x = dataset[:-seq_n]
156 |     y = dataset[seq_n:, 0].reshape(-1, 1)
157 |     return x, y
158 | 
159 | 
160 | def split_train_test(data_x, data_y, train_ratio=0.8):
161 |     n = len(data_x)
162 |     train_n = int(n * train_ratio)
163 | 
164 |     train_x, test_x = data_x[:train_n], data_x[train_n:]
165 |     train_y, test_y = data_y[:train_n], data_y[train_n:]
166 |     return train_x, train_y, test_x, test_y
167 | 
168 | 
169 | def get_task2(y):
170 |     n = y.shape[0]
171 |     data = np.zeros((n, 2))
172 | 
173 |     for i in range(n):
174 |         data[i] = (y[i].min(), y[i].max())
175 | 
176 |     return data
177 | 
178 | 
179 | def vis_evaluate(model, test_x, test_y, batch=32):
180 |     n = len(test_x)
181 |     seq_n = test_x.shape[1]
182 | 
183 |     fig, plots = pylab.subplots(4, 4)
184 |     plots = plots.reshape(-1)
185 | 
186 |     fig.set_figwidth(12)
187 |     fig.set_figheight(7)
188 | 
189 |     for i, p in enumerate(plots):
190 |         idx = np.random.randint(0, n)
191 |         input_y = test_x[idx, :, 0]
192 |         x1 = np.arange(seq_n)
193 |         x2 = np.arange(seq_n, seq_n * 2)
194 | 
195 |         true_y = test_y[idx]
196 |         pred_y = model.predict(test_x[idx:idx + 1], batch_size=batch)
197 | 
198 |         score = r2_score(true_y.reshape(-1), pred_y.reshape(-1))
199 |         print(f'[{idx:<4}] r^2: {score:<12.4}')
200 |         p.plot(x1, input_y, color='#555555')
201 |         p.plot(x2, true_y, color='blue', label='true')
202 |         p.plot(x2, pred_y[0], color='red', label='pred')
203 |         if i == 0:
204 |             p.legend()
205 | 
206 | 
207 | if __name__ == '__main__':
208 |     data = load_household_power_consumption()
209 |     print([(c, data[c].dtype) for c in data.columns])
210 | 


--------------------------------------------------------------------------------