├── .gitignore ├── test.py ├── dataset.py ├── SMAPE.ipynb └── tools.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Project 2 | .idea 3 | .cache 4 | .ipynb_checkpoints 5 | 6 | # Python 7 | *.pyc 8 | 9 | # Dataset 10 | dataset 11 | 12 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from dataset2 import load_wiki_traffic_dataset 2 | 3 | 4 | def test_load_dataset(): 5 | train_path = '/dataset/web-traffic-forecast/train_1.csv' 6 | test_path = '/dataset/web-traffic-forecast/key_1.csv' 7 | train = load_wiki_traffic_dataset(train_path, test_path) 8 | 9 | assert train[train.isnull().any(axis=1)].shape[0] == 0 10 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import pandas as pd 5 | 6 | 7 | def load_wiki_traffic_dataset(train_path: str, test_path: str): 8 | if not os.path.exists(train_path): 9 | raise FileNotFoundError('Train dataset not found') 10 | if not os.path.exists(test_path): 11 | raise FileNotFoundError('Test dataset not found') 12 | 13 | train_dataset = pd.read_csv(train_path).fillna(0.) 14 | test_dataset = pd.read_csv(test_path) 15 | 16 | # Preprocessing Train Dataset 17 | # train_dataset.columns = list(train_dataset.columns[:1]) + list(range(1, len(train_dataset.columns[1:]) + 1)) 18 | 19 | train_regex = re.compile('(?P.+)_(?P[a-z]+)\.' 20 | '(?P[a-z]+)\.org_' 21 | '(?P\w*-?\w*)_(?P\w+)') 22 | train_dataset = pd.concat((train_dataset['Page'].str.extract(train_regex, expand=True), 23 | train_dataset.ix[:, train_dataset.columns != 'Page']), axis=1) 24 | 25 | # Preprocessing Test Dataset 26 | test_regex = re.compile('(?P.+)_(?P\d{4}-\d{2}-\d{2})') 27 | test_dataset = pd.concat([test_dataset['Page'].str.extract(test_regex, expand=True), test_dataset['Id']], axis=1) 28 | return train_dataset, test_dataset 29 | -------------------------------------------------------------------------------- /SMAPE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Populating the interactive namespace from numpy and matplotlib\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%pylab inline\n", 18 | "import numpy as np\n", 19 | "import pandas as pd" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Symmetric Mean Absolute Percentage Error\n", 27 | "\n", 28 | "$$ \\text{SMAPE} = \\frac{100\\%}{n} \\sum^n_{t=1} \n", 29 | "\\frac{\\left| \\hat{y}_t - y_t \\right|}{(\\left|y_t \\right| + \\left|\\hat{y}_t \\right|) /2} $$\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 40, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "smape(a, a): 0.0\n", 42 | "smape(a, b): 13.1774891775\n", 43 | "smape(a, c): 181.818181818\n", 44 | "smape(a, d): 277.813431224\n", 45 | "smape(a, e): 400.0\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "def smape(y_true, y_pred):\n", 51 | " denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.\n", 52 | " nominator = np.abs(y_pred - y_true)\n", 53 | " diff = nominator/denominator\n", 54 | " return 200 * np.mean(diff)\n", 55 | "\n", 56 | "a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\n", 57 | "b = np.array([1.2, 2, 3, 4, 5.5, 6, 7, 8, 11, 12])\n", 58 | "c = np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])\n", 59 | "d = np.array([100, 0, 20, 10, 1, 2, 3, 5, 0, -1])\n", 60 | "e = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])\n", 61 | "\n", 62 | "print('smape(a, a):', smape(a, a))\n", 63 | "print('smape(a, b):', smape(a, b))\n", 64 | "print('smape(a, c):', smape(a, c))\n", 65 | "print('smape(a, d):', smape(a, d))\n", 66 | "print('smape(a, e):', smape(a, e))" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.6.1" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import queue 4 | import urllib.request 5 | import zipfile 6 | from io import BytesIO, StringIO 7 | from typing import Union 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import pylab 12 | from sklearn.metrics import r2_score 13 | 14 | logger = logging.getLogger('hybrid-lstm.tool') 15 | logger.setLevel(logging.DEBUG) 16 | 17 | formatter = logging.Formatter('%(asctime)s [%(name)s | %(levelname)s] %(message)s') 18 | 19 | _cl = logging.StreamHandler() 20 | _cl.setLevel(logging.DEBUG) 21 | _cl.setFormatter(formatter) 22 | logger.addHandler(_cl) 23 | 24 | 25 | def preprocess(data: pd.DataFrame): 26 | # Lower column names 27 | COLUMNS = ['date', 'time', 'active_power', 'reactive_power', 'voltage', 'intensity', 'sub1', 'sub2', 'sub3'] 28 | # data.columns = map(str.lower, data.columns) 29 | data.columns = COLUMNS 30 | 31 | # Datetime Index (it takes a while) 32 | data['datetime'] = pd.to_datetime(data['date'] + ' ' + data['time']) 33 | data.set_index('datetime', inplace=True) 34 | del data['date'] 35 | del data['time'] 36 | 37 | # Diff: 그 다음 데이터와 시간적 차이 (초단위) 38 | # 예를 들어서 현재 00시 00분 이고, 다음이 00시 5분이라면 5분이라는 차이가 생기고, 39 | # 5 * 60 = 300초 시간만큼 00분 row에 넣는다. 40 | # data['diff_next'] = pd.to_datetime(data.index) 41 | # data['diff_next'] = data['diff_next'].diff(1).dt.total_seconds().shift(-1) 42 | 43 | # Filter only numeric Data 44 | data = data[data.applymap(np.isreal)].dropna() 45 | 46 | return data 47 | 48 | 49 | def augment(data): 50 | n = len(data) 51 | shape = data.shape[1:] 52 | 53 | # Create Augmented Dataset 54 | aug = np.zeros((n * 3, *shape), dtype='float32') 55 | aug[:n] = data 56 | 57 | # Plus 58 | aug[n:n * 2] += 3 # np.std(data, axis=0) 59 | 60 | # Minus 61 | aug[n * 2:n * 3] -= 2 # np.std(data, axis=0) 62 | 63 | return aug 64 | 65 | 66 | def load_household_power_consumption(dest='dataset', hour_one_hot=True): 67 | """ 68 | https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption 69 | 70 | 1.date: Date in format dd/mm/yyyy 71 | 2.time: time in format hh:mm:ss 72 | 3.global_active_power: household global minute-averaged active power (in kilowatt) 73 | 4.global_reactive_power: household global minute-averaged reactive power (in kilowatt) 74 | 5.voltage: minute-averaged voltage (in volt) 75 | 6.global_intensity: household global minute-averaged current intensity (in ampere) 76 | 7.sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered). 77 | 8.sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light. 78 | 9.sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner. 79 | """ 80 | URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip' 81 | ZIP_FILE_NAME = 'household_power_consumption.txt' 82 | ORIGIN_PATH = os.path.join('dataset', 'household_power_consumption_original.csv') 83 | CSV_PATH = os.path.join('dataset', 'household_power_consumption.csv') 84 | 85 | ################################## 86 | # Check existing file 87 | ################################## 88 | if not os.path.exists(dest): 89 | os.mkdir(dest) 90 | 91 | ################################## 92 | # Download and Unzip file 93 | ################################## 94 | if not os.path.exists(ORIGIN_PATH) and not os.path.exists(CSV_PATH): 95 | logger.info('Started downloading dataset. It may take several minutes.') 96 | with urllib.request.urlopen(URL) as res: 97 | f = BytesIO(res.read()) 98 | zip_ref = zipfile.ZipFile(f) 99 | data_txt = zip_ref.read(ZIP_FILE_NAME).decode('utf-8') 100 | zip_ref.close() 101 | data = pd.read_csv(StringIO(data_txt), sep=';') 102 | data.to_csv(ORIGIN_PATH) 103 | 104 | if os.path.exists(ORIGIN_PATH) and not os.path.exists(CSV_PATH): 105 | logger.info('Preprocessing...') 106 | data = pd.read_csv(ORIGIN_PATH, index_col=0) 107 | data = preprocess(data) 108 | logger.info(f'Saved the dataset in "{CSV_PATH}"') 109 | data.to_csv(CSV_PATH) 110 | else: 111 | logger.info('Load existing dataset') 112 | data = pd.read_csv(CSV_PATH, index_col=0) 113 | data.index = pd.to_datetime(data.index) 114 | 115 | if hour_one_hot: 116 | # Add Hour one-hot vector 117 | data['hour'] = data.index.hour 118 | data = pd.get_dummies(data, columns=['hour'], prefix='h') 119 | 120 | dataset = data[['active_power', 'reactive_power', 'voltage', 'intensity', 'sub1', 121 | 'sub2', 'sub3', 'h_0', 'h_1', 'h_2', 'h_3', 'h_4', 'h_5', 122 | 'h_6', 'h_7', 'h_8', 'h_9', 'h_10', 'h_11', 'h_12', 'h_13', 'h_14', 123 | 'h_15', 'h_16', 'h_17', 'h_18', 'h_19', 'h_20', 'h_21', 'h_22', 'h_23']] 124 | return dataset # , data[['diff_next']].as_matrix() 125 | 126 | 127 | def calculate_datetime_diffs(dataset): 128 | times = pd.Series(pd.to_datetime(dataset.index)) 129 | diffs = times.diff(1).dt.total_seconds().shift(-1) 130 | return diffs.as_matrix().reshape(-1, 1) 131 | 132 | 133 | def to_timeseries(data, diffs, t=30): 134 | if isinstance(data, pd.DataFrame): 135 | data = data.as_matrix() 136 | 137 | deque = queue.deque(maxlen=t) 138 | timeseries = list() 139 | for i in range(len(data)): 140 | diff = diffs[i] 141 | if diff >= 120: 142 | deque.clear() 143 | 144 | deque.append(data[i]) 145 | if len(deque) == t: 146 | timeseries.append(deque.copy()) 147 | 148 | return np.array(timeseries, dtype=np.float64) 149 | 150 | 151 | def split_x_y(dataset, seq_n): 152 | if isinstance(dataset, pd.DataFrame): 153 | dataset = dataset.as_matrix() 154 | 155 | x = dataset[:-seq_n] 156 | y = dataset[seq_n:, 0].reshape(-1, 1) 157 | return x, y 158 | 159 | 160 | def split_train_test(data_x, data_y, train_ratio=0.8): 161 | n = len(data_x) 162 | train_n = int(n * train_ratio) 163 | 164 | train_x, test_x = data_x[:train_n], data_x[train_n:] 165 | train_y, test_y = data_y[:train_n], data_y[train_n:] 166 | return train_x, train_y, test_x, test_y 167 | 168 | 169 | def get_task2(y): 170 | n = y.shape[0] 171 | data = np.zeros((n, 2)) 172 | 173 | for i in range(n): 174 | data[i] = (y[i].min(), y[i].max()) 175 | 176 | return data 177 | 178 | 179 | def vis_evaluate(model, test_x, test_y, batch=32): 180 | n = len(test_x) 181 | seq_n = test_x.shape[1] 182 | 183 | fig, plots = pylab.subplots(4, 4) 184 | plots = plots.reshape(-1) 185 | 186 | fig.set_figwidth(12) 187 | fig.set_figheight(7) 188 | 189 | for i, p in enumerate(plots): 190 | idx = np.random.randint(0, n) 191 | input_y = test_x[idx, :, 0] 192 | x1 = np.arange(seq_n) 193 | x2 = np.arange(seq_n, seq_n * 2) 194 | 195 | true_y = test_y[idx] 196 | pred_y = model.predict(test_x[idx:idx + 1], batch_size=batch) 197 | 198 | score = r2_score(true_y.reshape(-1), pred_y.reshape(-1)) 199 | print(f'[{idx:<4}] r^2: {score:<12.4}') 200 | p.plot(x1, input_y, color='#555555') 201 | p.plot(x2, true_y, color='blue', label='true') 202 | p.plot(x2, pred_y[0], color='red', label='pred') 203 | if i == 0: 204 | p.legend() 205 | 206 | 207 | if __name__ == '__main__': 208 | data = load_household_power_consumption() 209 | print([(c, data[c].dtype) for c in data.columns]) 210 | --------------------------------------------------------------------------------