├── kernel
│   ├── __init__.py
│   ├── data_import.py
│   └── model.py
├── data
│   └── rebar_mins.csv
├── LICENSE
├── README.md
├── .gitignore
└── demo.ipynb

/kernel/__init__.py:
--------------------------------------------------------------------------------
from .data_import import load_min_data, prepare_data
from .model import NetModel
--------------------------------------------------------------------------------
/data/rebar_mins.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shannonycj/convolutional-autoencoder-trading/HEAD/data/rebar_mins.csv
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Yang Chenjie

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Applying a Convolutional Auto-Encoder in Trading

In this project, I try to build a model that extracts useful patterns from financial time series in order to predict the direction of future price movements. Traditional multivariate time-series models (even some [modern approaches such as LSTMs](https://www.researchgate.net/publication/327967988_Predicting_Stock_Prices_Using_LSTM)) tend to look at and extract information from each input feature independently, which ignores potential correlations between inputs. For example, looking at historical volume and adjusted close prices jointly could provide new information. As such, people have been exploring [CNNs that learn spatial patterns](https://arxiv.org/pdf/1703.04691.pdf).

It is well known that the information-to-noise ratio of financial time series is generally low. Here we try a novel approach, the Convolutional Auto-Encoder (CAE), which has proved [successful in computer vision](https://xifengguo.github.io/papers/ICONIP17-DCEC.pdf).

This repo contains a dataset from the commodity futures market: three years of 5-minute open, high, low, close, volume, and open-interest bars. We use the first two years as training/validation/test sets to build the model, and the final year to backtest the strategy.
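A minimal end-to-end run looks like this (a sketch: the window length `n` and `step` below are placeholder values, and with the default pooling windows `n` should be a multiple of 1000 so the decoder reproduces the input shape):

```python
from kernel import load_min_data, prepare_data, NetModel

# Split the raw 5-minute bars into a modelling period and a backtest period.
df_train, df_backtest = load_min_data(train_end='2018-05-13 00:00')

# Slice the series into n-bar windows ("images"), advancing `step` bars each time.
X_train, X_test, y_train, y_test = prepare_data(df_train, n=1000, step=50)

model = NetModel(X_train, X_test, y_train, y_test)
model.build_net()                    # convolutional auto-encoder
model.train_encoder(n_epochs=50)     # unsupervised reconstruction training
model.get_encoded_series()           # latent features + reconstruction losses
model.train_classifier(model='xgb')  # classify the direction of the next move
```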

Our CAE and the other utilities live in the `kernel` folder, and `demo.ipynb` demonstrates the experiment results.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
.ipynb_checkpoints
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/kernel/data_import.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 13 10:16:39 2019

@author: chenjieyang
"""
import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def load_min_data(train_end='2018-05-13 00:00'):
    """Load the 5-minute bars and split them at `train_end`."""
    df = pd.read_csv('data/rebar_mins.csv', header=2)
    df['time'] = [datetime.datetime.strptime(t, "%Y-%m-%d %H:%M") for t in df.time]
    df.set_index('time', inplace=True)
    train_end = datetime.datetime.strptime(train_end, "%Y-%m-%d %H:%M")
    df_train = df.loc[df.index < train_end]
    df_test = df.loc[df.index >= train_end]
    return df_train, df_test


def get_idx(i, n, step):
    """Index bounds of the i-th window: n input rows followed by `step` target rows."""
    x_start = i * step
    x_end = x_start + n
    y_start = x_end
    y_end = y_start + step
    return x_start, x_end, y_start, y_end


def prepare_data(df, n, step, test_size=0.3):
    """Convert the bar series into (n x n_features) 'images' X and direction labels y."""
    # Per-bar relative changes; volume uses log-differences instead of pct_change.
    delta = df.drop('volume', axis=1).pct_change()
    delta['volume'] = np.log(df.volume) - np.log(df.volume.shift(1))
    delta = delta.dropna(how='all')
    df = df.iloc[1:, :]  # keep df aligned with delta after dropping the first row
    nrows = delta.shape[0]
    i = 0
    X = []
    y = []
    while True:
        x_start, x_end, y_start, y_end = get_idx(i, n, step)
        if y_end > nrows - 1:
            break
        x = delta.iloc[x_start:x_end, :].values
        # Rescale each window to [0, 255] so it resembles a grey-scale image.
        x = MinMaxScaler().fit_transform(x) * 255
        X.append(x.astype('int'))
        # Target: relative close-to-close move over the following `step` bars.
        y.append((df.iloc[y_end, :].close - df.iloc[y_start, :].close) / df.iloc[y_start, :].close)
        i += 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    # Add a trailing channel axis and binarise the labels (1.0 = up, 0.0 = down).
    X_train, X_test = np.expand_dims(X_train, -1), np.expand_dims(X_test, -1)
    y_train, y_test = np.array(y_train) >= 0, np.array(y_test) >= 0
    return X_train, X_test, y_train * 1.0, y_test * 1.0
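
if __name__ == '__main__':
    # Illustrative smoke test (not part of the original module): assumes
    # 'data/rebar_mins.csv' exists relative to the working directory; the
    # window length n and the step are placeholder values.
    df_train, df_backtest = load_min_data()
    X_train, X_test, y_train, y_test = prepare_data(df_train, n=1000, step=50)
    print(X_train.shape, X_test.shape, y_train.mean())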
--------------------------------------------------------------------------------
/kernel/model.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.models import Model


class NetModel:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.dims = X_train.shape

    def build_net(self, conv_window=(6, 3), pooling_window=(10, 1), n_filters=(64, 32, 16)):
        # Encoder: three conv + max-pooling stages compress the input "image"
        # into a low-dimensional representation.
        input_img = Input(shape=self.dims[1:])  # adapt this if using `channels_first` image data format
        print("shape of input", K.int_shape(input_img))
        conv_1 = Conv2D(n_filters[0], conv_window, activation='relu', padding='same')(input_img)
        print("shape after first conv", K.int_shape(conv_1))
        pool_1 = MaxPooling2D(pooling_window, padding='same')(conv_1)
        print("shape after first pooling", K.int_shape(pool_1))
        conv_2 = Conv2D(n_filters[1], conv_window, activation='relu', padding='same')(pool_1)
        print("shape after second conv", K.int_shape(conv_2))
        pool_2 = MaxPooling2D(pooling_window, padding='same')(conv_2)
        print("shape after second pooling", K.int_shape(pool_2))
        conv_3 = Conv2D(n_filters[2], conv_window, activation='relu', padding='same')(pool_2)
        print("shape after third conv", K.int_shape(conv_3))
        encoded = MaxPooling2D(pooling_window, padding='same')(conv_3)
        print("shape of encoded", K.int_shape(encoded))

        # Decoder: mirror of the encoder, upsampling back to the input shape.
        up_3 = UpSampling2D(pooling_window)(encoded)
        print("shape after upsample third pooling", K.int_shape(up_3))
        conv_neg_3 = Conv2D(n_filters[2], conv_window, activation='relu', padding='same')(up_3)
        print("shape after decode third conv", K.int_shape(conv_neg_3))
        up_2 = UpSampling2D(pooling_window)(conv_neg_3)
        print("shape after upsample second pooling", K.int_shape(up_2))
        conv_neg_2 = Conv2D(n_filters[1], conv_window, activation='relu', padding='same')(up_2)
        print("shape after decode second conv", K.int_shape(conv_neg_2))
        up_1 = UpSampling2D(pooling_window)(conv_neg_2)
        print("shape after upsample first pooling", K.int_shape(up_1))
        conv_neg_1 = Conv2D(n_filters[0], conv_window, activation='relu', padding='same')(up_1)
        print("shape after decode first conv", K.int_shape(conv_neg_1))
        decoded = Conv2D(1, conv_window, activation='linear', padding='same')(conv_neg_1)
        print("shape after decode to input", K.int_shape(decoded))

        self.autoencoder = Model(input_img, decoded)
        self.autoencoder.compile(optimizer='adam', loss='mean_squared_error')
        # Layer 6 is the bottleneck max-pooling layer (`encoded`) above.
        self.encoder_model = Model(self.autoencoder.input, self.autoencoder.layers[6].output)

    def train_encoder(self, n_epochs=100, batch_size=64):
        # Unsupervised training: the autoencoder learns to reconstruct its input.
        self.autoencoder.fit(self.X_train, self.X_train, epochs=n_epochs,
                             batch_size=batch_size, shuffle=True)

    def get_encoded_series(self):
        self.reconstructed_train = self.autoencoder.predict(self.X_train)
        self.reconstructed_test = self.autoencoder.predict(self.X_test)
        self.lf_train = self.flatten_arr(self.encoder_model.predict(self.X_train))
        self.lf_test = self.flatten_arr(self.encoder_model.predict(self.X_test))
        self.train_features = self.merge_features(self.reconstructed_train, self.X_train, self.lf_train)
        self.test_features = self.merge_features(self.reconstructed_test, self.X_test, self.lf_test)

    @staticmethod
    def merge_features(X_, X, lf):
        # One row per sample: the flattened latent features plus the
        # per-sample reconstruction loss.
        recon_loss = [mean_squared_error(X_[i][:, :, 0], X[i][:, :, 0]) for i in range(len(X))]
        keys = [f'feature_{i}' for i in range(lf.shape[1])]
        vals = lf.T
        df = pd.DataFrame(dict(list(zip(keys, vals))))
        df['recon_loss'] = recon_loss
        return df

    @staticmethod
    def flatten_arr(arr):
        flat = []
        for a in arr:
            flat.append(a.reshape(-1,))
        return np.array(flat)

    def train_classifier(self, model='xgb', n_search=10):
        # Randomised hyper-parameter search over a random forest or an XGBoost
        # classifier, fit on the encoded features.
        if model == 'rf':
            from sklearn.ensemble import RandomForestClassifier
            param_grid = {"max_depth": [10, 20, 40, None],
                          "max_features": sp_randint(1, 20),
                          "min_samples_split": sp_randint(5, 50),
                          "min_samples_leaf": sp_randint(5, 50),
                          "bootstrap": [True, False],
                          "criterion": ["gini", "entropy"]}
            clf = RandomForestClassifier(verbose=0, n_estimators=100)
        elif model == 'xgb':
            import xgboost as xgb
            param_grid = {'silent': [True],
                          'max_depth': [5, 10, 20],
                          'learning_rate': [0.001, 0.01],
                          'subsample': [0.2, 0.3, 0.5, 0.6, 0.9, 1.0],
                          'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                          'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                          'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
                          'gamma': [0, 0.25, 0.5, 1.0],
                          'reg_lambda': [0.1, 1.0, 50.0, 100.0, 200.0],
                          'n_estimators': [100]}
            clf = xgb.XGBClassifier()

        clf_grid = RandomizedSearchCV(clf, param_distributions=param_grid,
                                      n_iter=n_search, cv=3, iid=False)
        clf_grid.fit(self.train_features.values, self.y_train)
        self.train_acc = clf_grid.score(self.train_features.values, self.y_train)
        print(f'training acc: {self.train_acc}')

        y_pred = clf_grid.predict(self.test_features.values)
        print(classification_report(self.y_test, y_pred))

        self.clf = clf_grid.best_estimator_
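
if __name__ == '__main__':
    # Illustrative end-to-end sketch (not part of the original module). It
    # assumes the repo root is the working directory; n=1000 keeps the decoder
    # output shape equal to the input under the default (10, 1) pooling windows.
    from kernel.data_import import load_min_data, prepare_data

    df_train, df_backtest = load_min_data()
    X_train, X_test, y_train, y_test = prepare_data(df_train, n=1000, step=50)
    net = NetModel(X_train, X_test, y_train, y_test)
    net.build_net()
    net.train_encoder(n_epochs=10)
    net.get_encoded_series()
    net.train_classifier(model='xgb', n_search=10)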
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from kernel import load_min_data, prepare_data, NetModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                     high  open  low  close  volume       oi\n",
       "time                                                         \n",
       "2016-05-13 14:00:00   645   632  632    645  148052  3570070\n",
       "2016-05-13 14:05:00   647   644  639    640  202724  3586682\n",
       "2016-05-13 14:10:00   643   640  636    641   94164  3578016\n",
       "2016-05-13 14:15:00   655   640  634    651  333452  3612632\n",
       "2016-05-13 14:20:00   657   652  639    641  300840  3585022"
      ]