├── .gitignore
├── LICENSE
├── README.md
├── data
│   └── rebar_mins.csv
├── demo.ipynb
└── kernel
    ├── __init__.py
    ├── data_import.py
    └── model.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .ipynb_checkpoints
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Yang Chenjie
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Applying Convolutional Auto-Encoder in Trading
2 |
3 | In this project, I try to build a model that extracts useful patterns from financial time series for predicting the direction of future price movements. Traditional multivariate time-series models (even some [modern approaches like LSTM](https://www.researchgate.net/publication/327967988_Predicting_Stock_Prices_Using_LSTM)) tend to look at and extract information from each input feature independently, which ignores potential correlations between inputs. For example, looking at historical volume and adjusted close prices jointly could provide new information. As such, people have been exploring using [CNNs to learn spatial patterns](https://arxiv.org/pdf/1703.04691.pdf).
4 |
5 | It is well known that the information-to-noise ratio of financial time series is generally low. Here we try a novel approach, the Convolutional Auto-Encoder (CAE), which has proved [successful in computer vision](https://xifengguo.github.io/papers/ICONIP17-DCEC.pdf).
6 |
7 | This repo contains a set of data points from the commodity-trading market: 3 years of 5-minute open, high, low, close, volume, and open-interest bars. We use the first two years as training/validation/test sets to build the model, and the final year to backtest our strategy.
8 |
9 | Our CAE and other utilities live in the `kernel` folder, and `demo.ipynb` demonstrates the experiment results.
10 |
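11 | A minimal end-to-end sketch of the workflow (parameter values are illustrative, mirroring `demo.ipynb`):
12 | 
13 | ```python
14 | from kernel import load_min_data, prepare_data, NetModel
15 | 
16 | df_train, df_test = load_min_data()  # splits train/test at 2018-05-13 by default
17 | X_train, X_valid, y_train, y_valid = prepare_data(df_train, n=1000, step=3)
18 | 
19 | net = NetModel(X_train, X_valid, y_train, y_valid)
20 | net.build_net(conv_window=(3, 3), pooling_window=(5, 1), n_filters=(32, 16, 8))
21 | net.train_encoder(n_epochs=3, batch_size=64)   # fit the auto-encoder
22 | net.get_encoded_series()                       # latent features + reconstruction losses
23 | net.train_classifier('rf', n_search=10)        # fit the direction classifier
24 | ```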
--------------------------------------------------------------------------------
/data/rebar_mins.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shannonycj/convolutional-autoencoder-trading/HEAD/data/rebar_mins.csv
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import matplotlib.pyplot as plt\n",
11 | "from sklearn.preprocessing import MinMaxScaler\n",
12 | "from kernel import load_min_data, prepare_data, NetModel"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
108 | "text/plain": [
109 | " high open low close volume oi\n",
110 | "time \n",
111 | "2016-05-13 14:00:00 645 632 632 645 148052 3570070\n",
112 | "2016-05-13 14:05:00 647 644 639 640 202724 3586682\n",
113 | "2016-05-13 14:10:00 643 640 636 641 94164 3578016\n",
114 | "2016-05-13 14:15:00 655 640 634 651 333452 3612632\n",
115 | "2016-05-13 14:20:00 657 652 639 641 300840 3585022"
116 | ]
117 | },
118 | "execution_count": 2,
119 | "metadata": {},
120 | "output_type": "execute_result"
121 | }
122 | ],
123 | "source": [
124 | "train_end = '2018-05-13 00:00'  # we use 2016 - 2018 for training\n",
125 | "df_train, df_test = load_min_data(train_end)\n",
126 | "df_train.head()"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## Prepare Data for Model\n",
134 | "> - transform the data into images consisting of the past 1000 points \n",
135 | "> - aim to predict the return 3 periods (= 15 mins) ahead \n",
136 | "> - then split the training data into a training set and a validation set (for model selection) "
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 3,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "training set size: 8682, array shape: (8682, 1000, 6, 1)\n",
149 | "validation set size: 3721, and array shape: (3721, 1000, 6, 1)\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "time_window = 1000\n",
155 | "forecast_period = 3\n",
156 | "X_train, X_valid, y_train, y_valid = prepare_data(df_train, time_window, forecast_period)\n",
157 | "print(f'training set size: {X_train.shape[0]}, array shape: {X_train.shape}')\n",
158 | "print(f'validation set size: {X_valid.shape[0]}, and array shape: {X_valid.shape}')"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "## Build Convolutional Auto-Encoder Network\n",
166 | "> The recommended number of filters is (64, 32, 16). \n",
167 | "> If you don't have a GPU-accelerated machine, use a smaller number of filters such as (16, 8, 8), or try Google Colab."
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 4,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "shape of input (None, 1000, 6, 1)\n",
180 | "shape after first conv (None, 1000, 6, 32)\n",
181 | "shape after first pooling (None, 200, 6, 32)\n",
182 | "shape after second conv (None, 200, 6, 16)\n",
183 | "shape after second pooling (None, 40, 6, 16)\n",
184 | "shape after third conv (None, 40, 6, 8)\n",
185 | "shape of encoded (None, 8, 6, 8)\n",
186 | "shape after upsample third pooling (None, 40, 6, 8)\n",
187 | "shape after decode third conv (None, 40, 6, 8)\n",
188 | "shape after upsample second pooling (None, 200, 6, 8)\n",
189 | "shape after decode second conv (None, 200, 6, 16)\n",
190 | "shape after upsample first pooling (None, 1000, 6, 16)\n",
191 | "shape after decode first conv (None, 1000, 6, 32)\n",
192 | "shape after decode to input (None, 1000, 6, 1)\n"
193 | ]
194 | }
195 | ],
196 | "source": [
197 | "net = NetModel(X_train, X_valid, y_train, y_valid)\n",
198 | "conv_window=(3, 3)\n",
199 | "pooling_window=(5, 1)\n",
200 | "n_filters=(32, 16, 8)\n",
201 | "net.build_net(conv_window, pooling_window, n_filters)"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "## Train AutoEncoder and Examine the Reconstruction"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 5,
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "name": "stdout",
218 | "output_type": "stream",
219 | "text": [
220 | "Epoch 1/3\n",
221 | "8682/8682 [==============================] - 174s 20ms/step - loss: 1472.3104\n",
222 | "Epoch 2/3\n",
223 | "8682/8682 [==============================] - 206s 24ms/step - loss: 377.1670\n",
224 | "Epoch 3/3\n",
225 | "8682/8682 [==============================] - 197s 23ms/step - loss: 355.8918\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "epochs = 3\n",
231 | "batch_size = 64\n",
232 | "net.train_encoder(epochs, batch_size)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 6,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAC7CAYAAABFJnSnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAACu5JREFUeJzt3W1sXnUZx/Hfb33Y2jKKyMbGOkXCQ5gikDSLsoTINLoBgUTfAIoxYvpGkmEwCC994VvlDYlZgEgCspAACSE8JjoRQaDjSWYBF4JsDB1zMMY2t7a7fNF2uTfu0tPS/3161e8nadZ7OzvXdbqrv/13ds65HRECAOSxoO4GAADTQ3ADQDIENwAkQ3ADQDIENwAkQ3ADQDIENwAkQ3ADQDIENwAkQ3ADQDLtRXba1ROdi08useuj4sTRovufcMqij4vX2Lu1yB/DJxz6QnfxGh37XLzGof17NHxof/lCx2nv7Y7Opb1Fa5zZtbvo/ie8sWdZ8RoL391fvIYknfHV8t+jC1R+3N7ZPqLde0YrFSqSGJ2LT9Y53/tZiV0fNbz+w6L7n/Djs54tXuPRL59UvIYkvXnL6uI1Tttc/h9xf3vi1uI1mulc2quzf3Nd0RoPXXBH0f1PuOTest+fknTGLc8XryFJmx75c/EaC11+cXXx+n9V3pZTJQCQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMlUCm7b62y/YXub7ZtLNwUAmNyUwW27TdJtktZLWiXpaturSjcGAGiuyop7taRtEfFWRByWtEnSlWXbAgBMpkpwr5C0veH1jvGfAwDUoEpwN7t3Pj6xkT1ge9D24MjB1jyjACjtmLnee6DudgBJ1YJ7h6SVDa/7JO08fqOI2BgR/RHR397VM1v9AbU6Zq57yz+kC6iiSnC/IOks21+y3SnpKkkPlW0LADCZKR95FREjtq+X9LikNkl3RsTW4p0BAJqq9KzCiHhE0iOFewEAVMCdkwCQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMkQ3ACQTJH3nO/4z0EtvfvVErs+6s1zzyu6/wlDpy0vXqPt7NOK15Ck71/0bPEaTz3x9eI1FJ94VA7wf4UVNwAkQ3ADQDIENwAkQ3ADQDIENwAkQ3ADQDIENwAkQ3ADQDIENwAkM2Vw277T9i7br7WiIQDAp6uy4v6dpHWF+wAAVDRlcEfEU5L2tKAXAEAFnOMGgGRmLbhtD9getD14OP47W7sFatU41yN7D9TdDiBpFoM7IjZGRH9E9Hd60WztFqhV41y393bX3Q4giVMlAJBOlcsB75X0rKRzbO+wfV35tgAAk5nyHXAi4upWNAIAqIZTJQCQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMkQ3ACQzJTXcc/E4VO6tPOa80vs+qgji0aK7n/Ca3uWF6+x97unFq8hSfrgYPESH55RZKSOMfqMi9cA5jJW3ACQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMlUeQeclbb/aHvI9lbbG1rRGACguSq3uY1IujEiXrS9WNIW209GxN8L9wYAaGLKFXdEvBcRL45/vk/SkKQVpRsDADQ3rXPctk+XdKGk50o0AwCYWuXgtn2CpPsl3RARHzX59QHbg7YHRw/un80egdo0zvXI3gN1twNIqhjctjs0Ftr3RMQDzbaJiI0R0R8R/W1dPbPZI1Cbxrlu7+2uux1AUrWrSizpDklDEfHr8i0BAD5NlRX3GknXSlpr++Xxj0sL9wUAmMSUlwNGxNOSeHI9AMwR3DkJAMkQ3ACQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMlUeazrtHXsP6JTny/7vJJrfvJM0f1P+MXn/1G8xmU/+k7xGpK0f21X8Rp9j+0uXmP73pHiNYC5jBU3ACRDcANAMgQ3ACRDcANAMgQ3ACRDcANAMgQ3ACRDcANAMlXeumyR7edtv2J7q+1ftqIxAEBzVe6cPCRpbUR8PP6mwU/bfjQi/lq4NwBAE1XeuiwkfTz+smP8I0o2BQCYXKVz3LbbbL8saZekJyPiubJtAQAmUym4I2I0Ii6Q1Cdpte2vHL+N7QHbg7YHh4fLPmAKaJXGuR7Ze6DudgBJ07yqJCI+lLRZ0romv7YxIvojor+jo2eW2gPq1TjX7b3ddbcDSKp2VckS2yeNf94l6VuSXi/dGACguSpXlSyXdJftNo0F/X0R8XDZtgAAk6lyVcmrki5sQS8AgAq4cxIAkiG4ASAZghsAkiG4ASAZghsAkiG4ASAZghsAkiG4ASCZKndOTlsssEa7iuz6qN/+5ZKi+5/wynl9xWscPnN58RqS9M7OtuI1Tr6os3iN4X+Xna3JRFijR8qudQ616IHJPtKaOq2w0OXn4VCMFK9xZBpPy2bFDQDJENwAkAzBDQDJENwAkAzBDQDJENwAkAzBDQDJENwAkEzl4LbdZvsl27xtGQDUaDor7g2Shko1AgCoplJw2+6TdJmk28u2AwCYStUV962SbpI0j55wAAA5TRncti+XtCsitkyx3YDtQduDw8P7Z61BoE6Ncz360YG62wEkVVtxr5F0he23JW2StNb23cdvFBEbI6I/Ivo7OnpmuU2gHo1z3XZid93tAJIqBHdE3BIRfRFxuqSrJP0hIn5QvDMAQFNcxw0AyUzrCeQRsVnS5iKdAAAqYcUNAMkQ3ACQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMlM6zruqoYXWzsvXlhi10ctWfl+0f1PGFj2p+I1fqUfFq8hScuXfVC8xoHOZcVryOVL1GVhi46t7VALCh0ZLV9DUveCzvJFWvB4vQXTGGxW3ACQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMkQ3ACQDMENAMlUugFn/P0m90kalTQSEf0lmwIATG46d05eEhG7i3UCAKiEUyUAkEzV4A5JT9jeYnugZEMAgE9X9VTJmojYaXuppCdtvx4RTzVuMB7oA5LU3vu5WW4TqEfjXHcs6a25G2BMpRV3ROwc/3GXpAclrW6yzcaI6I+I/raentntEqjJMXN9Ynfd7QCSKgS37R7biyc+l/RtSa+VbgwA0FyVUyWnSnrQ9sT2v4+Ix4p2BQCY1JTBHRFvSTq/Bb0AACrgckAASIbgBoBkCG4ASIbgBoBkCG4ASIbgBoBkCG4ASIbgBoBkHBGzv1P7fUn/nMZvOUXSfHnWN8dS3hcjYkmri85grqW5+zWciflyLHP1OCrPdZHgni7bg/PlXXU4FjSaT1/D+XIs8+E4OFUCAMkQ3ACQzFwJ7o11NzCLOBY0mk9fw/lyLOmPY06c4wYAVDdXVtwAgIpqD27b62y/YXub7Zvr7membK+0/UfbQ7a32t5Qd0+fhe022y/ZfrjuXjJirueu+TDbtQa37TZJt0laL2mVpKttr6qzp89gRNKNEXGupK9J+mniY5GkDZKG6m4iI+Z6zks/23WvuFdL2hYRb0XEYUmbJF1Zc08zEhHvRcSL45/v09hgrKi3q5m
x3SfpMkm3191LUsz1HDVfZrvu4F4haXvD6x1KPBQTbJ8u6UJJz9XbyYzdKukmSUfqbiQp5nrumhezXXdwu8nPpb7MxfYJku6XdENEfFR3P9Nl+3JJuyJiS929JMZcz0HzabbrDu4dklY2vO6TtLOmXj4z2x0aG+57IuKBuvuZoTWSrrD9tsb+ib/W9t31tpQOcz03zZvZrvU6btvtkt6U9E1J70p6QdI1EbG1tqZmyLYl3SVpT0TcUHc/s8H2NyT9PCIur7uXTJjruS/7bNe64o6IEUnXS3pcY//pcV/G4R63RtK1Gvtb/OXxj0vrbgqtx1yjNO6cBIBk6j7HDQCYJoIbAJIhuAEgGYIbAJIhuAEgGYIbAJIhuAEgGYIbAJL5H26s+wya/iE2AAAAAElFTkSuQmCC\n",
243 | "text/plain": [
244 | ""
245 | ]
246 | },
247 | "metadata": {
248 | "needs_background": "light"
249 | },
250 | "output_type": "display_data"
251 | }
252 | ],
253 | "source": [
254 | "net.get_encoded_series()\n",
255 | "img = X_train[0][:, :, 0].copy()\n",
256 | "img_c = net.reconstructed_train[0][:, :, 0].copy()\n",
257 | "img_c = (MinMaxScaler().fit_transform(img_c) * 255).astype('int')\n",
258 | "\n",
259 | "b=20\n",
260 | "f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)\n",
261 | "ax1.imshow(img[6*(b-1):6*b, :])\n",
262 | "ax2.imshow(img_c[6*(b-1):6*b, :])\n",
263 | "plt.show()"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "## Train Asset Return Predictor\n",
271 | "- 'rf' means a random-forest classifier\n",
272 | "- 'xgb' means an XGBoost classifier\n",
273 | "- 'n_search' is the number of randomized-search iterations to perform"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 10,
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "training acc: 0.6446671273900023\n",
286 | " precision recall f1-score support\n",
287 | "\n",
288 | " 0.0 0.49 0.15 0.23 1697\n",
289 | " 1.0 0.55 0.87 0.67 2024\n",
290 | "\n",
291 | " micro avg 0.54 0.54 0.54 3721\n",
292 | " macro avg 0.52 0.51 0.45 3721\n",
293 | "weighted avg 0.52 0.54 0.47 3721\n",
294 | "\n",
295 | "145.05216813087463\n"
296 | ]
297 | }
298 | ],
299 | "source": [
300 | "import time\n",
301 | "start_time = time.time()\n",
302 | "net.train_classifier('rf', n_search=10)\n",
303 | "print(time.time() - start_time)"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "## Applying Strategy"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": []
319 | }
320 | ],
321 | "metadata": {
322 | "kernelspec": {
323 | "display_name": "Python 3",
324 | "language": "python",
325 | "name": "python3"
326 | },
327 | "language_info": {
328 | "codemirror_mode": {
329 | "name": "ipython",
330 | "version": 3
331 | },
332 | "file_extension": ".py",
333 | "mimetype": "text/x-python",
334 | "name": "python",
335 | "nbconvert_exporter": "python",
336 | "pygments_lexer": "ipython3",
337 | "version": "3.6.7"
338 | }
339 | },
340 | "nbformat": 4,
341 | "nbformat_minor": 2
342 | }
343 |
--------------------------------------------------------------------------------
/kernel/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_import import load_min_data, prepare_data
2 | from .model import NetModel
3 |
--------------------------------------------------------------------------------
/kernel/data_import.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon May 13 10:16:39 2019
5 |
6 | @author: chenjieyang
7 | """
8 | import pandas as pd
9 | import numpy as np
10 | import datetime
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.preprocessing import MinMaxScaler
13 |
14 |
15 | def load_min_data(train_end='2018-05-13 00:00'):
16 |     df = pd.read_csv('data/rebar_mins.csv', header=2)  # column names sit on the csv's third line
17 |
18 | def time_parser(t): return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M")
19 | time = list(map(time_parser, df.time))
20 | df['time'] = time
21 | df.set_index('time', inplace=True)
22 | train_end = datetime.datetime.strptime(train_end, "%Y-%m-%d %H:%M")
23 | df_train = df.loc[df.index < train_end]
24 | df_test = df.loc[df.index >= train_end]
25 | return df_train, df_test
26 |
27 |
28 | def get_idx(i, n, step):
29 | x_start = i * step
30 | x_end = x_start + n
31 | y_start = x_end
32 | y_end = y_start + step
33 | return x_start, x_end, y_start, y_end
34 |
35 |
36 | def prepare_data(df, n, step, test_size=0.3):
37 |     delta = df.drop('volume', axis=1).pct_change()  # percentage changes for the price / open-interest columns
38 |     log_volume_delta = np.log(df.volume) - np.log(df.volume.shift(1))  # log-return of volume
39 |     delta['volume'] = log_volume_delta
40 |     delta = delta.dropna(how='all')  # drop the first row lost to differencing
41 |     df = df.iloc[1:, :]  # keep raw prices aligned with delta
42 | nrows = delta.shape[0]
43 | i = 0
44 | X = []
45 | y = []
46 | while True:
47 | x_start, x_end, y_start, y_end = get_idx(i, n, step)
48 | if y_end > nrows - 1:
49 | break
50 |         x = delta.iloc[x_start:x_end, :].values
51 |         x = MinMaxScaler().fit_transform(x) * 255  # rescale each window to a 0-255 "image"
52 |         X.append(x.astype('int'))
53 |         y.append((df.iloc[y_end, :].close - df.iloc[y_start, :].close) / df.iloc[y_start, :].close)  # forward return over the next `step` bars
54 | i += 1
55 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)  # note: windows are shuffled across time
56 |     X_train, X_test = np.expand_dims(X_train, -1), np.expand_dims(X_test, -1)  # add a channel axis
57 |     y_train, y_test = np.array(y_train) >= 0, np.array(y_test) >= 0  # label 1.0 = non-negative forward return
58 |     return X_train, X_test, y_train * 1.0, y_test * 1.0
59 |
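60 | if __name__ == '__main__':
61 |     # Minimal smoke test (illustrative only, not part of the pipeline): show how
62 |     # get_idx tiles the series into 1000-point windows with a 3-step-ahead target.
63 |     for i in range(3):
64 |         print(get_idx(i, n=1000, step=3))
65 |     # prints (0, 1000, 1000, 1003), (3, 1003, 1003, 1006), (6, 1006, 1006, 1009)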
--------------------------------------------------------------------------------
/kernel/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
4 | from tensorflow.keras.models import Model
5 | from tensorflow.keras import backend as K
6 | from sklearn.metrics import classification_report, mean_squared_error
7 | from sklearn.model_selection import RandomizedSearchCV
8 | from scipy.stats import randint as sp_randint
9 |
10 |
11 | class NetModel:
12 | def __init__(self, X_train, X_test, y_train, y_test):
13 | self.X_train, self.X_test = X_train, X_test
14 | self.y_train, self.y_test = y_train, y_test
15 | self.dims = X_train.shape
16 |
17 | def build_net(self, conv_window=(6, 3), pooling_window=(10, 1), n_filters=(64, 32, 16)):
18 |
19 | input_img = Input(shape=self.dims[1:]) # adapt this if using `channels_first` image data format
20 | print("shape of input", K.int_shape(input_img))
21 | conv_1 = Conv2D(n_filters[0], conv_window, activation='relu', padding='same')(input_img)
22 | print("shape after first conv", K.int_shape(conv_1))
23 | pool_1 = MaxPooling2D(pooling_window, padding='same')(conv_1)
24 | print("shape after first pooling", K.int_shape(pool_1))
25 | conv_2 = Conv2D(n_filters[1], conv_window, activation='relu', padding='same')(pool_1)
26 | print("shape after second conv", K.int_shape(conv_2))
27 |
28 | pool_2 = MaxPooling2D(pooling_window, padding='same')(conv_2)
29 | print("shape after second pooling", K.int_shape(pool_2))
30 |
31 | conv_3 = Conv2D(n_filters[2], conv_window, activation='relu', padding='same')(pool_2)
32 | print("shape after third conv", K.int_shape(conv_3))
33 |
34 | encoded = MaxPooling2D(pooling_window, padding='same')(conv_3)
35 | print("shape of encoded", K.int_shape(encoded))
36 |
37 | up_3 = UpSampling2D(pooling_window)(encoded)
38 | print("shape after upsample third pooling", K.int_shape(up_3))
39 |
40 | conv_neg_3 = Conv2D(n_filters[2], conv_window, activation='relu', padding='same')(up_3)
41 | print("shape after decode third conv", K.int_shape(conv_neg_3))
42 |
43 | up_2 = UpSampling2D(pooling_window)(conv_neg_3)
44 | print("shape after upsample second pooling", K.int_shape(up_2))
45 |
46 | conv_neg_2 = Conv2D(n_filters[1], conv_window, activation='relu', padding='same')(up_2)
47 | print("shape after decode second conv", K.int_shape(conv_neg_2))
48 | up_1 = UpSampling2D(pooling_window)(conv_neg_2)
49 | print("shape after upsample first pooling", K.int_shape(up_1))
50 |         conv_neg_1 = Conv2D(n_filters[0], conv_window, activation='relu', padding='same')(up_1)
51 |         print("shape after decode first conv", K.int_shape(conv_neg_1))
52 |         decoded = Conv2D(1, conv_window, activation='linear', padding='same')(conv_neg_1)
53 | print("shape after decode to input", K.int_shape(decoded))
54 |
55 | self.autoencoder = Model(input_img, decoded)
56 | self.autoencoder.compile(optimizer='adam', loss='mean_squared_error')
57 |         self.encoder_model = Model(self.autoencoder.input, self.autoencoder.layers[6].output)  # layers[6] is the 'encoded' max-pooling layer
58 |
59 | def train_encoder(self, n_epochs=100, batch_size=64):
60 | self.autoencoder.fit(self.X_train, self.X_train, epochs=n_epochs,
61 | batch_size=batch_size, shuffle=True)
62 |
63 | def get_encoded_series(self):
64 | self.reconstructed_train = self.autoencoder.predict(self.X_train)
65 | self.reconstructed_test = self.autoencoder.predict(self.X_test)
66 | self.lf_train = self.flatten_arr(self.encoder_model.predict(self.X_train))
67 | self.lf_test = self.flatten_arr(self.encoder_model.predict(self.X_test))
68 | self.train_features = self.merge_features(self.reconstructed_train, self.X_train, self.lf_train)
69 | self.test_features = self.merge_features(self.reconstructed_test, self.X_test, self.lf_test)
70 |
71 | @staticmethod
72 | def merge_features(X_, X, lf):
73 | recon_loss = [mean_squared_error(X_[i][:, :, 0], X[i][:, :, 0]) for i in range(len(X))]
74 | keys = [f'feature_{i}' for i in range(lf.shape[1])]
75 | vals = lf.T
76 | df = pd.DataFrame(dict(list(zip(keys, vals))))
77 | df['recon_loss'] = recon_loss
78 | return df
79 |
80 | @staticmethod
81 | def flatten_arr(arr):
82 | flat = []
83 | for a in arr:
84 | flat.append(a.reshape(-1,))
85 | return np.array(flat)
86 |
87 | def train_classifier(self, model='xgb', n_search=10):
88 | if model == 'rf':
89 | from sklearn.ensemble import RandomForestClassifier
90 | param_grid = {"max_depth": [10, 20, 40, None],
91 | "max_features": sp_randint(1, 20),
92 | "min_samples_split": sp_randint(5, 50),
93 | "min_samples_leaf": sp_randint(5, 50),
94 | "bootstrap": [True, False],
95 | "criterion": ["gini", "entropy"]}
96 | clf = RandomForestClassifier(verbose=0, n_estimators=100)
97 | elif model == 'xgb':
98 | import xgboost as xgb
99 | param_grid = {'silent': [True],
100 | 'max_depth': [5, 10, 20],
101 | 'learning_rate': [0.001, 0.01],
102 | 'subsample': [0.2, 0.3, 0.5, 0.6, 0.9, 1.0],
103 | 'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
104 | 'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
105 | 'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
106 | 'gamma': [0, 0.25, 0.5, 1.0],
107 | 'reg_lambda': [0.1, 1.0, 50.0, 100.0, 200.0],
108 | 'n_estimators': [100],
109 | 'max_features': [3, 10, None]}
110 | clf = xgb.XGBClassifier()
111 |
112 | clf_grid = RandomizedSearchCV(clf, param_distributions=param_grid,
113 |                                       n_iter=n_search, cv=3, iid=False)
114 | clf_grid.fit(self.train_features.values, self.y_train)
115 | self.train_acc = clf_grid.score(self.train_features.values, self.y_train)
116 | print(f'training acc: {self.train_acc}')
117 |
118 | y_pred = clf_grid.predict(self.test_features.values)
119 | print(classification_report(self.y_test, y_pred))
120 |
121 | self.clf = clf_grid.best_estimator_
122 |
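123 | # Shape arithmetic for the defaults used in demo.ipynb (pooling_window=(5, 1)):
124 | # each MaxPooling2D divides the time axis by 5, so 1000 -> 200 -> 40 -> 8, and the
125 | # encoded tensor has shape (8, 6, n_filters[2]); each UpSampling2D multiplies the
126 | # time axis back by 5 on the way to the (1000, 6, 1) reconstruction.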
--------------------------------------------------------------------------------