├── .gitignore ├── pnls.pick ├── marketdata.pick ├── vol_curves.pick ├── README.md ├── PCA_QT.ipynb ├── .ipynb_checkpoints ├── PCA_QT-checkpoint.ipynb ├── PCA_StatArb-checkpoint.ipynb └── PCA_StatArb-Old-checkpoint.ipynb └── PCA_StatArb-Old.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | marketdata.csv 2 | parse_mdata.py 3 | PCA_StatArb-Old.ipynb 4 | -------------------------------------------------------------------------------- /pnls.pick: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodler/quantinsti_statarb/HEAD/pnls.pick -------------------------------------------------------------------------------- /marketdata.pick: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodler/quantinsti_statarb/HEAD/marketdata.pick -------------------------------------------------------------------------------- /vol_curves.pick: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodler/quantinsti_statarb/HEAD/vol_curves.pick -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # quantinsti_statarb 2 | 3 | This is an introductory lecture for Quantinti. 4 | It is part of a webinar and not self-sufficient. 5 | If you have questions please contact 6 | Dr Tom Starke from AAAQuants. 7 | -------------------------------------------------------------------------------- /PCA_QT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import time\n", 13 | "import datetime\n", 14 | "from quantopian.pipeline.classifiers.morningstar import Sector\n", 15 | "from quantopian.pipeline import Pipeline\n", 16 | "from quantopian.pipeline.data.builtin import USEquityPricing\n", 17 | "from quantopian.research import run_pipeline\n", 18 | "from quantopian.pipeline.data import morningstar, Fundamentals\n", 19 | "from quantopian.pipeline.factors import CustomFactor,AverageDollarVolume,SimpleMovingAverage, ExponentialWeightedMovingAverage, EWMA\n", 20 | "from quantopian.pipeline.filters.morningstar import IsPrimaryShare\n", 21 | "from quantopian.pipeline.factors import AverageDollarVolume\n", 22 | "from quantopian.pipeline.factors.morningstar import MarketCap\n", 23 | "from quantopian.pipeline.experimental import QTradableStocksUS\n", 24 | "from statsmodels.tsa.stattools import coint\n", 25 | "from scipy import stats as stats\n", 26 | "from sklearn.linear_model import LinearRegression\n", 27 | "from sklearn.decomposition import PCA\n", 28 | "import scipy\n", 29 | "import statsmodels.api as sm" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "def make_pipe(start,end): \n", 39 | " mcap = MarketCap()\n", 40 | " min_mcap = mcap > 5e10\n", 41 | " \n", 42 | " price = USEquityPricing.close.latest\n", 43 | " can_trade = QTradableStocksUS()\n", 44 | " sector = Sector()\n", 45 | " \n", 46 | " asset_filter = can_trade & min_mcap\n", 47 | " pipe = Pipeline(screen = asset_filter)\n", 48 | "\n", 49 | " pipe.add(price,'price')\n", 50 | " pipe.add(sector,\"Sector\")\n", 51 | " \n", 52 | "\n", 53 | " res2 = run_pipeline(pipe, start, end)\n", 54 | " return res2" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "MORNINGSTAR_SECTOR_CODES = { \n", 64 | " -1: 'Misc', \n", 65 | " 101: 'Basic Materials', \n", 66 | " 102: 'Consumer Cyclical', \n", 67 | " 103: 'Financial Services', \n", 68 | " 104: 'Real Estate', \n", 69 | " 205: 'Consumer Defensive', \n", 70 | " 206: 'Healthcare', \n", 71 | " 207: 'Utilities', \n", 72 | " 308: 'Communication Services', \n", 73 | " 309: 'Energy', \n", 74 | " 310: 'Industrials', \n", 75 | " 311: 'Technology' , \n", 76 | "}" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "end = datetime.datetime(2019,5,5) - datetime.timedelta(2)\n", 86 | "start = datetime.datetime(2019,5,5) - datetime.timedelta(18)\n", 87 | "result = make_pipe(start,end)\n", 88 | "result.head()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "date = result.index[0][0]\n", 98 | "\n", 99 | "companies = []\n", 100 | "for company in result.loc[date]['Sector'].index:\n", 101 | " if result.loc[date]['Sector'][company] == 311:\n", 102 | "# if result.loc[date]['Sector'][company] == 206:\n", 103 | "# if result.loc[date]['Sector'][company] == 310:\n", 104 | " companies.append(company)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "companies" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "prices = get_pricing(companies, start_date=start, end_date=end, frequency='minute', fields='price')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "coints = []\n", 132 | "corrs = []\n", 133 | "for c1 in range(len(companies)):\n", 134 | " for c2 in range(c1,len(companies)):\n", 135 | " if c1==c2: continue\n", 136 | " coints.append(coint(prices[companies[c1]],prices[companies[c2]])[0])\n", 137 | " corrs.append(np.corrcoef(prices[companies[c1]],prices[companies[c2]])[0][1])\n", 138 | " print(c1,c2,coints[-1])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "print(np.mean(coints),np.mean(corrs))" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def backtest(prices,max_pos=1,num_factors=1,initial_cash=1e6,lkbk=500):\n", 157 | " pr = np.asarray(prices.T)\n", 158 | " entry = {}\n", 159 | " pnls = []\n", 160 | " dates = []\n", 161 | " #resids = run_pca(pr,num_factors)\n", 162 | " \n", 163 | " if max_pos > pr.shape[0]/2:\n", 164 | " print('max_pos too large!')\n", 165 | " return\n", 166 | "\n", 167 | " for i,pri in enumerate(pr.T):\n", 168 | "\n", 169 | " if i < 60: continue\n", 170 | " \n", 171 | " resids, factors = run_pca(pr[:,max(0,i-lkbk):i],num_factors,log_prices=True)\n", 172 | " zs = {}\n", 173 | " for inst in range(len(pri)):\n", 174 | " #zs[inst] = Zscore(resids[inst])[i]\n", 175 | " zs[inst] = Zscore(resids[inst])[-1]\n", 176 | "\n", 177 | " idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n", 178 | " idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n", 179 | " \n", 180 | " pnl = 0\n", 181 | " for j,idx in enumerate(entry):\n", 182 | " wgt = np.round((initial_cash/len(pri))/entry[idx])\n", 183 | " #pnl += ((pri[idx]-np.abs(entry[idx]))/np.abs(entry[idx]))*wgt/initial_cash\n", 184 | " pnl += ((pri[idx]-np.abs(entry[idx])))*wgt\n", 185 | " #print pnl\n", 186 | " pnls.append(pnl)\n", 187 | " dates.append(prices.index[i])\n", 188 | " \n", 189 | " entry = {}\n", 190 | " \n", 191 | "\n", 192 | " #print(idx_long, idx_short)\n", 193 | " for idx in idx_long:\n", 194 | " entry[idx] = pri[idx]\n", 195 | " for idx in idx_short:\n", 196 | " entry[idx] = -pri[idx]\n", 197 | " #print(i,entry)\n", 198 | " \n", 199 | " print(i,sum(pnls))\n", 200 | " return pnls,dates" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "def Zscore(X):\n", 210 | " return np.array((X - np.mean(X)) / np.std(X))" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "def run_pca(pr,components=1,log_prices=True):\n", 220 | " pca = PCA(n_components=components)\n", 221 | " if log_prices:\n", 222 | " comps = pca.fit(np.log(pr.T)).components_.T\n", 223 | " else:\n", 224 | " comps = pca.fit(pr.T).components_.T\n", 225 | " factors = sm.add_constant(pr.T.dot(comps))\n", 226 | " mm = [sm.OLS(s.T, factors).fit() for s in pr]\n", 227 | " resids = list(map(lambda x: x.resid, mm))\n", 228 | " return resids, factors" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "for p in np.asarray(prices.T):\n", 238 | " plt.plot((p-p[0])/np.std(p))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "scrolled": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "pnls,dates = backtest(prices,max_pos=2,num_factors=2,initial_cash=1e6,lkbk=400)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "plt.plot(np.cumsum(pnls));" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "plt.plot(dates,np.cumsum(pnls));" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.6.1" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 2 299 | } 300 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/PCA_QT-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import time\n", 13 | "import datetime\n", 14 | "from quantopian.pipeline.classifiers.morningstar import Sector\n", 15 | "from quantopian.pipeline import Pipeline\n", 16 | "from quantopian.pipeline.data.builtin import USEquityPricing\n", 17 | "from quantopian.research import run_pipeline\n", 18 | "from quantopian.pipeline.data import morningstar, Fundamentals\n", 19 | "from quantopian.pipeline.factors import CustomFactor,AverageDollarVolume,SimpleMovingAverage, ExponentialWeightedMovingAverage, EWMA\n", 20 | "from quantopian.pipeline.filters.morningstar import IsPrimaryShare\n", 21 | "from quantopian.pipeline.factors import AverageDollarVolume\n", 22 | "from quantopian.pipeline.factors.morningstar import MarketCap\n", 23 | "from quantopian.pipeline.experimental import QTradableStocksUS\n", 24 | "from statsmodels.tsa.stattools import coint\n", 25 | "from scipy import stats as stats\n", 26 | "from sklearn.linear_model import LinearRegression\n", 27 | "from sklearn.decomposition import PCA\n", 28 | "import scipy\n", 29 | "import statsmodels.api as sm" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "def make_pipe(start,end): \n", 39 | " mcap = MarketCap()\n", 40 | " min_mcap = mcap > 5e10\n", 41 | " \n", 42 | " price = USEquityPricing.close.latest\n", 43 | " can_trade = QTradableStocksUS()\n", 44 | " sector = Sector()\n", 45 | " \n", 46 | " asset_filter = can_trade & min_mcap\n", 47 | " pipe = Pipeline(screen = asset_filter)\n", 48 | "\n", 49 | " pipe.add(price,'price')\n", 50 | " pipe.add(sector,\"Sector\")\n", 51 | " \n", 52 | "\n", 53 | " res2 = run_pipeline(pipe, start, end)\n", 54 | " return res2" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "MORNINGSTAR_SECTOR_CODES = { \n", 64 | " -1: 'Misc', \n", 65 | " 101: 'Basic Materials', \n", 66 | " 102: 'Consumer Cyclical', \n", 67 | " 103: 'Financial Services', \n", 68 | " 104: 'Real Estate', \n", 69 | " 205: 'Consumer Defensive', \n", 70 | " 206: 'Healthcare', \n", 71 | " 207: 'Utilities', \n", 72 | " 308: 'Communication Services', \n", 73 | " 309: 'Energy', \n", 74 | " 310: 'Industrials', \n", 75 | " 311: 'Technology' , \n", 76 | "}" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "end = datetime.datetime(2019,5,5) - datetime.timedelta(2)\n", 86 | "start = datetime.datetime(2019,5,5) - datetime.timedelta(18)\n", 87 | "result = make_pipe(start,end)\n", 88 | "result.head()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "date = result.index[0][0]\n", 98 | "\n", 99 | "companies = []\n", 100 | "for company in result.loc[date]['Sector'].index:\n", 101 | " if result.loc[date]['Sector'][company] == 311:\n", 102 | "# if result.loc[date]['Sector'][company] == 206:\n", 103 | "# if result.loc[date]['Sector'][company] == 310:\n", 104 | " companies.append(company)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "companies" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "prices = get_pricing(companies, start_date=start, end_date=end, frequency='minute', fields='price')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "coints = []\n", 132 | "corrs = []\n", 133 | "for c1 in range(len(companies)):\n", 134 | " for c2 in range(c1,len(companies)):\n", 135 | " if c1==c2: continue\n", 136 | " coints.append(coint(prices[companies[c1]],prices[companies[c2]])[0])\n", 137 | " corrs.append(np.corrcoef(prices[companies[c1]],prices[companies[c2]])[0][1])\n", 138 | " print(c1,c2,coints[-1])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "print(np.mean(coints),np.mean(corrs))" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def backtest(prices,max_pos=1,num_factors=1,initial_cash=1e6,lkbk=500):\n", 157 | " pr = np.asarray(prices.T)\n", 158 | " entry = {}\n", 159 | " pnls = []\n", 160 | " dates = []\n", 161 | " #resids = run_pca(pr,num_factors)\n", 162 | " \n", 163 | " if max_pos > pr.shape[0]/2:\n", 164 | " print('max_pos too large!')\n", 165 | " return\n", 166 | "\n", 167 | " for i,pri in enumerate(pr.T):\n", 168 | "\n", 169 | " if i < 60: continue\n", 170 | " \n", 171 | " resids, factors = run_pca(pr[:,max(0,i-lkbk):i],num_factors,log_prices=True)\n", 172 | " zs = {}\n", 173 | " for inst in range(len(pri)):\n", 174 | " #zs[inst] = Zscore(resids[inst])[i]\n", 175 | " zs[inst] = Zscore(resids[inst])[-1]\n", 176 | "\n", 177 | " idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n", 178 | " idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n", 179 | " \n", 180 | " pnl = 0\n", 181 | " for j,idx in enumerate(entry):\n", 182 | " wgt = np.round((initial_cash/len(pri))/entry[idx])\n", 183 | " #pnl += ((pri[idx]-np.abs(entry[idx]))/np.abs(entry[idx]))*wgt/initial_cash\n", 184 | " pnl += ((pri[idx]-np.abs(entry[idx])))*wgt\n", 185 | " #print pnl\n", 186 | " pnls.append(pnl)\n", 187 | " dates.append(prices.index[i])\n", 188 | " \n", 189 | " entry = {}\n", 190 | " \n", 191 | "\n", 192 | " #print(idx_long, idx_short)\n", 193 | " for idx in idx_long:\n", 194 | " entry[idx] = pri[idx]\n", 195 | " for idx in idx_short:\n", 196 | " entry[idx] = -pri[idx]\n", 197 | " #print(i,entry)\n", 198 | " \n", 199 | " print(i,sum(pnls))\n", 200 | " return pnls,dates" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "def Zscore(X):\n", 210 | " return np.array((X - np.mean(X)) / np.std(X))" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "def run_pca(pr,components=1,log_prices=True):\n", 220 | " pca = PCA(n_components=components)\n", 221 | " if log_prices:\n", 222 | " comps = pca.fit(np.log(pr.T)).components_.T\n", 223 | " else:\n", 224 | " comps = pca.fit(pr.T).components_.T\n", 225 | " factors = sm.add_constant(pr.T.dot(comps))\n", 226 | " mm = [sm.OLS(s.T, factors).fit() for s in pr]\n", 227 | " resids = list(map(lambda x: x.resid, mm))\n", 228 | " return resids, factors" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "for p in np.asarray(prices.T):\n", 238 | " plt.plot((p-p[0])/np.std(p))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "scrolled": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "pnls,dates = backtest(prices,max_pos=2,num_factors=2,initial_cash=1e6,lkbk=400)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "plt.plot(np.cumsum(pnls));" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "plt.plot(dates,np.cumsum(pnls));" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.6.1" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 2 299 | } 300 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/PCA_StatArb-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PCA Statistical Arbitrage\n", 8 | "\n", 9 | "(This notebook can be found on GitHub: https://github.com/rodler/quantinsti_statarb)\n", 10 | "\n", 11 | "### Dr Tom Starke \n", 12 | "\n", 13 | "*Homepage: www.aaaquants.com *\n", 14 | "\n", 15 | "*Email: tom@aaaquants.com *\n", 16 | "\n", 17 | "*Linkedin: Dr Tom Starke *\n", 18 | "\n", 19 | "### What we will learn:\n", 20 | "- Building a PCA manually\n", 21 | "- Conduct a pairs-trading backtest using PCA\n", 22 | "- Simulation of multiple cointegrated assets\n", 23 | "- Sector statistical arbitrage using PCA " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Factor Investing\n", 31 | "\n", 32 | "In factor investing we aim to find time-dependent signals that are correlated to future returns. These factors could be:\n", 33 | "- technical indicators (e.g. difference of two moving averages)\n", 34 | "- fundamental factors (e.g. company data such as P/E ratio)\n", 35 | "- macro factors (e.g. interest rates)\n", 36 | "- abstract factors (e.g. PCA)\n", 37 | "\n", 38 | "Abstract factors derive from other factors. Assume, we have N factors, which have some degree of correlation, we can construct M both are random walks (drunk does not own dog).\n", 278 | "- if one c is zero and the other non-zero there is a one way causality (drunk owns dog).\n", 279 | "- if both c are non-zero there is two-way causality (dog sometimes pulls drunk)." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "def make_coint_0(N,T0=[0,0],sigma=[1,1],c=[0.1,0.1]):\n", 289 | " '''\n", 290 | " Algorithm from:\n", 291 | " https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n", 292 | " '''\n", 293 | " X = [0]\n", 294 | " Y = [0]\n", 295 | " for i in range(N):\n", 296 | " rx = np.random.randn()*sigma[0] - c[0]*(X[-1] - Y[-1])\n", 297 | " ry = np.random.randn()*sigma[1] + c[1]*(X[-1] - Y[-1])\n", 298 | " X.append(X[-1]+rx)\n", 299 | " Y.append(Y[-1]+ry)\n", 300 | " return np.array(X)+T0[0],np.array(Y)+T0[1]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "When plotting X and Y we can see that they follow each other closely.\n", 308 | "Now, vary c as follows and observe what happens:\n", 309 | "- c = [ 0.9, 0.0 ]\n", 310 | "- c = [ 0.1, 0.1 ]\n", 311 | "- c = [ 0.1, 0.9 ]\n", 312 | "- c = [ 0.0 , 0.0]" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "np.random.seed(452)\n", 322 | "X,Y = make_coint_0(200,T0=[50,50],c=[0.1,0.1])\n", 323 | "plt.plot(X,'r-',Y,'b-');" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Assessing the quality of our cointegration:\n", 331 | "- Critical values for 0.1, 0.05 and 0.01.\n", 332 | "- T-statistic should be below crit." 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "crit = coint(X,Y)\n", 342 | "print('Critical Values:',crit[2])\n", 343 | "print('T-statistic:',crit[0])\n", 344 | "print('P-value:',crit[1])" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "### Application of PCA to pairs trading:\n", 352 | "- Use the sklearn PCA package to generate components.\n", 353 | "- Linear regression with the price data.\n", 354 | "- Z-score the residual to normalise for varying price levels and volatility.\n", 355 | "- Trade when residual sufficiently deviates from mean. \n", 356 | "- Use of log prices can help to mitigate large price swings (e.g. in penny stocks)\n", 357 | "\n", 358 | "Below the code for the PCA pairs trade:" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "def Zscore(X):\n", 368 | " return np.array((X - np.mean(X)) / np.std(X))\n", 369 | "\n", 370 | "def run_pca(pr,components=1,log_prices=True):\n", 371 | " \n", 372 | " # Instanciate PCA \n", 373 | " pca = PCA(n_components=components)\n", 374 | " px = pr.T-np.mean(pr.T)\n", 375 | " \n", 376 | " if log_prices:\n", 377 | " \n", 378 | " # Calculate the priciple components using log prices\n", 379 | " comps = pca.fit(np.log(pr.T)).components_.T\n", 380 | " \n", 381 | " # Create the factors from the pricinple components\n", 382 | " factors = sm.add_constant(pr.T.dot(comps))\n", 383 | " \n", 384 | " else:\n", 385 | " \n", 386 | " # Calculate the N priciple components using normal prices\n", 387 | " comps = pca.fit(px).components_.T\n", 388 | " \n", 389 | " # Create the factors from the pricinple components\n", 390 | " factors = sm.add_constant(px.dot(comps)) \n", 391 | " \n", 392 | "\n", 393 | " \n", 394 | " # Regress each factor with the actual underlying prices\n", 395 | " mm = [sm.OLS(s.T, factors).fit() for s in pr]\n", 396 | " \n", 397 | " # Calculate the residuals\n", 398 | " resids = list(map(lambda x: x.resid, mm))\n", 399 | " \n", 400 | " return resids, factors" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "- Running the PCA we can now see the factors with equal values of opposite sign.\n", 408 | "- Same as in \"regular\" pairs trade where opposite sign is expressed by long/short.\n", 409 | "- PCA gives reversible results when X and Y are switched, linear regression does not." 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# Create input array from cointegrated price series\n", 419 | "R = np.array([X,Y,X])\n", 420 | "\n", 421 | "# Run the PCA calculation\n", 422 | "residuals, factors = run_pca(R,log_prices=True)\n", 423 | "\n", 424 | "# Plot the residuals\n", 425 | "plt.plot(residuals[0],label='resid X')\n", 426 | "plt.plot(residuals[1],label='resid Y')\n", 427 | "plt.xlabel('time')\n", 428 | "plt.ylabel('residuals')\n", 429 | "plt.legend()\n", 430 | "plt.grid()" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "As a side-node, observe that linear regression is not reversible.\n", 438 | "- Residuals are calculated as distances to fitting line along to y-axis.\n", 439 | "- In PCA residuals are calculated orthogonal to principal component." 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "m = np.polyfit(X,Y,1)[0]\n", 449 | "m_rev = np.polyfit(Y,X,1)[0]\n", 450 | "print('Slope of regression:',m)\n", 451 | "print('Inverse slope of reverse regression:',1/m_rev)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Pairs-Trade Backtest\n", 459 | "- Sequentially step through time and instruments.\n", 460 | "- Calculate if z-score of residuals is large enough to trade.\n", 461 | "- If in trade, see if residuals have mean reverted enough to exit.\n", 462 | "- Calculate the pnl.\n", 463 | "\n", 464 | "(For simplicity we calculate the residuals first, thus introducing a forward-looking bias. This is to make the calcs faster and it is rectified later.)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "inpos = np.zeros(R.shape[0]) # side: long=+1; short=-1\n", 474 | "pnl = [0] # PnL vector\n", 475 | "bw = 2 # z-score threshold (bandwidth)\n", 476 | "op = {} # dict of entry prices\n", 477 | "\n", 478 | "# loop through time steps\n", 479 | "for i in range(len(residuals[0])):\n", 480 | " p = 0 # initialise pnl-calc for a particular time step\n", 481 | " \n", 482 | " # loop through instruments\n", 483 | " for inst in range(R.shape[0]):\n", 484 | " \n", 485 | " # calculate the z-score of residuals\n", 486 | " zs = Zscore(residuals[inst])[i]\n", 487 | " \n", 488 | " # Entry condition: z-score above bandwith and no position on\n", 489 | " if np.abs(zs)>bw and inpos[inst] == 0:\n", 490 | " op[inst] = R[inst,i] # record the open price\n", 491 | " inpos[inst] = zs # tell algo that we have a position\n", 492 | " \n", 493 | " # Exit condition: z-score has crossed zero and position on\n", 494 | " elif zs*np.sign(inpos[inst])<0:\n", 495 | " \n", 496 | " # Calculate pnl as (exit-entry)*side\n", 497 | " p+=((-R[inst,i]+op[inst])*np.sign(inpos[inst]))\n", 498 | " inpos[inst] = 0 # set side to zero\n", 499 | " \n", 500 | " # append the new pnl to vector\n", 501 | " pnl.append(p)\n", 502 | " \n", 503 | "# Plot the results of the backtest\n", 504 | "plt.plot(np.cumsum(pnl),'-')\n", 505 | "plt.xlabel('time')\n", 506 | "plt.ylabel('realised PnL')\n", 507 | "plt.show()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### Simulate sector cointegration" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "- Simulated time series provide understandable results. \n", 522 | "- Stocks within a sector are often cointegrated.\n", 523 | "- Maths of \"Drunk and her dog\" generalisation shown below:\n", 524 | "\n", 525 | "\\begin{align}\n", 526 | "c_{ij} = \\Bigg\\{ \n", 527 | "\\begin{split}\n", 528 | "-a_{ij} \\quad for \\quad i \\leq j \\\\ \n", 529 | "a_{ij} \\quad for \\quad i \\geq j \\\\ \n", 530 | "-a_{ij} \\quad for \\quad i = j\n", 531 | "\\end{split}\n", 532 | "\\end{align}\n", 533 | "\n", 534 | "\\begin{align}\n", 535 | "X_{t}^{(i)}-X_{t-1}^{(i)} = \\sum_{j} c_{ij} X_{t-1}^{(j)} + \\epsilon_{i} \\quad with \\quad a_{ij} \\geq 0\n", 536 | "\\end{align}\n", 537 | "\n", 538 | "- *X* denotes the time series, *c* is the causality matrix.\n", 539 | "- *a* are the positive elements of the causality matrix. \n", 540 | "\n", 541 | "(Note that the *a's* denote the relationships between different series. We can simply use random numbers to start with. As we increase the number of series, we need to keep *a* small to avoid positive feedback scenarios.) \n", 542 | "\n", 543 | "Below the code that implements the above equations." 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "def make_coint_1(N,steps,a=0.1):\n", 553 | " X = [np.zeros(N)]\n", 554 | " \n", 555 | " # Create the causality matrix\n", 556 | " c = (np.tril(np.ones(N))-np.triu(np.ones(N))-np.diag(np.ones(N),0))*a #c = np.random.rand(N,N)*0.1\n", 557 | "\n", 558 | " # loop through time steps\n", 559 | " for i in range(steps):\n", 560 | " \n", 561 | " # Calculate the returns for each time series\n", 562 | " rx = (np.sum(c*X[-1],axis=1)+np.random.randn(N))\n", 563 | " \n", 564 | " # Add the new return to the last price of the time series\n", 565 | " X.append(X[-1]+rx)\n", 566 | " \n", 567 | " # return array of all series\n", 568 | " return np.array(X).T" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "Let's create a cointegrated pair with this technique.\n", 576 | "\n", 577 | "__Play with this by varying *a* and observe the results.__" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "np.random.seed(21)\n", 587 | "N = 3\n", 588 | "a1 = 0.1 # general case\n", 589 | "a2=[[0.02,0.1],[0.1,0.02]] # for N = 2\n", 590 | "a3=[[0.06,0.04,0.08],[0.06,0.06,0.04],[0.06,0.08,0.04]] # for N = 3\n", 591 | "X1 = make_coint_1(N,200,a=a1).T\n", 592 | "\n", 593 | "for i in range(N):\n", 594 | " plt.plot(X1[:,i])" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "We can see we have produced a set of stationary time series, testing for cointegration we see that most of them are below the critical values." 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "print('Critical values:',coint(X1.T[0],X1.T[1])[2])\n", 611 | "for i in range(X1.T.shape[0]):\n", 612 | " for k in range(i,X1.T.shape[0]):\n", 613 | " if not i==k:\n", 614 | " print('t-stats for coint of series %s and %s:'%(i,k), coint(X1.T[i],X1.T[k])[0])\n", 615 | " " 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "Let's now apply our statistical arbitrage system to a simple long/short pair in order to test if our system is working properly. First, let's produce a cointegrated time series and plot it to confirm its properties." 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": {}, 628 | "source": [ 629 | "### PCA Portfolio Trading\n", 630 | "\n", 631 | "- Application of strategy to larger portfolio.\n", 632 | "- Careful with the causality coefficients as large numbers of strong cross-dependencies can create positive feedback loops between the series. \n", 633 | "- Larger N - higher to probability of the feedack loops for a given *alpha*.\n", 634 | "\n", 635 | "__Please vary alpha in this exercise and observe how the behaviour of our time series changes.__" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "#np.random.seed(231)\n", 645 | "N = 10\n", 646 | "alpha = 0.03\n", 647 | "X2 = make_coint_1(N,300,a=np.random.rand(N,N)*alpha) + 50" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "for i in X2:\n", 657 | " plt.plot(i)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": {}, 663 | "source": [ 664 | "### Cumulative explained variance\n", 665 | "\n", 666 | "- 3 principle components explain 75% of variance\n", 667 | "- Too many components lead to very high correlation and very small PnL/trade\n", 668 | "- Enough PnL/trade to overcome trading costs" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "pca = PCA(n_components=10)\n", 678 | "pca.fit(np.log(X2))\n", 679 | "\n", 680 | "# Plot cumulative explained variance\n", 681 | "plt.plot(np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_),'-o')\n", 682 | "plt.grid()\n", 683 | "plt.xlabel('Component')\n", 684 | "plt.ylabel('Explained Variance')" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "- Useful to look at the average t-statistics between all possible pairs.\n", 692 | "- High average t-stats - good probability of strategy success.\n", 693 | "- Johansen test tends to perform poorly out-of-sample." 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "# Cointegration test:\n", 703 | "coints = []\n", 704 | "print('Critical values:',coint(X2[0],X2[1])[2])\n", 705 | "for i in range(X2.shape[0]):\n", 706 | " for k in range(i,X2.shape[0]):\n", 707 | " if not i==k:\n", 708 | " coints.append(coint(X2[i],X2[k])[0])\n", 709 | " \n", 710 | "print('Average coint t-stats:',np.mean(coints))" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": {}, 716 | "source": [ 717 | "- Simulated data can give us an understanding how well our backtest performs under idealised conditions. \n", 718 | "\n", 719 | "- In the next part we are looking at applying this algorithm to real market data using Quantopian." 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "### Sector-portfolio backtest\n", 727 | "\n", 728 | "There are many strategies we can deploy based on our techniques such as:\n", 729 | "- sort the z-scores of our factors and go long the lowest and short the N assets with the highest z-scores. \n", 730 | "- scale the position size of each instrument according to z-score.\n", 731 | "- only rebalance portfolio when sum of z-scores exceeds a threshold.\n", 732 | "\n", 733 | "All of them have their uses and they need to be tested on a case-by-case basis. Here, we choose the first example as shown below. This time we eliminate the forward-looking bias by recalculating the residuals at every time step." 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [ 742 | "mdata = pickle.load(open('marketdata.pick','rb'))" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": { 749 | "scrolled": true 750 | }, 751 | "outputs": [], 752 | "source": [ 753 | "pr = np.array(mdata).T#[:12,:]\n", 754 | "max_pos=3\n", 755 | "num_factors=3\n", 756 | "initial_cash=1e6\n", 757 | "\n", 758 | "entry = {} # create a vector of entry prices\n", 759 | "pnls = [] # create a pnl vector\n", 760 | "\n", 761 | "# Exit if we specified too large long/short position size\n", 762 | "if max_pos > pr.shape[0]/2:\n", 763 | " print('max_pos too large!')\n", 764 | " adfadsf\n", 765 | "\n", 766 | "# loop through the prices\n", 767 | "for i,pri in enumerate(pr.T):\n", 768 | "\n", 769 | " # Make sure you have enough data points for PCA\n", 770 | " if i < 50: continue\n", 771 | "\n", 772 | " # Run the PCA, only on the past prices\n", 773 | " resids, factors = run_pca(pr.T[max([0,i-400]):i],num_factors,log_prices=False)\n", 774 | " zs = {}\n", 775 | "\n", 776 | " # Calculate the z-scores for each instrument. \n", 777 | " for inst in range(len(pri)):\n", 778 | " try: zs[inst] = Zscore(resids[inst])[-1]\n", 779 | " except: pass\n", 780 | "\n", 781 | " pnl = 0\n", 782 | " # Calculate the Pnl for each position over the prevoius period\n", 783 | " for j,idx in enumerate(entry):\n", 784 | "\n", 785 | " # Calculate the position size\n", 786 | " # The sign of the position depends on the sign of the entry price\n", 787 | " pos = np.round((initial_cash/len(pri))/entry[idx])\n", 788 | "\n", 789 | " # Add up the pnls for all positions for the last period\n", 790 | " # We neutralize the sign of the entry price and let it \n", 791 | " # come in through the position.\n", 792 | " pnl += (pri[idx]-np.abs(entry[idx]))*pos\n", 793 | " pnls.append(pnl)\n", 794 | "\n", 795 | " # Reset the portfolio\n", 796 | " entry = {}\n", 797 | "\n", 798 | " # Find the new instruments to be traded based on their z-scores\n", 799 | " idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n", 800 | " idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n", 801 | "\n", 802 | " # Add them to the entry list\n", 803 | " # The entry gets a positive or negative sign depending on the side of the trade\n", 804 | " for idx in idx_long:\n", 805 | " entry[idx] = pri[idx]\n", 806 | " for idx in idx_short:\n", 807 | " entry[idx] = -pri[idx]\n", 808 | "\n", 809 | " print(i,sum(pnls))" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "plt.plot(np.cumsum(pnls))" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [] 827 | } 828 | ], 829 | "metadata": { 830 | "kernelspec": { 831 | "display_name": "Python 3", 832 | "language": "python", 833 | "name": "tribo" 834 | }, 835 | "language_info": { 836 | "codemirror_mode": { 837 | "name": "ipython", 838 | "version": 3 839 | }, 840 | "file_extension": ".py", 841 | "mimetype": "text/x-python", 842 | "name": "python", 843 | "nbconvert_exporter": "python", 844 | "pygments_lexer": "ipython3", 845 | "version": "3.6.1" 846 | } 847 | }, 848 | "nbformat": 4, 849 | "nbformat_minor": 2 850 | } 851 | -------------------------------------------------------------------------------- /PCA_StatArb-Old.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PCA Statistical Arbitrage\n", 8 | "\n", 9 | "(This notebook can be found on GitHub: https://github.com/rodler/quantinsti_statarb)\n", 10 | "\n", 11 | "### Dr Tom Starke \n", 12 | "\n", 13 | "*Homepage: www.aaaquants.com *\n", 14 | "\n", 15 | "*Email: tom@aaaquants.com *\n", 16 | "\n", 17 | "*Linkedin: Dr Tom Starke *\n", 18 | "\n", 19 | "### What we will learn:\n", 20 | "- Building a PCA manually\n", 21 | "- Conduct a pairs-trading backtest using PCA\n", 22 | "- Simulation of multiple cointegrated assets\n", 23 | "- Sector statistical arbitrage using PCA " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Factor Investing\n", 31 | "\n", 32 | "In factor investing we aim to find time-dependent signals that are correlated to future returns. These factors could be:\n", 33 | "- technical indicators (e.g. difference of two moving averages)\n", 34 | "- fundamental factors (e.g. company data such as P/E ratio)\n", 35 | "- macro factors (e.g. interest rates)\n", 36 | "- abstract factors (e.g. PCA)\n", 37 | "\n", 38 | "Abstract factors derive from other factors. Assume, we have N factors, which have some degree of correlation, we can construct M both are random walks (drunk does not own dog).\n", 277 | "- if one c is zero and the other non-zero there is a one way causality (drunk owns dog).\n", 278 | "- if both c are non-zero there is two-way causality (dog sometimes pulls drunk)." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "def make_coint_0(N,T0=[0,0],sigma=[1,1],c=[0.1,0.1]):\n", 288 | " '''\n", 289 | " Algorithm from:\n", 290 | " https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n", 291 | " '''\n", 292 | " X = [0]\n", 293 | " Y = [0]\n", 294 | " for i in range(N):\n", 295 | " rx = np.random.randn()*sigma[0] - c[0]*(X[-1] - Y[-1])\n", 296 | " ry = np.random.randn()*sigma[1] + c[1]*(X[-1] - Y[-1])\n", 297 | " X.append(X[-1]+rx)\n", 298 | " Y.append(Y[-1]+ry)\n", 299 | " return np.array(X)+T0[0],np.array(Y)+T0[1]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "When plotting X and Y we can see that they follow each other closely.\n", 307 | "Now, vary c as follows and observe what happens:\n", 308 | "- c = [ 0.9, 0.0 ]\n", 309 | "- c = [ 0.1, 0.1 ]\n", 310 | "- c = [ 0.1, 0.9 ]\n", 311 | "- c = [ 0.0 , 0.0]" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "np.random.seed(452)\n", 321 | "X,Y = make_coint_0(200,T0=[50,50],c=[0.1,0.1])\n", 322 | "plt.plot(X,'r-',Y,'b-');" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "Assessing the quality of our cointegration:\n", 330 | "- Critical values for 0.1, 0.05 and 0.01.\n", 331 | "- T-statistic should be below crit." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "crit = coint(X,Y)\n", 341 | "print('Critical Values:',crit[2])\n", 342 | "print('T-statistic:',crit[0])\n", 343 | "print('P-value:',crit[1])" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### Application of PCA to pairs trading:\n", 351 | "- Use the sklearn PCA package to generate components.\n", 352 | "- Linear regression with the price data.\n", 353 | "- Z-score the residual to normalise for varying price levels and volatility.\n", 354 | "- Trade when residual sufficiently deviates from mean. \n", 355 | "- Use of log prices can help to mitigate large price swings (e.g. in penny stocks)\n", 356 | "\n", 357 | "Below the code for the PCA pairs trade:" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "def Zscore(X):\n", 367 | " return np.array((X - np.mean(X)) / np.std(X))\n", 368 | "\n", 369 | "def run_pca(pr,components=1,log_prices=True):\n", 370 | " \n", 371 | " # Instanciate PCA \n", 372 | " pca = PCA(n_components=components)\n", 373 | " px = pr.T-np.mean(pr.T)\n", 374 | " \n", 375 | " if log_prices:\n", 376 | " \n", 377 | " # Calculate the priciple components using log prices\n", 378 | " comps = pca.fit(np.log(pr.T)).components_.T\n", 379 | " \n", 380 | " # Create the factors from the pricinple components\n", 381 | " factors = sm.add_constant(pr.T.dot(comps))\n", 382 | " else:\n", 383 | " \n", 384 | " # Calculate the N priciple components using normal prices\n", 385 | " comps = pca.fit(px).components_.T\n", 386 | " \n", 387 | " # Create the factors from the pricinple components\n", 388 | " factors = sm.add_constant(px.dot(comps)) \n", 389 | "\n", 390 | " \n", 391 | " # Regress the factors with the actual prices\n", 392 | " mm = [sm.OLS(s.T, factors).fit() for s in pr]\n", 393 | " \n", 394 | " # Calculate the residuals\n", 395 | " resids = list(map(lambda x: x.resid, mm))\n", 396 | " \n", 397 | " return resids, factors" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "- Running the PCA we can now see the factors with equal values of opposite sign.\n", 405 | "- Same as in \"regular\" pairs trade where opposite sign is expressed by long/short.\n", 406 | "- PCA gives reversible results when X and Y are switched, linear regression does not." 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "# Create input array from cointegrated price series\n", 416 | "R = np.array([X,Y])\n", 417 | "\n", 418 | "# Run the PCA calculation\n", 419 | "residuals, factors = run_pca(R,log_prices=True)\n", 420 | "\n", 421 | "# Plot the residuals\n", 422 | "plt.plot(residuals[0],label='resid X')\n", 423 | "plt.plot(residuals[1],label='resid Y')\n", 424 | "plt.xlabel('time')\n", 425 | "plt.ylabel('residuals')\n", 426 | "plt.legend()\n", 427 | "plt.grid()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "As a side-node, observe that linear regression is not reversible.\n", 435 | "- Residuals are calculated as distances to fitting line along to y-axis.\n", 436 | "- In PCA residuals are calculated orthogonal to principal component." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "m = np.polyfit(X,Y,1)[0]\n", 446 | "m_rev = np.polyfit(Y,X,1)[0]\n", 447 | "print('Slope of regression:',m)\n", 448 | "print('Inverse slope of reverse regression:',1/m_rev)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "### Pairs-Trade Backtest\n", 456 | "- Sequentially step through time and instruments.\n", 457 | "- Calculate if z-score of residuals is large enough to trade.\n", 458 | "- If in trade, see if residuals have mean reverted enough to exit.\n", 459 | "- Calculate the pnl.\n", 460 | "\n", 461 | "(For simplicity we calculate the residuals first, thus introducing a forward-looking bias. This is to make the calcs faster and it is rectified later.)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "inpos = np.zeros(R.shape[0]) # side: long=+1; short=-1\n", 471 | "pnl = [0] # PnL vector\n", 472 | "bw = 2 # z-score threshold (bandwidth)\n", 473 | "op = {} # dict of entry prices\n", 474 | "\n", 475 | "# loop through time steps\n", 476 | "for i in range(len(residuals[0])):\n", 477 | " p = 0 # initialise pnl-calc for a particular time step\n", 478 | " \n", 479 | " # loop through instruments\n", 480 | " for inst in range(R.shape[0]):\n", 481 | " \n", 482 | " # calculate the z-score of residuals\n", 483 | " zs = Zscore(residuals[inst])[i]\n", 484 | " \n", 485 | " # Entry condition: z-score above bandwith and no position on\n", 486 | " if np.abs(zs)>bw and inpos[inst] == 0:\n", 487 | " op[inst] = R[inst,i] # record the open price\n", 488 | " inpos[inst] = zs # tell algo that we have a position\n", 489 | " \n", 490 | " # Exit condition: z-score has crossed zero and position on\n", 491 | " elif zs*np.sign(inpos[inst])<0:\n", 492 | " \n", 493 | " # Calculate pnl as (exit-entry)*side\n", 494 | " p+=((-R[inst,i]+op[inst])*np.sign(inpos[inst]))\n", 495 | " inpos[inst] = 0 # set side to zero\n", 496 | " \n", 497 | " # append the new pnl to vector\n", 498 | " pnl.append(p)\n", 499 | " \n", 500 | "# Plot the results of the backtest\n", 501 | "plt.plot(np.cumsum(pnl),'-')\n", 502 | "plt.xlabel('time')\n", 503 | "plt.ylabel('realised PnL')\n", 504 | "plt.show()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "### Simulate sector cointegration" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "- Simulated time series provide understandable results. \n", 519 | "- Stocks within a sector are often cointegrated.\n", 520 | "- Maths of \"Drunk and her dog\" generalisation shown below:\n", 521 | "\n", 522 | "\\begin{align}\n", 523 | "c_{ij} = \\Bigg\\{ \n", 524 | "\\begin{split}\n", 525 | "-a_{ij} \\quad for \\quad i \\leq j \\\\ \n", 526 | "a_{ij} \\quad for \\quad i \\geq j \\\\ \n", 527 | "-a_{ij} \\quad for \\quad i = j\n", 528 | "\\end{split}\n", 529 | "\\end{align}\n", 530 | "\n", 531 | "\\begin{align}\n", 532 | "X_{t}^{(i)}-X_{t-1}^{(i)} = \\sum_{j} c_{ij} X_{t-1}^{(j)} + \\epsilon_{i} \\quad with \\quad a_{ij} \\geq 0\n", 533 | "\\end{align}\n", 534 | "\n", 535 | "- *X* denotes the time series, *c* is the causality matrix.\n", 536 | "- *a* are the positive elements of the causality matrix. \n", 537 | "\n", 538 | "(Note that the *a's* denote the relationships between different series. We can simply use random numbers to start with. As we increase the number of series, we need to keep *a* small to avoid positive feedback scenarios.) \n", 539 | "\n", 540 | "Below the code that implements the above equations." 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "def make_coint_1(N,steps,a=0.1):\n", 550 | " X = [np.zeros(N)]\n", 551 | " \n", 552 | " # Create the causality matrix\n", 553 | " c = (np.tril(np.ones(N))-np.triu(np.ones(N))-np.diag(np.ones(N),0))*a #c = np.random.rand(N,N)*0.1\n", 554 | "\n", 555 | " # loop through time steps\n", 556 | " for i in range(steps):\n", 557 | " \n", 558 | " # Calculate the returns for each time series\n", 559 | " rx = (np.sum(c*X[-1],axis=1)+np.random.randn(N))\n", 560 | " \n", 561 | " # Add the new return to the last price of the time series\n", 562 | " X.append(X[-1]+rx)\n", 563 | " \n", 564 | " # return array of all series\n", 565 | " return np.array(X).T" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "Let's create a cointegrated pair with this technique.\n", 573 | "\n", 574 | "__Play with this by varying *a* and observe the results.__" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "np.random.seed(21)\n", 584 | "N = 3\n", 585 | "a1 = 0.1 # general case\n", 586 | "a2=[[0.02,0.1],[0.1,0.02]] # for N = 2\n", 587 | "a3=[[0.06,0.04,0.08],[0.06,0.06,0.04],[0.06,0.08,0.04]] # for N = 3\n", 588 | "X1 = make_coint_1(N,200,a=a1).T\n", 589 | "\n", 590 | "for i in range(N):\n", 591 | " plt.plot(X1[:,i])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "We can see we have produced a set of stationary time series, testing for cointegration we see that most of them are below the critical values." 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "print('Critical values:',coint(X1.T[0],X1.T[1])[2])\n", 608 | "for i in range(X1.T.shape[0]):\n", 609 | " for k in range(i,X1.T.shape[0]):\n", 610 | " if not i==k:\n", 611 | " print('t-stats for coint of series %s and %s:'%(i,k), coint(X1.T[i],X1.T[k])[0])\n", 612 | " " 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "### Sector-portfolio backtest\n", 620 | "\n", 621 | "There are many strategies we can deploy based on our techniques such as:\n", 622 | "- sort the z-scores of our factors and go long the lowest and short the N assets with the highest z-scores. \n", 623 | "- scale the position size of each instrument according to z-score.\n", 624 | "- only rebalance portfolio when sum of z-scores exceeds a threshold.\n", 625 | "\n", 626 | "All of them have their uses and they need to be tested on a case-by-case basis. Here, we choose the first example as shown below. This time we eliminate the forward-looking bias by recalculating the residuals at every time step." 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "def backtest(pr,max_pos=2,num_factors=1,initial_cash=1e6):\n", 636 | " entry = {} # create a vector of entry prices\n", 637 | " pnls = [] # create a pnl vector\n", 638 | " \n", 639 | " # Exit if we specified too large long/short position size\n", 640 | " if max_pos > pr.shape[0]/2:\n", 641 | " print('max_pos too large!')\n", 642 | " return\n", 643 | "\n", 644 | " # loop through the prices\n", 645 | " for i,pri in enumerate(pr.T):\n", 646 | " \n", 647 | " # Make sure you have enough data points for PCA\n", 648 | " if i < 50: continue\n", 649 | " \n", 650 | " # Run the PCA, only on the past prices\n", 651 | " resids, factors = run_pca(pr[:i],num_factors,log_prices=False)\n", 652 | " zs = {}\n", 653 | " \n", 654 | " # Calculate the z-scores for each instrument. \n", 655 | " for inst in range(len(pri)):\n", 656 | " zs[inst] = Zscore(resids[inst])[-1]\n", 657 | "\n", 658 | " pnl = 0\n", 659 | " # Calculate the Pnl for each position over the prevoius period\n", 660 | " for j,idx in enumerate(entry):\n", 661 | " \n", 662 | " # Calculate the position size\n", 663 | " pos = np.round((initial_cash/len(pri))/entry[idx])\n", 664 | " \n", 665 | " # Add up the pnls for all positions for the last period\n", 666 | " pnl += (pri[idx]-np.abs(entry[idx]))*pos\n", 667 | " pnls.append(pnl)\n", 668 | " \n", 669 | " # Reset the portfolio\n", 670 | " entry = {}\n", 671 | " \n", 672 | " # Find the new instruments to be traded based on their z-scores\n", 673 | " idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n", 674 | " idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n", 675 | " \n", 676 | " # Add them to the entry list\n", 677 | " for idx in idx_long:\n", 678 | " entry[idx] = pri[idx]\n", 679 | " for idx in idx_short:\n", 680 | " entry[idx] = -pri[idx]\n", 681 | " \n", 682 | " \n", 683 | " return(pnls)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "Let's now apply our statistical arbitrage system to a simple long/short pair in order to test if our system is working properly. First, let's produce a cointegrated time series and plot it to confirm its properties." 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "# Create a seed for consistency\n", 700 | "np.random.seed(27)\n", 701 | "N = 2 # Number of assets\n", 702 | "alpha = 0.1 # causality factor\n", 703 | "X1 = make_coint_1(N,500,a=np.random.rand(N,N)*alpha) + 50 " 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "plt.plot(X1.T[:,0])\n", 713 | "plt.plot(X1.T[:,1])\n", 714 | "coint(X1.T[:,0],X1.T[:,1])" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "- series is highly cointegrated, we expect to see a very good result.\n", 722 | "\n", 723 | "__Please run the same simulation with a less cointegrate series by lowering *alpha* to 0.01.__" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "# run the backtest\n", 733 | "pnls = backtest(X1,max_pos=1,num_factors=1,initial_cash=1e6)\n", 734 | "\n", 735 | "# plot the result\n", 736 | "plt.plot(np.cumsum(pnls));" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": {}, 743 | "outputs": [], 744 | "source": [ 745 | "pnls" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "### PCA Portfolio Trading\n", 753 | "\n", 754 | "- Application of strategy to larger portfolio.\n", 755 | "- Careful with the causality coefficients as large numbers of strong cross-dependencies can create positive feedback loops between the series. \n", 756 | "- Larger N - higher to probability of the feedack loops for a given *alpha*.\n", 757 | "\n", 758 | "__Please vary alpha in this exercise and observe how the behaviour of our time series changes.__" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "#np.random.seed(231)\n", 768 | "N = 10\n", 769 | "alpha = 0.03\n", 770 | "X2 = make_coint_1(N,100,a=np.random.rand(N,N)*alpha) + 50" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "for i in X2:\n", 780 | " plt.plot(i)" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Cumulative explained variance\n", 788 | "\n", 789 | "- 3 principle components explain 75% of variance\n", 790 | "- Too many components lead to very high correlation and very small PnL/trade\n", 791 | "- Enough PnL/trade to overcome trading costs" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [ 800 | "pca = PCA(n_components=10)\n", 801 | "pca.fit(np.log(X2))\n", 802 | "\n", 803 | "# Plot cumulative explained variance\n", 804 | "plt.plot(np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_),'-o')\n", 805 | "plt.grid()\n", 806 | "plt.xlabel('Component')\n", 807 | "plt.ylabel('Explained Variance')" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "- Useful to look at the average t-statistics between all possible pairs.\n", 815 | "- High average t-stats - good probability of strategy success.\n", 816 | "- Johansen test tends to perform poorly out-of-sample." 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "# Cointegration test:\n", 826 | "coints = []\n", 827 | "print('Critical values:',coint(X2[0],X2[1])[2])\n", 828 | "for i in range(X2.shape[0]):\n", 829 | " for k in range(i,X2.shape[0]):\n", 830 | " if not i==k:\n", 831 | " coints.append(coint(X2[i],X2[k])[0])\n", 832 | " \n", 833 | "print('Average coint t-stats:',np.mean(coints))" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "- Simulated data can give us an understanding how well our backtest performs under idealised conditions. \n", 841 | "\n", 842 | "- In the next part we are looking at applying this algorithm to real market data using Quantopian." 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "pnls = backtest(X2,max_pos=1,num_factors=1,initial_cash=1e6)\n", 852 | "plt.plot(np.cumsum(pnls));\n", 853 | "plt.show()" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": {}, 860 | "outputs": [], 861 | "source": [] 862 | } 863 | ], 864 | "metadata": { 865 | "kernelspec": { 866 | "display_name": "Python 3", 867 | "language": "python", 868 | "name": "tribo" 869 | }, 870 | "language_info": { 871 | "codemirror_mode": { 872 | "name": "ipython", 873 | "version": 3 874 | }, 875 | "file_extension": ".py", 876 | "mimetype": "text/x-python", 877 | "name": "python", 878 | "nbconvert_exporter": "python", 879 | "pygments_lexer": "ipython3", 880 | "version": "3.6.1" 881 | } 882 | }, 883 | "nbformat": 4, 884 | "nbformat_minor": 2 885 | } 886 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/PCA_StatArb-Old-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PCA Statistical Arbitrage\n", 8 | "\n", 9 | "(This notebook can be found on GitHub: https://github.com/rodler/quantinsti_statarb)\n", 10 | "\n", 11 | "### Dr Tom Starke \n", 12 | "\n", 13 | "*Homepage: www.aaaquants.com *\n", 14 | "\n", 15 | "*Email: tom@aaaquants.com *\n", 16 | "\n", 17 | "*Linkedin: Dr Tom Starke *\n", 18 | "\n", 19 | "### What we will learn:\n", 20 | "- Building a PCA manually\n", 21 | "- Conduct a pairs-trading backtest using PCA\n", 22 | "- Simulation of multiple cointegrated assets\n", 23 | "- Sector statistical arbitrage using PCA " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Factor Investing\n", 31 | "\n", 32 | "In factor investing we aim to find time-dependent signals that are correlated to future returns. These factors could be:\n", 33 | "- technical indicators (e.g. difference of two moving averages)\n", 34 | "- fundamental factors (e.g. company data such as P/E ratio)\n", 35 | "- macro factors (e.g. interest rates)\n", 36 | "- abstract factors (e.g. PCA)\n", 37 | "\n", 38 | "Abstract factors derive from other factors. Assume, we have N factors, which have some degree of correlation, we can construct M both are random walks (drunk does not own dog).\n", 277 | "- if one c is zero and the other non-zero there is a one way causality (drunk owns dog).\n", 278 | "- if both c are non-zero there is two-way causality (dog sometimes pulls drunk)." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "def make_coint_0(N,T0=[0,0],sigma=[1,1],c=[0.1,0.1]):\n", 288 | " '''\n", 289 | " Algorithm from:\n", 290 | " https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n", 291 | " '''\n", 292 | " X = [0]\n", 293 | " Y = [0]\n", 294 | " for i in range(N):\n", 295 | " rx = np.random.randn()*sigma[0] - c[0]*(X[-1] - Y[-1])\n", 296 | " ry = np.random.randn()*sigma[1] + c[1]*(X[-1] - Y[-1])\n", 297 | " X.append(X[-1]+rx)\n", 298 | " Y.append(Y[-1]+ry)\n", 299 | " return np.array(X)+T0[0],np.array(Y)+T0[1]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "When plotting X and Y we can see that they follow each other closely.\n", 307 | "Now, vary c as follows and observe what happens:\n", 308 | "- c = [ 0.9, 0.0 ]\n", 309 | "- c = [ 0.1, 0.1 ]\n", 310 | "- c = [ 0.1, 0.9 ]\n", 311 | "- c = [ 0.0 , 0.0]" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "np.random.seed(452)\n", 321 | "X,Y = make_coint_0(200,T0=[50,50],c=[0.1,0.1])\n", 322 | "plt.plot(X,'r-',Y,'b-');" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "Assessing the quality of our cointegration:\n", 330 | "- Critical values for 0.1, 0.05 and 0.01.\n", 331 | "- T-statistic should be below crit." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "crit = coint(X,Y)\n", 341 | "print('Critical Values:',crit[2])\n", 342 | "print('T-statistic:',crit[0])\n", 343 | "print('P-value:',crit[1])" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### Application of PCA to pairs trading:\n", 351 | "- Use the sklearn PCA package to generate components.\n", 352 | "- Linear regression with the price data.\n", 353 | "- Z-score the residual to normalise for varying price levels and volatility.\n", 354 | "- Trade when residual sufficiently deviates from mean. \n", 355 | "- Use of log prices can help to mitigate large price swings (e.g. in penny stocks)\n", 356 | "\n", 357 | "Below the code for the PCA pairs trade:" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "def Zscore(X):\n", 367 | " return np.array((X - np.mean(X)) / np.std(X))\n", 368 | "\n", 369 | "def run_pca(pr,components=1,log_prices=True):\n", 370 | " \n", 371 | " # Instanciate PCA \n", 372 | " pca = PCA(n_components=components)\n", 373 | " px = pr.T-np.mean(pr.T)\n", 374 | " \n", 375 | " if log_prices:\n", 376 | " \n", 377 | " # Calculate the priciple components using log prices\n", 378 | " comps = pca.fit(np.log(pr.T)).components_.T\n", 379 | " \n", 380 | " # Create the factors from the pricinple components\n", 381 | " factors = sm.add_constant(pr.T.dot(comps))\n", 382 | " else:\n", 383 | " \n", 384 | " # Calculate the N priciple components using normal prices\n", 385 | " comps = pca.fit(px).components_.T\n", 386 | " \n", 387 | " # Create the factors from the pricinple components\n", 388 | " factors = sm.add_constant(px.dot(comps)) \n", 389 | "\n", 390 | " \n", 391 | " # Regress the factors with the actual prices\n", 392 | " mm = [sm.OLS(s.T, factors).fit() for s in pr]\n", 393 | " \n", 394 | " # Calculate the residuals\n", 395 | " resids = list(map(lambda x: x.resid, mm))\n", 396 | " \n", 397 | " return resids, factors" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "- Running the PCA we can now see the factors with equal values of opposite sign.\n", 405 | "- Same as in \"regular\" pairs trade where opposite sign is expressed by long/short.\n", 406 | "- PCA gives reversible results when X and Y are switched, linear regression does not." 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "# Create input array from cointegrated price series\n", 416 | "R = np.array([X,Y])\n", 417 | "\n", 418 | "# Run the PCA calculation\n", 419 | "residuals, factors = run_pca(R,log_prices=True)\n", 420 | "\n", 421 | "# Plot the residuals\n", 422 | "plt.plot(residuals[0],label='resid X')\n", 423 | "plt.plot(residuals[1],label='resid Y')\n", 424 | "plt.xlabel('time')\n", 425 | "plt.ylabel('residuals')\n", 426 | "plt.legend()\n", 427 | "plt.grid()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "As a side-node, observe that linear regression is not reversible.\n", 435 | "- Residuals are calculated as distances to fitting line along to y-axis.\n", 436 | "- In PCA residuals are calculated orthogonal to principal component." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "m = np.polyfit(X,Y,1)[0]\n", 446 | "m_rev = np.polyfit(Y,X,1)[0]\n", 447 | "print('Slope of regression:',m)\n", 448 | "print('Inverse slope of reverse regression:',1/m_rev)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "### Pairs-Trade Backtest\n", 456 | "- Sequentially step through time and instruments.\n", 457 | "- Calculate if z-score of residuals is large enough to trade.\n", 458 | "- If in trade, see if residuals have mean reverted enough to exit.\n", 459 | "- Calculate the pnl.\n", 460 | "\n", 461 | "(For simplicity we calculate the residuals first, thus introducing a forward-looking bias. This is to make the calcs faster and it is rectified later.)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "inpos = np.zeros(R.shape[0]) # side: long=+1; short=-1\n", 471 | "pnl = [0] # PnL vector\n", 472 | "bw = 2 # z-score threshold (bandwidth)\n", 473 | "op = {} # dict of entry prices\n", 474 | "\n", 475 | "# loop through time steps\n", 476 | "for i in range(len(residuals[0])):\n", 477 | " p = 0 # initialise pnl-calc for a particular time step\n", 478 | " \n", 479 | " # loop through instruments\n", 480 | " for inst in range(R.shape[0]):\n", 481 | " \n", 482 | " # calculate the z-score of residuals\n", 483 | " zs = Zscore(residuals[inst])[i]\n", 484 | " \n", 485 | " # Entry condition: z-score above bandwith and no position on\n", 486 | " if np.abs(zs)>bw and inpos[inst] == 0:\n", 487 | " op[inst] = R[inst,i] # record the open price\n", 488 | " inpos[inst] = zs # tell algo that we have a position\n", 489 | " \n", 490 | " # Exit condition: z-score has crossed zero and position on\n", 491 | " elif zs*np.sign(inpos[inst])<0:\n", 492 | " \n", 493 | " # Calculate pnl as (exit-entry)*side\n", 494 | " p+=((-R[inst,i]+op[inst])*np.sign(inpos[inst]))\n", 495 | " inpos[inst] = 0 # set side to zero\n", 496 | " \n", 497 | " # append the new pnl to vector\n", 498 | " pnl.append(p)\n", 499 | " \n", 500 | "# Plot the results of the backtest\n", 501 | "plt.plot(np.cumsum(pnl),'-')\n", 502 | "plt.xlabel('time')\n", 503 | "plt.ylabel('realised PnL')\n", 504 | "plt.show()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "### Simulate sector cointegration" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "- Simulated time series provide understandable results. \n", 519 | "- Stocks within a sector are often cointegrated.\n", 520 | "- Maths of \"Drunk and her dog\" generalisation shown below:\n", 521 | "\n", 522 | "\\begin{align}\n", 523 | "c_{ij} = \\Bigg\\{ \n", 524 | "\\begin{split}\n", 525 | "-a_{ij} \\quad for \\quad i \\leq j \\\\ \n", 526 | "a_{ij} \\quad for \\quad i \\geq j \\\\ \n", 527 | "-a_{ij} \\quad for \\quad i = j\n", 528 | "\\end{split}\n", 529 | "\\end{align}\n", 530 | "\n", 531 | "\\begin{align}\n", 532 | "X_{t}^{(i)}-X_{t-1}^{(i)} = \\sum_{j} c_{ij} X_{t-1}^{(j)} + \\epsilon_{i} \\quad with \\quad a_{ij} \\geq 0\n", 533 | "\\end{align}\n", 534 | "\n", 535 | "- *X* denotes the time series, *c* is the causality matrix.\n", 536 | "- *a* are the positive elements of the causality matrix. \n", 537 | "\n", 538 | "(Note that the *a's* denote the relationships between different series. We can simply use random numbers to start with. As we increase the number of series, we need to keep *a* small to avoid positive feedback scenarios.) \n", 539 | "\n", 540 | "Below the code that implements the above equations." 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "def make_coint_1(N,steps,a=0.1):\n", 550 | " X = [np.zeros(N)]\n", 551 | " \n", 552 | " # Create the causality matrix\n", 553 | " c = (np.tril(np.ones(N))-np.triu(np.ones(N))-np.diag(np.ones(N),0))*a #c = np.random.rand(N,N)*0.1\n", 554 | "\n", 555 | " # loop through time steps\n", 556 | " for i in range(steps):\n", 557 | " \n", 558 | " # Calculate the returns for each time series\n", 559 | " rx = (np.sum(c*X[-1],axis=1)+np.random.randn(N))\n", 560 | " \n", 561 | " # Add the new return to the last price of the time series\n", 562 | " X.append(X[-1]+rx)\n", 563 | " \n", 564 | " # return array of all series\n", 565 | " return np.array(X).T" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "Let's create a cointegrated pair with this technique.\n", 573 | "\n", 574 | "__Play with this by varying *a* and observe the results.__" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "np.random.seed(21)\n", 584 | "N = 3\n", 585 | "a1 = 0.1 # general case\n", 586 | "a2=[[0.02,0.1],[0.1,0.02]] # for N = 2\n", 587 | "a3=[[0.06,0.04,0.08],[0.06,0.06,0.04],[0.06,0.08,0.04]] # for N = 3\n", 588 | "X1 = make_coint_1(N,200,a=a1).T\n", 589 | "\n", 590 | "for i in range(N):\n", 591 | " plt.plot(X1[:,i])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "We can see we have produced a set of stationary time series, testing for cointegration we see that most of them are below the critical values." 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "print('Critical values:',coint(X1.T[0],X1.T[1])[2])\n", 608 | "for i in range(X1.T.shape[0]):\n", 609 | " for k in range(i,X1.T.shape[0]):\n", 610 | " if not i==k:\n", 611 | " print('t-stats for coint of series %s and %s:'%(i,k), coint(X1.T[i],X1.T[k])[0])\n", 612 | " " 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "### Sector-portfolio backtest\n", 620 | "\n", 621 | "There are many strategies we can deploy based on our techniques such as:\n", 622 | "- sort the z-scores of our factors and go long the lowest and short the N assets with the highest z-scores. \n", 623 | "- scale the position size of each instrument according to z-score.\n", 624 | "- only rebalance portfolio when sum of z-scores exceeds a threshold.\n", 625 | "\n", 626 | "All of them have their uses and they need to be tested on a case-by-case basis. Here, we choose the first example as shown below. This time we eliminate the forward-looking bias by recalculating the residuals at every time step." 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "def backtest(pr,max_pos=2,num_factors=1,initial_cash=1e6):\n", 636 | " entry = {} # create a vector of entry prices\n", 637 | " pnls = [] # create a pnl vector\n", 638 | " \n", 639 | " # Exit if we specified too large long/short position size\n", 640 | " if max_pos > pr.shape[0]/2:\n", 641 | " print('max_pos too large!')\n", 642 | " return\n", 643 | "\n", 644 | " # loop through the prices\n", 645 | " for i,pri in enumerate(pr.T):\n", 646 | " \n", 647 | " # Make sure you have enough data points for PCA\n", 648 | " if i < 50: continue\n", 649 | " \n", 650 | " # Run the PCA, only on the past prices\n", 651 | " resids, factors = run_pca(pr[:i],num_factors,log_prices=False)\n", 652 | " zs = {}\n", 653 | " \n", 654 | " # Calculate the z-scores for each instrument. \n", 655 | " for inst in range(len(pri)):\n", 656 | " zs[inst] = Zscore(resids[inst])[-1]\n", 657 | "\n", 658 | " pnl = 0\n", 659 | " # Calculate the Pnl for each position over the prevoius period\n", 660 | " for j,idx in enumerate(entry):\n", 661 | " \n", 662 | " # Calculate the position size\n", 663 | " pos = np.round((initial_cash/len(pri))/entry[idx])\n", 664 | " \n", 665 | " # Add up the pnls for all positions for the last period\n", 666 | " pnl += (pri[idx]-np.abs(entry[idx]))*pos\n", 667 | " pnls.append(pnl)\n", 668 | " \n", 669 | " # Reset the portfolio\n", 670 | " entry = {}\n", 671 | " \n", 672 | " # Find the new instruments to be traded based on their z-scores\n", 673 | " idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n", 674 | " idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n", 675 | " \n", 676 | " # Add them to the entry list\n", 677 | " for idx in idx_long:\n", 678 | " entry[idx] = pri[idx]\n", 679 | " for idx in idx_short:\n", 680 | " entry[idx] = -pri[idx]\n", 681 | " \n", 682 | " \n", 683 | " return(pnls)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "Let's now apply our statistical arbitrage system to a simple long/short pair in order to test if our system is working properly. First, let's produce a cointegrated time series and plot it to confirm its properties." 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "# Create a seed for consistency\n", 700 | "np.random.seed(27)\n", 701 | "N = 2 # Number of assets\n", 702 | "alpha = 0.1 # causality factor\n", 703 | "X1 = make_coint_1(N,500,a=np.random.rand(N,N)*alpha) + 50 " 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "plt.plot(X1.T[:,0])\n", 713 | "plt.plot(X1.T[:,1])\n", 714 | "coint(X1.T[:,0],X1.T[:,1])" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "- series is highly cointegrated, we expect to see a very good result.\n", 722 | "\n", 723 | "__Please run the same simulation with a less cointegrate series by lowering *alpha* to 0.01.__" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "# run the backtest\n", 733 | "pnls = backtest(X1,max_pos=1,num_factors=1,initial_cash=1e6)\n", 734 | "\n", 735 | "# plot the result\n", 736 | "plt.plot(np.cumsum(pnls));" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": {}, 743 | "outputs": [], 744 | "source": [ 745 | "pnls" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "### PCA Portfolio Trading\n", 753 | "\n", 754 | "- Application of strategy to larger portfolio.\n", 755 | "- Careful with the causality coefficients as large numbers of strong cross-dependencies can create positive feedback loops between the series. \n", 756 | "- Larger N - higher to probability of the feedack loops for a given *alpha*.\n", 757 | "\n", 758 | "__Please vary alpha in this exercise and observe how the behaviour of our time series changes.__" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "#np.random.seed(231)\n", 768 | "N = 10\n", 769 | "alpha = 0.03\n", 770 | "X2 = make_coint_1(N,100,a=np.random.rand(N,N)*alpha) + 50" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "for i in X2:\n", 780 | " plt.plot(i)" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Cumulative explained variance\n", 788 | "\n", 789 | "- 3 principle components explain 75% of variance\n", 790 | "- Too many components lead to very high correlation and very small PnL/trade\n", 791 | "- Enough PnL/trade to overcome trading costs" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [ 800 | "pca = PCA(n_components=10)\n", 801 | "pca.fit(np.log(X2))\n", 802 | "\n", 803 | "# Plot cumulative explained variance\n", 804 | "plt.plot(np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_),'-o')\n", 805 | "plt.grid()\n", 806 | "plt.xlabel('Component')\n", 807 | "plt.ylabel('Explained Variance')" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "- Useful to look at the average t-statistics between all possible pairs.\n", 815 | "- High average t-stats - good probability of strategy success.\n", 816 | "- Johansen test tends to perform poorly out-of-sample." 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "# Cointegration test:\n", 826 | "coints = []\n", 827 | "print('Critical values:',coint(X2[0],X2[1])[2])\n", 828 | "for i in range(X2.shape[0]):\n", 829 | " for k in range(i,X2.shape[0]):\n", 830 | " if not i==k:\n", 831 | " coints.append(coint(X2[i],X2[k])[0])\n", 832 | " \n", 833 | "print('Average coint t-stats:',np.mean(coints))" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "- Simulated data can give us an understanding how well our backtest performs under idealised conditions. \n", 841 | "\n", 842 | "- In the next part we are looking at applying this algorithm to real market data using Quantopian." 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "pnls = backtest(X2,max_pos=1,num_factors=1,initial_cash=1e6)\n", 852 | "plt.plot(np.cumsum(pnls));\n", 853 | "plt.show()" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": {}, 860 | "outputs": [], 861 | "source": [] 862 | } 863 | ], 864 | "metadata": { 865 | "kernelspec": { 866 | "display_name": "Python 3", 867 | "language": "python", 868 | "name": "tribo" 869 | }, 870 | "language_info": { 871 | "codemirror_mode": { 872 | "name": "ipython", 873 | "version": 3 874 | }, 875 | "file_extension": ".py", 876 | "mimetype": "text/x-python", 877 | "name": "python", 878 | "nbconvert_exporter": "python", 879 | "pygments_lexer": "ipython3", 880 | "version": "3.6.1" 881 | } 882 | }, 883 | "nbformat": 4, 884 | "nbformat_minor": 2 885 | } 886 | --------------------------------------------------------------------------------