├── .gitignore
├── pnls.pick
├── marketdata.pick
├── vol_curves.pick
├── README.md
├── PCA_QT.ipynb
├── .ipynb_checkpoints
    ├── PCA_QT-checkpoint.ipynb
    ├── PCA_StatArb-checkpoint.ipynb
    └── PCA_StatArb-Old-checkpoint.ipynb
└── PCA_StatArb-Old.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | marketdata.csv
2 | parse_mdata.py
3 | PCA_StatArb-Old.ipynb
4 | 


--------------------------------------------------------------------------------
/pnls.pick:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodler/quantinsti_statarb/HEAD/pnls.pick


--------------------------------------------------------------------------------
/marketdata.pick:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodler/quantinsti_statarb/HEAD/marketdata.pick


--------------------------------------------------------------------------------
/vol_curves.pick:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rodler/quantinsti_statarb/HEAD/vol_curves.pick


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # quantinsti_statarb
2 | 
3 | This is an introductory lecture for Quantinti.
4 | It is part of a webinar and not self-sufficient.
5 | If you have questions please contact
6 | Dr Tom Starke from AAAQuants.
7 | 


--------------------------------------------------------------------------------
/PCA_QT.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pandas as pd\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "import time\n",
 13 |     "import datetime\n",
 14 |     "from quantopian.pipeline.classifiers.morningstar import Sector\n",
 15 |     "from quantopian.pipeline import Pipeline\n",
 16 |     "from quantopian.pipeline.data.builtin import USEquityPricing\n",
 17 |     "from quantopian.research import run_pipeline\n",
 18 |     "from quantopian.pipeline.data import morningstar, Fundamentals\n",
 19 |     "from quantopian.pipeline.factors import CustomFactor,AverageDollarVolume,SimpleMovingAverage, ExponentialWeightedMovingAverage, EWMA\n",
 20 |     "from quantopian.pipeline.filters.morningstar import IsPrimaryShare\n",
 21 |     "from quantopian.pipeline.factors import AverageDollarVolume\n",
 22 |     "from quantopian.pipeline.factors.morningstar import MarketCap\n",
 23 |     "from quantopian.pipeline.experimental import QTradableStocksUS\n",
 24 |     "from statsmodels.tsa.stattools import coint\n",
 25 |     "from scipy import stats as stats\n",
 26 |     "from sklearn.linear_model import LinearRegression\n",
 27 |     "from sklearn.decomposition import PCA\n",
 28 |     "import scipy\n",
 29 |     "import statsmodels.api as sm"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "def make_pipe(start,end):        \n",
 39 |     "    mcap = MarketCap()\n",
 40 |     "    min_mcap = mcap > 5e10\n",
 41 |     "    \n",
 42 |     "    price = USEquityPricing.close.latest\n",
 43 |     "    can_trade = QTradableStocksUS()\n",
 44 |     "    sector = Sector()\n",
 45 |     "    \n",
 46 |     "    asset_filter = can_trade & min_mcap\n",
 47 |     "    pipe = Pipeline(screen = asset_filter)\n",
 48 |     "\n",
 49 |     "    pipe.add(price,'price')\n",
 50 |     "    pipe.add(sector,\"Sector\")\n",
 51 |     "    \n",
 52 |     "\n",
 53 |     "    res2 = run_pipeline(pipe, start, end)\n",
 54 |     "    return res2"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "MORNINGSTAR_SECTOR_CODES = {  \n",
 64 |     "     -1: 'Misc',  \n",
 65 |     "    101: 'Basic Materials',  \n",
 66 |     "    102: 'Consumer Cyclical',  \n",
 67 |     "    103: 'Financial Services',  \n",
 68 |     "    104: 'Real Estate',  \n",
 69 |     "    205: 'Consumer Defensive',  \n",
 70 |     "    206: 'Healthcare',  \n",
 71 |     "    207: 'Utilities',  \n",
 72 |     "    308: 'Communication Services',  \n",
 73 |     "    309: 'Energy',  \n",
 74 |     "    310: 'Industrials',  \n",
 75 |     "    311: 'Technology' ,  \n",
 76 |     "}"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "end = datetime.datetime(2019,5,5)  - datetime.timedelta(2)\n",
 86 |     "start = datetime.datetime(2019,5,5) - datetime.timedelta(18)\n",
 87 |     "result = make_pipe(start,end)\n",
 88 |     "result.head()"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "date = result.index[0][0]\n",
 98 |     "\n",
 99 |     "companies = []\n",
100 |     "for company in result.loc[date]['Sector'].index:\n",
101 |     "    if result.loc[date]['Sector'][company] ==  311:\n",
102 |     "#     if result.loc[date]['Sector'][company] ==  206:\n",
103 |     "#     if result.loc[date]['Sector'][company] ==  310:\n",
104 |     "        companies.append(company)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "companies"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "prices = get_pricing(companies, start_date=start, end_date=end, frequency='minute', fields='price')"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "coints = []\n",
132 |     "corrs = []\n",
133 |     "for c1 in range(len(companies)):\n",
134 |     "    for c2 in range(c1,len(companies)):\n",
135 |     "        if c1==c2: continue\n",
136 |     "        coints.append(coint(prices[companies[c1]],prices[companies[c2]])[0])\n",
137 |     "        corrs.append(np.corrcoef(prices[companies[c1]],prices[companies[c2]])[0][1])\n",
138 |     "        print(c1,c2,coints[-1])"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "print(np.mean(coints),np.mean(corrs))"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "def backtest(prices,max_pos=1,num_factors=1,initial_cash=1e6,lkbk=500):\n",
157 |     "    pr = np.asarray(prices.T)\n",
158 |     "    entry = {}\n",
159 |     "    pnls = []\n",
160 |     "    dates = []\n",
161 |     "    #resids = run_pca(pr,num_factors)\n",
162 |     "    \n",
163 |     "    if max_pos > pr.shape[0]/2:\n",
164 |     "        print('max_pos too large!')\n",
165 |     "        return\n",
166 |     "\n",
167 |     "    for i,pri in enumerate(pr.T):\n",
168 |     "\n",
169 |     "        if i < 60: continue\n",
170 |     " \n",
171 |     "        resids, factors = run_pca(pr[:,max(0,i-lkbk):i],num_factors,log_prices=True)\n",
172 |     "        zs = {}\n",
173 |     "        for inst in range(len(pri)):\n",
174 |     "            #zs[inst] = Zscore(resids[inst])[i]\n",
175 |     "            zs[inst] = Zscore(resids[inst])[-1]\n",
176 |     "\n",
177 |     "        idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n",
178 |     "        idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n",
179 |     "        \n",
180 |     "        pnl = 0\n",
181 |     "        for j,idx in enumerate(entry):\n",
182 |     "            wgt = np.round((initial_cash/len(pri))/entry[idx])\n",
183 |     "            #pnl += ((pri[idx]-np.abs(entry[idx]))/np.abs(entry[idx]))*wgt/initial_cash\n",
184 |     "            pnl += ((pri[idx]-np.abs(entry[idx])))*wgt\n",
185 |     "            #print pnl\n",
186 |     "        pnls.append(pnl)\n",
187 |     "        dates.append(prices.index[i])\n",
188 |     "            \n",
189 |     "        entry = {}\n",
190 |     "        \n",
191 |     "\n",
192 |     "        #print(idx_long, idx_short)\n",
193 |     "        for idx in idx_long:\n",
194 |     "            entry[idx] = pri[idx]\n",
195 |     "        for idx in idx_short:\n",
196 |     "            entry[idx] = -pri[idx]\n",
197 |     "        #print(i,entry)\n",
198 |     "        \n",
199 |     "        print(i,sum(pnls))\n",
200 |     "    return pnls,dates"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "def Zscore(X):\n",
210 |     "    return np.array((X - np.mean(X)) / np.std(X))"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "def run_pca(pr,components=1,log_prices=True):\n",
220 |     "    pca = PCA(n_components=components)\n",
221 |     "    if log_prices:\n",
222 |     "        comps = pca.fit(np.log(pr.T)).components_.T\n",
223 |     "    else:\n",
224 |     "        comps = pca.fit(pr.T).components_.T\n",
225 |     "    factors = sm.add_constant(pr.T.dot(comps))\n",
226 |     "    mm = [sm.OLS(s.T, factors).fit() for s in pr]\n",
227 |     "    resids = list(map(lambda x: x.resid, mm))\n",
228 |     "    return resids, factors"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "for p in np.asarray(prices.T):\n",
238 |     "    plt.plot((p-p[0])/np.std(p))"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {
245 |     "scrolled": true
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "pnls,dates = backtest(prices,max_pos=2,num_factors=2,initial_cash=1e6,lkbk=400)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "plt.plot(np.cumsum(pnls));"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": [
267 |     "plt.plot(dates,np.cumsum(pnls));"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": null,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": []
276 |   }
277 |  ],
278 |  "metadata": {
279 |   "kernelspec": {
280 |    "display_name": "Python 3",
281 |    "language": "python",
282 |    "name": "python3"
283 |   },
284 |   "language_info": {
285 |    "codemirror_mode": {
286 |     "name": "ipython",
287 |     "version": 3
288 |    },
289 |    "file_extension": ".py",
290 |    "mimetype": "text/x-python",
291 |    "name": "python",
292 |    "nbconvert_exporter": "python",
293 |    "pygments_lexer": "ipython3",
294 |    "version": "3.6.1"
295 |   }
296 |  },
297 |  "nbformat": 4,
298 |  "nbformat_minor": 2
299 | }
300 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/PCA_QT-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pandas as pd\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "import time\n",
 13 |     "import datetime\n",
 14 |     "from quantopian.pipeline.classifiers.morningstar import Sector\n",
 15 |     "from quantopian.pipeline import Pipeline\n",
 16 |     "from quantopian.pipeline.data.builtin import USEquityPricing\n",
 17 |     "from quantopian.research import run_pipeline\n",
 18 |     "from quantopian.pipeline.data import morningstar, Fundamentals\n",
 19 |     "from quantopian.pipeline.factors import CustomFactor,AverageDollarVolume,SimpleMovingAverage, ExponentialWeightedMovingAverage, EWMA\n",
 20 |     "from quantopian.pipeline.filters.morningstar import IsPrimaryShare\n",
 21 |     "from quantopian.pipeline.factors import AverageDollarVolume\n",
 22 |     "from quantopian.pipeline.factors.morningstar import MarketCap\n",
 23 |     "from quantopian.pipeline.experimental import QTradableStocksUS\n",
 24 |     "from statsmodels.tsa.stattools import coint\n",
 25 |     "from scipy import stats as stats\n",
 26 |     "from sklearn.linear_model import LinearRegression\n",
 27 |     "from sklearn.decomposition import PCA\n",
 28 |     "import scipy\n",
 29 |     "import statsmodels.api as sm"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "def make_pipe(start,end):        \n",
 39 |     "    mcap = MarketCap()\n",
 40 |     "    min_mcap = mcap > 5e10\n",
 41 |     "    \n",
 42 |     "    price = USEquityPricing.close.latest\n",
 43 |     "    can_trade = QTradableStocksUS()\n",
 44 |     "    sector = Sector()\n",
 45 |     "    \n",
 46 |     "    asset_filter = can_trade & min_mcap\n",
 47 |     "    pipe = Pipeline(screen = asset_filter)\n",
 48 |     "\n",
 49 |     "    pipe.add(price,'price')\n",
 50 |     "    pipe.add(sector,\"Sector\")\n",
 51 |     "    \n",
 52 |     "\n",
 53 |     "    res2 = run_pipeline(pipe, start, end)\n",
 54 |     "    return res2"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "MORNINGSTAR_SECTOR_CODES = {  \n",
 64 |     "     -1: 'Misc',  \n",
 65 |     "    101: 'Basic Materials',  \n",
 66 |     "    102: 'Consumer Cyclical',  \n",
 67 |     "    103: 'Financial Services',  \n",
 68 |     "    104: 'Real Estate',  \n",
 69 |     "    205: 'Consumer Defensive',  \n",
 70 |     "    206: 'Healthcare',  \n",
 71 |     "    207: 'Utilities',  \n",
 72 |     "    308: 'Communication Services',  \n",
 73 |     "    309: 'Energy',  \n",
 74 |     "    310: 'Industrials',  \n",
 75 |     "    311: 'Technology' ,  \n",
 76 |     "}"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "end = datetime.datetime(2019,5,5)  - datetime.timedelta(2)\n",
 86 |     "start = datetime.datetime(2019,5,5) - datetime.timedelta(18)\n",
 87 |     "result = make_pipe(start,end)\n",
 88 |     "result.head()"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "date = result.index[0][0]\n",
 98 |     "\n",
 99 |     "companies = []\n",
100 |     "for company in result.loc[date]['Sector'].index:\n",
101 |     "    if result.loc[date]['Sector'][company] ==  311:\n",
102 |     "#     if result.loc[date]['Sector'][company] ==  206:\n",
103 |     "#     if result.loc[date]['Sector'][company] ==  310:\n",
104 |     "        companies.append(company)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "companies"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "prices = get_pricing(companies, start_date=start, end_date=end, frequency='minute', fields='price')"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "coints = []\n",
132 |     "corrs = []\n",
133 |     "for c1 in range(len(companies)):\n",
134 |     "    for c2 in range(c1,len(companies)):\n",
135 |     "        if c1==c2: continue\n",
136 |     "        coints.append(coint(prices[companies[c1]],prices[companies[c2]])[0])\n",
137 |     "        corrs.append(np.corrcoef(prices[companies[c1]],prices[companies[c2]])[0][1])\n",
138 |     "        print(c1,c2,coints[-1])"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "print(np.mean(coints),np.mean(corrs))"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "def backtest(prices,max_pos=1,num_factors=1,initial_cash=1e6,lkbk=500):\n",
157 |     "    pr = np.asarray(prices.T)\n",
158 |     "    entry = {}\n",
159 |     "    pnls = []\n",
160 |     "    dates = []\n",
161 |     "    #resids = run_pca(pr,num_factors)\n",
162 |     "    \n",
163 |     "    if max_pos > pr.shape[0]/2:\n",
164 |     "        print('max_pos too large!')\n",
165 |     "        return\n",
166 |     "\n",
167 |     "    for i,pri in enumerate(pr.T):\n",
168 |     "\n",
169 |     "        if i < 60: continue\n",
170 |     " \n",
171 |     "        resids, factors = run_pca(pr[:,max(0,i-lkbk):i],num_factors,log_prices=True)\n",
172 |     "        zs = {}\n",
173 |     "        for inst in range(len(pri)):\n",
174 |     "            #zs[inst] = Zscore(resids[inst])[i]\n",
175 |     "            zs[inst] = Zscore(resids[inst])[-1]\n",
176 |     "\n",
177 |     "        idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n",
178 |     "        idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n",
179 |     "        \n",
180 |     "        pnl = 0\n",
181 |     "        for j,idx in enumerate(entry):\n",
182 |     "            wgt = np.round((initial_cash/len(pri))/entry[idx])\n",
183 |     "            #pnl += ((pri[idx]-np.abs(entry[idx]))/np.abs(entry[idx]))*wgt/initial_cash\n",
184 |     "            pnl += ((pri[idx]-np.abs(entry[idx])))*wgt\n",
185 |     "            #print pnl\n",
186 |     "        pnls.append(pnl)\n",
187 |     "        dates.append(prices.index[i])\n",
188 |     "            \n",
189 |     "        entry = {}\n",
190 |     "        \n",
191 |     "\n",
192 |     "        #print(idx_long, idx_short)\n",
193 |     "        for idx in idx_long:\n",
194 |     "            entry[idx] = pri[idx]\n",
195 |     "        for idx in idx_short:\n",
196 |     "            entry[idx] = -pri[idx]\n",
197 |     "        #print(i,entry)\n",
198 |     "        \n",
199 |     "        print(i,sum(pnls))\n",
200 |     "    return pnls,dates"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "def Zscore(X):\n",
210 |     "    return np.array((X - np.mean(X)) / np.std(X))"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "def run_pca(pr,components=1,log_prices=True):\n",
220 |     "    pca = PCA(n_components=components)\n",
221 |     "    if log_prices:\n",
222 |     "        comps = pca.fit(np.log(pr.T)).components_.T\n",
223 |     "    else:\n",
224 |     "        comps = pca.fit(pr.T).components_.T\n",
225 |     "    factors = sm.add_constant(pr.T.dot(comps))\n",
226 |     "    mm = [sm.OLS(s.T, factors).fit() for s in pr]\n",
227 |     "    resids = list(map(lambda x: x.resid, mm))\n",
228 |     "    return resids, factors"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "for p in np.asarray(prices.T):\n",
238 |     "    plt.plot((p-p[0])/np.std(p))"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {
245 |     "scrolled": true
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "pnls,dates = backtest(prices,max_pos=2,num_factors=2,initial_cash=1e6,lkbk=400)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "plt.plot(np.cumsum(pnls));"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": [
267 |     "plt.plot(dates,np.cumsum(pnls));"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": null,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": []
276 |   }
277 |  ],
278 |  "metadata": {
279 |   "kernelspec": {
280 |    "display_name": "Python 3",
281 |    "language": "python",
282 |    "name": "python3"
283 |   },
284 |   "language_info": {
285 |    "codemirror_mode": {
286 |     "name": "ipython",
287 |     "version": 3
288 |    },
289 |    "file_extension": ".py",
290 |    "mimetype": "text/x-python",
291 |    "name": "python",
292 |    "nbconvert_exporter": "python",
293 |    "pygments_lexer": "ipython3",
294 |    "version": "3.6.1"
295 |   }
296 |  },
297 |  "nbformat": 4,
298 |  "nbformat_minor": 2
299 | }
300 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/PCA_StatArb-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# PCA Statistical Arbitrage\n",
  8 |     "\n",
  9 |     "(This notebook can be found on GitHub: https://github.com/rodler/quantinsti_statarb)\n",
 10 |     "\n",
 11 |     "### Dr Tom Starke \n",
 12 |     "\n",
 13 |     "*Homepage: www.aaaquants.com *\n",
 14 |     "\n",
 15 |     "*Email: tom@aaaquants.com *\n",
 16 |     "\n",
 17 |     "*Linkedin: Dr Tom Starke *\n",
 18 |     "\n",
 19 |     "### What we will learn:\n",
 20 |     "- Building a PCA manually\n",
 21 |     "- Conduct a pairs-trading backtest using PCA\n",
 22 |     "- Simulation of multiple cointegrated assets\n",
 23 |     "- Sector statistical arbitrage using PCA "
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "### Factor Investing\n",
 31 |     "\n",
 32 |     "In factor investing we aim to find time-dependent signals that are correlated to future returns. These factors could be:\n",
 33 |     "- technical indicators (e.g. difference of two moving averages)\n",
 34 |     "- fundamental factors (e.g. company data such as P/E ratio)\n",
 35 |     "- macro factors (e.g. interest rates)\n",
 36 |     "- abstract factors (e.g. PCA)\n",
 37 |     "\n",
 38 |     "Abstract factors derive from other factors. Assume, we have N factors, which have some degree of correlation, we can construct M<N factors that are orthogonal (uncorrelated)."
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "### Pairs trading as factor investing\n",
 46 |     "\n",
 47 |     "- Pairs trading can be seen as a type of factor investing\n",
 48 |     "- The factor is the net asset value\n",
 49 |     "- The constraint is to be cash neutral"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import numpy as np\n",
 59 |     "import matplotlib.pyplot as plt\n",
 60 |     "from scipy import stats as stats\n",
 61 |     "from sklearn.linear_model import LinearRegression\n",
 62 |     "from sklearn.decomposition import PCA\n",
 63 |     "import scipy\n",
 64 |     "import statsmodels.api as sm\n",
 65 |     "import pandas as pd\n",
 66 |     "from statsmodels.tsa.stattools import coint\n",
 67 |     "import matplotlib\n",
 68 |     "from itertools import groupby, count\n",
 69 |     "import pickle\n",
 70 |     "%matplotlib inline"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "### Short recap: cointegration versus correlation\n",
 78 |     "\n",
 79 |     "- Cointegrated price series do not necessarily have to be correlated and vice-versa \n",
 80 |     "\n",
 81 |     "- Drunk and her dog: https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n",
 82 |     "\n",
 83 |     "- Same behaviour in the markets and causality relationships change. For more information please refer to Engle-Granger causality (https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=2ahUKEwjcvKPcsqDeAhXTbCsKHTmNAe8QFjAAegQIBxAB&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FGranger_causality&usg=AOvVaw1mYq3HhcjsVNJ9zJ6zgqdV)\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "Engel-Granger Causality"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "### Principal Component Analysis (PCA) to construct abstract factors\n",
 94 |     "https://systematicedge.wordpress.com/2013/06/02/principal-component-analysis-in-portfolio-management/\n",
 95 |     "\n",
 96 |     "- \"Principal components\" are \"modes\" of a system, similar to vibrational overtones of a guitar string.\n",
 97 |     "- They are eigenvectors of the covariance matrix.\n",
 98 |     "- Number of eigenvectors equals the number of features.\n",
 99 |     "- \"Dimensionality reduction\": reducing the number of eigenvectors.\n",
100 |     "- Eigenvectors are \"orthogonal\" - uncorrelated.\n",
101 |     "\n",
102 |     "To illustrate this, let's produce two correlated time series:"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "# Create a random seed for consistency\n",
112 |     "np.random.seed(20)\n",
113 |     "\n",
114 |     "# Produce a series as the cumulative sum of normally distributed random numbers.\n",
115 |     "x = np.cumsum(np.random.randn(200)) + 100\n",
116 |     "\n",
117 |     "# Produce a second series with the same behaviour but higher standard deviation.\n",
118 |     "y = x*2 + np.random.randn(200) - 100\n",
119 |     "\n",
120 |     "# Generate an array to be used in our PCA calculation.\n",
121 |     "# Note that we have to de-mean our values first.\n",
122 |     "R = np.array([x-np.mean(x),y-np.mean(y)])\n",
123 |     "\n",
124 |     "# Plot x and y\n",
125 |     "plt.plot(x)\n",
126 |     "plt.plot(y)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "For PCA we calculate the covariance matrix of R and subsequently the eigenvectors and eigenvalues. Here, the eigenvectors tell us the axis of the largest variance and the eigenvalues tell us the magnitude of the variance along each axis."
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "# Calculate the covariance matrix\n",
143 |     "S = np.cov(R)\n",
144 |     "print('Cov Matrix:', S)\n",
145 |     "\n",
146 |     "# Calculate the eigenvalues and eigenvectors\n",
147 |     "EIG = np.linalg.eig(S)\n",
148 |     "print('Eigenvalues: ',EIG[0])\n",
149 |     "print('Eigenvectors: ',EIG[1])"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {},
155 |    "source": [
156 |     "Now we create the vectors in space and plot our x and y values. We can see that we are capturing the direction of the largest variance. These vectors are called \"principal components\"."
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": null,
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "# Extract the eigenvectors from EIG\n",
166 |     "EV = EIG[1]\n",
167 |     "\n",
168 |     "# Create x-values for plotting eigenvectors\n",
169 |     "xx = np.linspace(min(R[0,:]),max(R[0,:]),200)\n",
170 |     "\n",
171 |     "# Create y-values for plotting eigenvectors\n",
172 |     "\n",
173 |     "yy1 = (EV[1][0]/EV[0][0])*xx\n",
174 |     "yy2 = (EV[1][1]/EV[0][1])*xx"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "The most significant eigenvector is the one with the highest eigenvalue. Here we extract its row number."
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "most_significant_factor = np.argmax(EIG[0])\n",
191 |     "print('Most significant factor: ',most_significant_factor)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {},
197 |    "source": [
198 |     "Now we plot our de-meaned price values along with their priciple components."
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "# Plot de-meaned x - y correlation\n",
208 |     "plt.plot(R[0,:],R[1,:],'o')\n",
209 |     "\n",
210 |     "# Plot the first principal component\n",
211 |     "plt.plot(xx,yy1,label='first')\n",
212 |     "\n",
213 |     "# Plot the second principal component\n",
214 |     "plt.plot(xx,yy2,label='second')\n",
215 |     "\n",
216 |     "plt.legend()\n",
217 |     "\n",
218 |     "# Make sure axis are equal to illustrate orthogonality\n",
219 |     "plt.axis('equal');"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "We can see that for the two-dimensional case we get exactly two principal components. With the priciple components we can now calculate our abstract factors:"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "# Calculating the factor values from the eigenvector\n",
236 |     "factors = np.dot(EV.T,R)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "markdown",
241 |    "metadata": {},
242 |    "source": [
243 |     "The plot below shows how well our factors are correlated with x and y. Note that for each return curve we have two factors.\n",
244 |     "We can see that component 1 with an eigenvalue of 84.46 has a strong correlation with x and y, component 0 with an eigenvalue of only 0.18 does not."
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "# Correlations between factors\n",
254 |     "plt.plot(factors[0],R[0,:],'bo',label='x, component 0')\n",
255 |     "plt.plot(factors[1],R[1,:],'ro',label='y, component 1')\n",
256 |     "plt.plot(factors[0],R[1,:],'go',label='y, component 0')\n",
257 |     "plt.plot(factors[1],R[0,:],'ko',label='x, component 1')\n",
258 |     "plt.axis('equal')\n",
259 |     "plt.legend();"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "### PCA Pairs Trade\n",
267 |     "\n",
268 |     "- The use of PCA for pairs trading and subsequent generalisation of the technique to large portfolios.\n",
269 |     "- Produce pair of cointegrated price series.\n",
270 |     "- Build a backtest.\n",
271 |     "- Analyse the results.\n",
272 |     "\n",
273 |     "First we create an algorithm for \"drunk and dog\" cointegration. Here,\n",
274 |     "- T are the starting values\n",
275 |     "- Sigma are the standard deviations of each path.\n",
276 |     "- c is a variable that determines how strongly both returns are connected.\n",
277 |     "- if c[0] = 0 and c[1] = 0 -> both are random walks (drunk does not own dog).\n",
278 |     "- if one c is zero and the other non-zero there is a one way causality (drunk owns dog).\n",
279 |     "- if both c are non-zero there is two-way causality (dog sometimes pulls drunk)."
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "metadata": {},
286 |    "outputs": [],
287 |    "source": [
288 |     "def make_coint_0(N,T0=[0,0],sigma=[1,1],c=[0.1,0.1]):\n",
289 |     "    '''\n",
290 |     "    Algorithm from:\n",
291 |     "    https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n",
292 |     "    '''\n",
293 |     "    X = [0]\n",
294 |     "    Y = [0]\n",
295 |     "    for i in range(N):\n",
296 |     "        rx = np.random.randn()*sigma[0] - c[0]*(X[-1] - Y[-1])\n",
297 |     "        ry = np.random.randn()*sigma[1] + c[1]*(X[-1] - Y[-1])\n",
298 |     "        X.append(X[-1]+rx)\n",
299 |     "        Y.append(Y[-1]+ry)\n",
300 |     "    return np.array(X)+T0[0],np.array(Y)+T0[1]"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "markdown",
305 |    "metadata": {},
306 |    "source": [
307 |     "When plotting X and Y we can see that they follow each other closely.\n",
308 |     "Now, vary c as follows and observe what happens:\n",
309 |     "- c = [ 0.9, 0.0 ]\n",
310 |     "- c = [ 0.1, 0.1 ]\n",
311 |     "- c = [ 0.1, 0.9 ]\n",
312 |     "- c = [ 0.0 , 0.0]"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {},
319 |    "outputs": [],
320 |    "source": [
321 |     "np.random.seed(452)\n",
322 |     "X,Y = make_coint_0(200,T0=[50,50],c=[0.1,0.1])\n",
323 |     "plt.plot(X,'r-',Y,'b-');"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "markdown",
328 |    "metadata": {},
329 |    "source": [
330 |     "Assessing the quality of our cointegration:\n",
331 |     "- Critical values for 0.1, 0.05 and 0.01.\n",
332 |     "- T-statistic should be below crit."
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": null,
338 |    "metadata": {},
339 |    "outputs": [],
340 |    "source": [
341 |     "crit = coint(X,Y)\n",
342 |     "print('Critical Values:',crit[2])\n",
343 |     "print('T-statistic:',crit[0])\n",
344 |     "print('P-value:',crit[1])"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "metadata": {},
350 |    "source": [
351 |     "### Application of PCA to pairs trading:\n",
352 |     "- Use the sklearn PCA package to generate components.\n",
353 |     "- Linear regression with the price data.\n",
354 |     "- Z-score the residual to normalise for varying price levels and volatility.\n",
355 |     "- Trade when residual sufficiently deviates from mean. \n",
356 |     "- Use of log prices can help to mitigate large price swings (e.g. in penny stocks)\n",
357 |     "\n",
358 |     "Below the code for the PCA pairs trade:"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": null,
364 |    "metadata": {},
365 |    "outputs": [],
366 |    "source": [
367 |     "def Zscore(X):\n",
368 |     "    return np.array((X - np.mean(X)) / np.std(X))\n",
369 |     "\n",
370 |     "def run_pca(pr,components=1,log_prices=True):\n",
371 |     "    \n",
372 |     "    # Instanciate PCA \n",
373 |     "    pca = PCA(n_components=components)\n",
374 |     "    px = pr.T-np.mean(pr.T)\n",
375 |     "    \n",
376 |     "    if log_prices:\n",
377 |     "        \n",
378 |     "        # Calculate the priciple components using log prices\n",
379 |     "        comps = pca.fit(np.log(pr.T)).components_.T\n",
380 |     "        \n",
381 |     "        # Create the factors from the pricinple components\n",
382 |     "        factors = sm.add_constant(pr.T.dot(comps))\n",
383 |     "        \n",
384 |     "    else:\n",
385 |     "        \n",
386 |     "        # Calculate the N priciple components using normal prices\n",
387 |     "        comps = pca.fit(px).components_.T\n",
388 |     "        \n",
389 |     "         # Create the factors from the pricinple components\n",
390 |     "        factors = sm.add_constant(px.dot(comps))  \n",
391 |     "        \n",
392 |     "\n",
393 |     "    \n",
394 |     "    # Regress each factor with the actual underlying prices\n",
395 |     "    mm = [sm.OLS(s.T, factors).fit() for s in pr]\n",
396 |     "    \n",
397 |     "    # Calculate the residuals\n",
398 |     "    resids = list(map(lambda x: x.resid, mm))\n",
399 |     "    \n",
400 |     "    return resids, factors"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "markdown",
405 |    "metadata": {},
406 |    "source": [
407 |     "- Running the PCA we can now see the factors with equal values of opposite sign.\n",
408 |     "- Same as in \"regular\" pairs trade where opposite sign is expressed by long/short.\n",
409 |     "- PCA gives reversible results when X and Y are switched, linear regression does not."
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": null,
415 |    "metadata": {},
416 |    "outputs": [],
417 |    "source": [
418 |     "# Create input array from cointegrated price series\n",
419 |     "R = np.array([X,Y,X])\n",
420 |     "\n",
421 |     "# Run the PCA calculation\n",
422 |     "residuals, factors = run_pca(R,log_prices=True)\n",
423 |     "\n",
424 |     "# Plot the residuals\n",
425 |     "plt.plot(residuals[0],label='resid X')\n",
426 |     "plt.plot(residuals[1],label='resid Y')\n",
427 |     "plt.xlabel('time')\n",
428 |     "plt.ylabel('residuals')\n",
429 |     "plt.legend()\n",
430 |     "plt.grid()"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "metadata": {},
436 |    "source": [
437 |     "As a side-node, observe that linear regression is not reversible.\n",
438 |     "- Residuals are calculated as distances to fitting line along to y-axis.\n",
439 |     "- In PCA residuals are calculated orthogonal to principal component."
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": null,
445 |    "metadata": {},
446 |    "outputs": [],
447 |    "source": [
448 |     "m = np.polyfit(X,Y,1)[0]\n",
449 |     "m_rev = np.polyfit(Y,X,1)[0]\n",
450 |     "print('Slope of regression:',m)\n",
451 |     "print('Inverse slope of reverse regression:',1/m_rev)"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "markdown",
456 |    "metadata": {},
457 |    "source": [
458 |     "### Pairs-Trade Backtest\n",
459 |     "- Sequentially step through time and instruments.\n",
460 |     "- Calculate if z-score of residuals is large enough to trade.\n",
461 |     "- If in trade, see if residuals have mean reverted enough to exit.\n",
462 |     "- Calculate the pnl.\n",
463 |     "\n",
464 |     "(For simplicity we calculate the residuals first, thus introducing a forward-looking bias. This is to make the calcs faster and it is rectified later.)"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "code",
469 |    "execution_count": null,
470 |    "metadata": {},
471 |    "outputs": [],
472 |    "source": [
473 |     "inpos = np.zeros(R.shape[0]) # side: long=+1; short=-1\n",
474 |     "pnl = [0] # PnL vector\n",
475 |     "bw = 2 # z-score threshold (bandwidth)\n",
476 |     "op = {} # dict of entry prices\n",
477 |     "\n",
478 |     "# loop through time steps\n",
479 |     "for i in range(len(residuals[0])):\n",
480 |     "    p = 0 # initialise pnl-calc for a particular time step\n",
481 |     "    \n",
482 |     "    # loop through instruments\n",
483 |     "    for inst in range(R.shape[0]):\n",
484 |     "        \n",
485 |     "        # calculate the z-score of residuals\n",
486 |     "        zs = Zscore(residuals[inst])[i]\n",
487 |     "        \n",
488 |     "        # Entry condition: z-score above bandwith and no position on\n",
489 |     "        if np.abs(zs)>bw and inpos[inst] == 0:\n",
490 |     "            op[inst] = R[inst,i] # record the open price\n",
491 |     "            inpos[inst] = zs  # tell algo that we have a position\n",
492 |     "            \n",
493 |     "        # Exit condition: z-score has crossed zero and position on\n",
494 |     "        elif zs*np.sign(inpos[inst])<0:\n",
495 |     "            \n",
496 |     "            # Calculate pnl as (exit-entry)*side\n",
497 |     "            p+=((-R[inst,i]+op[inst])*np.sign(inpos[inst]))\n",
498 |     "            inpos[inst] = 0 # set side to zero\n",
499 |     "    \n",
500 |     "    # append the new pnl to vector\n",
501 |     "    pnl.append(p)\n",
502 |     "    \n",
503 |     "# Plot the results of the backtest\n",
504 |     "plt.plot(np.cumsum(pnl),'-')\n",
505 |     "plt.xlabel('time')\n",
506 |     "plt.ylabel('realised PnL')\n",
507 |     "plt.show()"
508 |    ]
509 |   },
510 |   {
511 |    "cell_type": "markdown",
512 |    "metadata": {},
513 |    "source": [
514 |     "### Simulate sector cointegration"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "markdown",
519 |    "metadata": {},
520 |    "source": [
521 |     "- Simulated time series provide understandable results. \n",
522 |     "- Stocks within a sector are often cointegrated.\n",
523 |     "- Maths of \"Drunk and her dog\" generalisation shown below:\n",
524 |     "\n",
525 |     "\\begin{align}\n",
526 |     "c_{ij} = \\Bigg\\{ \n",
527 |     "\\begin{split}\n",
528 |     "-a_{ij} \\quad for \\quad i \\leq j \\\\ \n",
529 |     "a_{ij} \\quad for \\quad i \\geq j \\\\ \n",
530 |     "-a_{ij} \\quad for \\quad i = j\n",
531 |     "\\end{split}\n",
532 |     "\\end{align}\n",
533 |     "\n",
534 |     "\\begin{align}\n",
535 |     "X_{t}^{(i)}-X_{t-1}^{(i)} = \\sum_{j} c_{ij} X_{t-1}^{(j)} + \\epsilon_{i} \\quad with \\quad a_{ij} \\geq 0\n",
536 |     "\\end{align}\n",
537 |     "\n",
538 |     "- *X* denotes the time series, *c* is the causality matrix.\n",
539 |     "- *a* are the positive elements of the causality matrix. \n",
540 |     "\n",
541 |     "(Note that the *a's* denote the relationships between different series. We can simply use random numbers to start with. As we increase the number of series, we need to keep *a* small to avoid positive feedback scenarios.) \n",
542 |     "\n",
543 |     "Below the code that implements the above equations."
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": null,
549 |    "metadata": {},
550 |    "outputs": [],
551 |    "source": [
552 |     "def make_coint_1(N,steps,a=0.1):\n",
553 |     "    X = [np.zeros(N)]\n",
554 |     "    \n",
555 |     "    # Create the causality matrix\n",
556 |     "    c = (np.tril(np.ones(N))-np.triu(np.ones(N))-np.diag(np.ones(N),0))*a #c = np.random.rand(N,N)*0.1\n",
557 |     "\n",
558 |     "    # loop through time steps\n",
559 |     "    for i in range(steps):\n",
560 |     "        \n",
561 |     "        # Calculate the returns for each time series\n",
562 |     "        rx = (np.sum(c*X[-1],axis=1)+np.random.randn(N))\n",
563 |     "        \n",
564 |     "        # Add the new return to the last price of the time series\n",
565 |     "        X.append(X[-1]+rx)\n",
566 |     "        \n",
567 |     "    # return array of all series\n",
568 |     "    return np.array(X).T"
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "markdown",
573 |    "metadata": {},
574 |    "source": [
575 |     "Let's create a cointegrated pair with this technique.\n",
576 |     "\n",
577 |     "__Play with this by varying *a* and observe the results.__"
578 |    ]
579 |   },
580 |   {
581 |    "cell_type": "code",
582 |    "execution_count": null,
583 |    "metadata": {},
584 |    "outputs": [],
585 |    "source": [
586 |     "np.random.seed(21)\n",
587 |     "N = 3\n",
588 |     "a1 = 0.1 # general case\n",
589 |     "a2=[[0.02,0.1],[0.1,0.02]] # for N = 2\n",
590 |     "a3=[[0.06,0.04,0.08],[0.06,0.06,0.04],[0.06,0.08,0.04]] # for N = 3\n",
591 |     "X1 = make_coint_1(N,200,a=a1).T\n",
592 |     "\n",
593 |     "for i in range(N):\n",
594 |     "    plt.plot(X1[:,i])"
595 |    ]
596 |   },
597 |   {
598 |    "cell_type": "markdown",
599 |    "metadata": {},
600 |    "source": [
601 |     "We can see we have produced a set of stationary time series, testing for cointegration we see that most of them are below the critical values."
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": null,
607 |    "metadata": {},
608 |    "outputs": [],
609 |    "source": [
610 |     "print('Critical values:',coint(X1.T[0],X1.T[1])[2])\n",
611 |     "for i in range(X1.T.shape[0]):\n",
612 |     "    for k in range(i,X1.T.shape[0]):\n",
613 |     "        if not i==k:\n",
614 |     "            print('t-stats for coint of series %s and %s:'%(i,k), coint(X1.T[i],X1.T[k])[0])\n",
615 |     "        "
616 |    ]
617 |   },
618 |   {
619 |    "cell_type": "markdown",
620 |    "metadata": {},
621 |    "source": [
622 |     "Let's now apply our statistical arbitrage system to a simple long/short pair in order to test if our system is working properly. First, let's produce a cointegrated time series and plot it to confirm its properties."
623 |    ]
624 |   },
625 |   {
626 |    "cell_type": "markdown",
627 |    "metadata": {},
628 |    "source": [
629 |     "### PCA Portfolio Trading\n",
630 |     "\n",
631 |     "- Application of strategy to larger portfolio.\n",
632 |     "- Careful with the causality coefficients as large numbers of strong cross-dependencies can create positive feedback loops between the series. \n",
633 |     "- Larger N - higher to probability of the feedack loops for a given *alpha*.\n",
634 |     "\n",
635 |     "__Please vary alpha in this exercise and observe how the behaviour of our time series changes.__"
636 |    ]
637 |   },
638 |   {
639 |    "cell_type": "code",
640 |    "execution_count": null,
641 |    "metadata": {},
642 |    "outputs": [],
643 |    "source": [
644 |     "#np.random.seed(231)\n",
645 |     "N = 10\n",
646 |     "alpha = 0.03\n",
647 |     "X2 = make_coint_1(N,300,a=np.random.rand(N,N)*alpha) + 50"
648 |    ]
649 |   },
650 |   {
651 |    "cell_type": "code",
652 |    "execution_count": null,
653 |    "metadata": {},
654 |    "outputs": [],
655 |    "source": [
656 |     "for i in X2:\n",
657 |     "    plt.plot(i)"
658 |    ]
659 |   },
660 |   {
661 |    "cell_type": "markdown",
662 |    "metadata": {},
663 |    "source": [
664 |     "### Cumulative explained variance\n",
665 |     "\n",
666 |     "- 3 principle components explain 75% of variance\n",
667 |     "- Too many components lead to very high correlation and very small PnL/trade\n",
668 |     "- Enough PnL/trade to overcome trading costs"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": null,
674 |    "metadata": {},
675 |    "outputs": [],
676 |    "source": [
677 |     "pca = PCA(n_components=10)\n",
678 |     "pca.fit(np.log(X2))\n",
679 |     "\n",
680 |     "# Plot cumulative explained variance\n",
681 |     "plt.plot(np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_),'-o')\n",
682 |     "plt.grid()\n",
683 |     "plt.xlabel('Component')\n",
684 |     "plt.ylabel('Explained Variance')"
685 |    ]
686 |   },
687 |   {
688 |    "cell_type": "markdown",
689 |    "metadata": {},
690 |    "source": [
691 |     "- Useful to look at the average t-statistics between all possible pairs.\n",
692 |     "- High average t-stats - good probability of strategy success.\n",
693 |     "- Johansen test tends to perform poorly out-of-sample."
694 |    ]
695 |   },
696 |   {
697 |    "cell_type": "code",
698 |    "execution_count": null,
699 |    "metadata": {},
700 |    "outputs": [],
701 |    "source": [
702 |     "# Cointegration test:\n",
703 |     "coints = []\n",
704 |     "print('Critical values:',coint(X2[0],X2[1])[2])\n",
705 |     "for i in range(X2.shape[0]):\n",
706 |     "    for k in range(i,X2.shape[0]):\n",
707 |     "        if not i==k:\n",
708 |     "            coints.append(coint(X2[i],X2[k])[0])\n",
709 |     "            \n",
710 |     "print('Average coint t-stats:',np.mean(coints))"
711 |    ]
712 |   },
713 |   {
714 |    "cell_type": "markdown",
715 |    "metadata": {},
716 |    "source": [
717 |     "- Simulated data can give us an understanding how well our backtest performs under idealised conditions. \n",
718 |     "\n",
719 |     "- In the next part we are looking at applying this algorithm to real market data using Quantopian."
720 |    ]
721 |   },
722 |   {
723 |    "cell_type": "markdown",
724 |    "metadata": {},
725 |    "source": [
726 |     "### Sector-portfolio backtest\n",
727 |     "\n",
728 |     "There are many strategies we can deploy based on our techniques such as:\n",
729 |     "- sort the z-scores of our factors and go long the lowest and short the N assets with the highest z-scores. \n",
730 |     "- scale the position size of each instrument according to z-score.\n",
731 |     "- only rebalance portfolio when sum of z-scores exceeds a threshold.\n",
732 |     "\n",
733 |     "All of them have their uses and they need to be tested on a case-by-case basis. Here, we choose the first example as shown below. This time we eliminate the forward-looking bias by recalculating the residuals at every time step."
734 |    ]
735 |   },
736 |   {
737 |    "cell_type": "code",
738 |    "execution_count": null,
739 |    "metadata": {},
740 |    "outputs": [],
741 |    "source": [
742 |     "mdata = pickle.load(open('marketdata.pick','rb'))"
743 |    ]
744 |   },
745 |   {
746 |    "cell_type": "code",
747 |    "execution_count": null,
748 |    "metadata": {
749 |     "scrolled": true
750 |    },
751 |    "outputs": [],
752 |    "source": [
753 |     "pr = np.array(mdata).T#[:12,:]\n",
754 |     "max_pos=3\n",
755 |     "num_factors=3\n",
756 |     "initial_cash=1e6\n",
757 |     "\n",
758 |     "entry = {} # create a vector of entry prices\n",
759 |     "pnls = [] # create a pnl vector\n",
760 |     "\n",
761 |     "# Exit if we specified too large long/short position size\n",
762 |     "if max_pos > pr.shape[0]/2:\n",
763 |     "    print('max_pos too large!')\n",
764 |     "    adfadsf\n",
765 |     "\n",
766 |     "# loop through the prices\n",
767 |     "for i,pri in enumerate(pr.T):\n",
768 |     "\n",
769 |     "    # Make sure you have enough data points for PCA\n",
770 |     "    if i < 50: continue\n",
771 |     "\n",
772 |     "    # Run the PCA, only on the past prices\n",
773 |     "    resids, factors = run_pca(pr.T[max([0,i-400]):i],num_factors,log_prices=False)\n",
774 |     "    zs = {}\n",
775 |     "\n",
776 |     "    # Calculate the z-scores for each instrument. \n",
777 |     "    for inst in range(len(pri)):\n",
778 |     "        try: zs[inst] = Zscore(resids[inst])[-1]\n",
779 |     "        except: pass\n",
780 |     "\n",
781 |     "    pnl = 0\n",
782 |     "    # Calculate the Pnl for each position over the prevoius period\n",
783 |     "    for j,idx in enumerate(entry):\n",
784 |     "\n",
785 |     "        # Calculate the position size\n",
786 |     "        # The sign of the position depends on the sign of the entry price\n",
787 |     "        pos = np.round((initial_cash/len(pri))/entry[idx])\n",
788 |     "\n",
789 |     "        # Add up the pnls for all positions for the last period\n",
790 |     "        # We neutralize the sign of the entry price and let it \n",
791 |     "        # come in through the position.\n",
792 |     "        pnl += (pri[idx]-np.abs(entry[idx]))*pos\n",
793 |     "    pnls.append(pnl)\n",
794 |     "\n",
795 |     "    # Reset the portfolio\n",
796 |     "    entry = {}\n",
797 |     "\n",
798 |     "    # Find the new instruments to be traded based on their z-scores\n",
799 |     "    idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n",
800 |     "    idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n",
801 |     "\n",
802 |     "    # Add them to the entry list\n",
803 |     "    # The entry gets a positive or negative sign depending on the side of the trade\n",
804 |     "    for idx in idx_long:\n",
805 |     "        entry[idx] = pri[idx]\n",
806 |     "    for idx in idx_short:\n",
807 |     "        entry[idx] = -pri[idx]\n",
808 |     "\n",
809 |     "    print(i,sum(pnls))"
810 |    ]
811 |   },
812 |   {
813 |    "cell_type": "code",
814 |    "execution_count": null,
815 |    "metadata": {},
816 |    "outputs": [],
817 |    "source": [
818 |     "plt.plot(np.cumsum(pnls))"
819 |    ]
820 |   },
821 |   {
822 |    "cell_type": "code",
823 |    "execution_count": null,
824 |    "metadata": {},
825 |    "outputs": [],
826 |    "source": []
827 |   }
828 |  ],
829 |  "metadata": {
830 |   "kernelspec": {
831 |    "display_name": "Python 3",
832 |    "language": "python",
833 |    "name": "tribo"
834 |   },
835 |   "language_info": {
836 |    "codemirror_mode": {
837 |     "name": "ipython",
838 |     "version": 3
839 |    },
840 |    "file_extension": ".py",
841 |    "mimetype": "text/x-python",
842 |    "name": "python",
843 |    "nbconvert_exporter": "python",
844 |    "pygments_lexer": "ipython3",
845 |    "version": "3.6.1"
846 |   }
847 |  },
848 |  "nbformat": 4,
849 |  "nbformat_minor": 2
850 | }
851 | 


--------------------------------------------------------------------------------
/PCA_StatArb-Old.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# PCA Statistical Arbitrage\n",
  8 |     "\n",
  9 |     "(This notebook can be found on GitHub: https://github.com/rodler/quantinsti_statarb)\n",
 10 |     "\n",
 11 |     "### Dr Tom Starke \n",
 12 |     "\n",
 13 |     "*Homepage: www.aaaquants.com *\n",
 14 |     "\n",
 15 |     "*Email: tom@aaaquants.com *\n",
 16 |     "\n",
 17 |     "*Linkedin: Dr Tom Starke *\n",
 18 |     "\n",
 19 |     "### What we will learn:\n",
 20 |     "- Building a PCA manually\n",
 21 |     "- Conduct a pairs-trading backtest using PCA\n",
 22 |     "- Simulation of multiple cointegrated assets\n",
 23 |     "- Sector statistical arbitrage using PCA "
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "### Factor Investing\n",
 31 |     "\n",
 32 |     "In factor investing we aim to find time-dependent signals that are correlated to future returns. These factors could be:\n",
 33 |     "- technical indicators (e.g. difference of two moving averages)\n",
 34 |     "- fundamental factors (e.g. company data such as P/E ratio)\n",
 35 |     "- macro factors (e.g. interest rates)\n",
 36 |     "- abstract factors (e.g. PCA)\n",
 37 |     "\n",
 38 |     "Abstract factors derive from other factors. Assume, we have N factors, which have some degree of correlation, we can construct M<N factors that are orthogonal (uncorrelated)."
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "### Pairs trading as factor investing\n",
 46 |     "\n",
 47 |     "- Pairs trading can be seen as a type of factor investing\n",
 48 |     "- The factor is the net asset value\n",
 49 |     "- The constraint is to be cash neutral"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import numpy as np\n",
 59 |     "import matplotlib.pyplot as plt\n",
 60 |     "from scipy import stats as stats\n",
 61 |     "from sklearn.linear_model import LinearRegression\n",
 62 |     "from sklearn.decomposition import PCA\n",
 63 |     "import scipy\n",
 64 |     "import statsmodels.api as sm\n",
 65 |     "import pandas as pd\n",
 66 |     "from statsmodels.tsa.stattools import coint\n",
 67 |     "import matplotlib\n",
 68 |     "from itertools import groupby, count\n",
 69 |     "%matplotlib inline"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "### Short recap: cointegration versus correlation\n",
 77 |     "\n",
 78 |     "- Cointegrated price series do not necessarily have to be correlated and vice-versa \n",
 79 |     "\n",
 80 |     "- Drunk and her dog: https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n",
 81 |     "\n",
 82 |     "- Same behaviour in the markets and causality relationships change. For more information please refer to Engle-Granger causality (https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=2ahUKEwjcvKPcsqDeAhXTbCsKHTmNAe8QFjAAegQIBxAB&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FGranger_causality&usg=AOvVaw1mYq3HhcjsVNJ9zJ6zgqdV)\n",
 83 |     "\n",
 84 |     "\n",
 85 |     "Engel-Granger Causality"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "### Principal Component Analysis (PCA) to construct abstract factors\n",
 93 |     "https://systematicedge.wordpress.com/2013/06/02/principal-component-analysis-in-portfolio-management/\n",
 94 |     "\n",
 95 |     "- \"Principal components\" are \"modes\" of a system, similar to vibrational overtones of a guitar string.\n",
 96 |     "- They are eigenvectors of the covariance matrix.\n",
 97 |     "- Number of eigenvectors equals the number of features.\n",
 98 |     "- \"Dimensionality reduction\": reducing the number of eigenvectors.\n",
 99 |     "- Eigenvectors are \"orthogonal\" - uncorrelated.\n",
100 |     "\n",
101 |     "To illustrate this, let's produce two correlated time series:"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "# Create a random seed for consistency\n",
111 |     "np.random.seed(20)\n",
112 |     "\n",
113 |     "# Produce a series as the cumulative sum of normally distributed random numbers.\n",
114 |     "x = np.cumsum(np.random.randn(200)) + 100\n",
115 |     "\n",
116 |     "# Produce a second series with the same behaviour but higher standard deviation.\n",
117 |     "y = x*2 + np.random.randn(200) - 100\n",
118 |     "\n",
119 |     "# Generate an array to be used in our PCA calculation.\n",
120 |     "# Note that we have to de-mean our values first.\n",
121 |     "R = np.array([x-np.mean(x),y-np.mean(y)])\n",
122 |     "\n",
123 |     "# Plot x and y\n",
124 |     "plt.plot(x)\n",
125 |     "plt.plot(y)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "For PCA we calculate the covariance matrix of R and subsequently the eigenvectors and eigenvalues. Here, the eigenvectors tell us the axis of the largest variance and the eigenvalues tell us the magnitude of the variance along each axis."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "# Calculate the covariance matrix\n",
142 |     "S = np.cov(R)\n",
143 |     "print('Cov Matrix:', S)\n",
144 |     "\n",
145 |     "# Calculate the eigenvalues and eigenvectors\n",
146 |     "EIG = np.linalg.eig(S)\n",
147 |     "print('Eigenvalues: ',EIG[0])\n",
148 |     "print('Eigenvectors: ',EIG[1])"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "Now we create the vectors in space and plot our x and y values. We can see that we are capturing the direction of the largest variance. These vectors are called \"principal components\"."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "# Extract the eigenvectors from EIG\n",
165 |     "EV = EIG[1]\n",
166 |     "\n",
167 |     "# Create x-values for plotting eigenvectors\n",
168 |     "xx = np.linspace(min(R[0,:]),max(R[0,:]),200)\n",
169 |     "\n",
170 |     "# Create y-values for plotting eigenvectors\n",
171 |     "\n",
172 |     "yy1 = (EV[1][0]/EV[0][0])*xx\n",
173 |     "yy2 = (EV[1][1]/EV[0][1])*xx"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "The most significant eigenvector is the one with the highest eigenvalue. Here we extract its row number."
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": [
189 |     "most_significant_factor = np.argmax(EIG[0])\n",
190 |     "print('Most significant factor: ',most_significant_factor)"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "Now we plot our de-meaned price values along with their priciple components."
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "# Plot de-meaned x - y correlation\n",
207 |     "plt.plot(R[0,:],R[1,:],'o')\n",
208 |     "\n",
209 |     "# Plot the first principal component\n",
210 |     "plt.plot(xx,yy1,label='first')\n",
211 |     "\n",
212 |     "# Plot the second principal component\n",
213 |     "plt.plot(xx,yy2,label='second')\n",
214 |     "\n",
215 |     "plt.legend()\n",
216 |     "\n",
217 |     "# Make sure axis are equal to illustrate orthogonality\n",
218 |     "plt.axis('equal');"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "We can see that for the two-dimensional case we get exactly two principal components. With the priciple components we can now calculate our abstract factors:"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "# Calculating the factor values from the eigenvector\n",
235 |     "factors = np.dot(EV.T,R)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "The plot below shows how well our factors are correlated with x and y. Note that for each return curve we have two factors.\n",
243 |     "We can see that component 1 with an eigenvalue of 84.46 has a strong correlation with x and y, component 0 with an eigenvalue of only 0.18 does not."
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "# Correlations between factors\n",
253 |     "plt.plot(factors[0],R[0,:],'bo',label='x, component 0')\n",
254 |     "plt.plot(factors[1],R[1,:],'ro',label='y, component 1')\n",
255 |     "plt.plot(factors[0],R[1,:],'go',label='y, component 0')\n",
256 |     "plt.plot(factors[1],R[0,:],'ko',label='x, component 1')\n",
257 |     "plt.axis('equal')\n",
258 |     "plt.legend();"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "markdown",
263 |    "metadata": {},
264 |    "source": [
265 |     "### PCA Pairs Trade\n",
266 |     "\n",
267 |     "- The use of PCA for pairs trading and subsequent generalisation of the technique to large portfolios.\n",
268 |     "- Produce pair of cointegrated price series.\n",
269 |     "- Build a backtest.\n",
270 |     "- Analyse the results.\n",
271 |     "\n",
272 |     "First we create an algorithm for \"drunk and dog\" cointegration. Here,\n",
273 |     "- T are the starting values\n",
274 |     "- Sigma are the standard deviations of each path.\n",
275 |     "- c is a variable that determines how strongly both returns are connected.\n",
276 |     "- if c[0] = 0 and c[1] = 0 -> both are random walks (drunk does not own dog).\n",
277 |     "- if one c is zero and the other non-zero there is a one way causality (drunk owns dog).\n",
278 |     "- if both c are non-zero there is two-way causality (dog sometimes pulls drunk)."
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "def make_coint_0(N,T0=[0,0],sigma=[1,1],c=[0.1,0.1]):\n",
288 |     "    '''\n",
289 |     "    Algorithm from:\n",
290 |     "    https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n",
291 |     "    '''\n",
292 |     "    X = [0]\n",
293 |     "    Y = [0]\n",
294 |     "    for i in range(N):\n",
295 |     "        rx = np.random.randn()*sigma[0] - c[0]*(X[-1] - Y[-1])\n",
296 |     "        ry = np.random.randn()*sigma[1] + c[1]*(X[-1] - Y[-1])\n",
297 |     "        X.append(X[-1]+rx)\n",
298 |     "        Y.append(Y[-1]+ry)\n",
299 |     "    return np.array(X)+T0[0],np.array(Y)+T0[1]"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "metadata": {},
305 |    "source": [
306 |     "When plotting X and Y we can see that they follow each other closely.\n",
307 |     "Now, vary c as follows and observe what happens:\n",
308 |     "- c = [ 0.9, 0.0 ]\n",
309 |     "- c = [ 0.1, 0.1 ]\n",
310 |     "- c = [ 0.1, 0.9 ]\n",
311 |     "- c = [ 0.0 , 0.0]"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "np.random.seed(452)\n",
321 |     "X,Y = make_coint_0(200,T0=[50,50],c=[0.1,0.1])\n",
322 |     "plt.plot(X,'r-',Y,'b-');"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "markdown",
327 |    "metadata": {},
328 |    "source": [
329 |     "Assessing the quality of our cointegration:\n",
330 |     "- Critical values for 0.1, 0.05 and 0.01.\n",
331 |     "- T-statistic should be below crit."
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "crit = coint(X,Y)\n",
341 |     "print('Critical Values:',crit[2])\n",
342 |     "print('T-statistic:',crit[0])\n",
343 |     "print('P-value:',crit[1])"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "metadata": {},
349 |    "source": [
350 |     "### Application of PCA to pairs trading:\n",
351 |     "- Use the sklearn PCA package to generate components.\n",
352 |     "- Linear regression with the price data.\n",
353 |     "- Z-score the residual to normalise for varying price levels and volatility.\n",
354 |     "- Trade when residual sufficiently deviates from mean. \n",
355 |     "- Use of log prices can help to mitigate large price swings (e.g. in penny stocks)\n",
356 |     "\n",
357 |     "Below the code for the PCA pairs trade:"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": null,
363 |    "metadata": {},
364 |    "outputs": [],
365 |    "source": [
366 |     "def Zscore(X):\n",
367 |     "    return np.array((X - np.mean(X)) / np.std(X))\n",
368 |     "\n",
369 |     "def run_pca(pr,components=1,log_prices=True):\n",
370 |     "    \n",
371 |     "    # Instanciate PCA \n",
372 |     "    pca = PCA(n_components=components)\n",
373 |     "    px = pr.T-np.mean(pr.T)\n",
374 |     "    \n",
375 |     "    if log_prices:\n",
376 |     "        \n",
377 |     "        # Calculate the priciple components using log prices\n",
378 |     "        comps = pca.fit(np.log(pr.T)).components_.T\n",
379 |     "        \n",
380 |     "        # Create the factors from the pricinple components\n",
381 |     "        factors = sm.add_constant(pr.T.dot(comps))\n",
382 |     "    else:\n",
383 |     "        \n",
384 |     "        # Calculate the N priciple components using normal prices\n",
385 |     "        comps = pca.fit(px).components_.T\n",
386 |     "        \n",
387 |     "         # Create the factors from the pricinple components\n",
388 |     "        factors = sm.add_constant(px.dot(comps))       \n",
389 |     "\n",
390 |     "    \n",
391 |     "    # Regress the factors with the actual prices\n",
392 |     "    mm = [sm.OLS(s.T, factors).fit() for s in pr]\n",
393 |     "    \n",
394 |     "    # Calculate the residuals\n",
395 |     "    resids = list(map(lambda x: x.resid, mm))\n",
396 |     "    \n",
397 |     "    return resids, factors"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "markdown",
402 |    "metadata": {},
403 |    "source": [
404 |     "- Running the PCA we can now see the factors with equal values of opposite sign.\n",
405 |     "- Same as in \"regular\" pairs trade where opposite sign is expressed by long/short.\n",
406 |     "- PCA gives reversible results when X and Y are switched, linear regression does not."
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": [
415 |     "# Create input array from cointegrated price series\n",
416 |     "R = np.array([X,Y])\n",
417 |     "\n",
418 |     "# Run the PCA calculation\n",
419 |     "residuals, factors = run_pca(R,log_prices=True)\n",
420 |     "\n",
421 |     "# Plot the residuals\n",
422 |     "plt.plot(residuals[0],label='resid X')\n",
423 |     "plt.plot(residuals[1],label='resid Y')\n",
424 |     "plt.xlabel('time')\n",
425 |     "plt.ylabel('residuals')\n",
426 |     "plt.legend()\n",
427 |     "plt.grid()"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "markdown",
432 |    "metadata": {},
433 |    "source": [
434 |     "As a side-node, observe that linear regression is not reversible.\n",
435 |     "- Residuals are calculated as distances to fitting line along to y-axis.\n",
436 |     "- In PCA residuals are calculated orthogonal to principal component."
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "metadata": {},
443 |    "outputs": [],
444 |    "source": [
445 |     "m = np.polyfit(X,Y,1)[0]\n",
446 |     "m_rev = np.polyfit(Y,X,1)[0]\n",
447 |     "print('Slope of regression:',m)\n",
448 |     "print('Inverse slope of reverse regression:',1/m_rev)"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "markdown",
453 |    "metadata": {},
454 |    "source": [
455 |     "### Pairs-Trade Backtest\n",
456 |     "- Sequentially step through time and instruments.\n",
457 |     "- Calculate if z-score of residuals is large enough to trade.\n",
458 |     "- If in trade, see if residuals have mean reverted enough to exit.\n",
459 |     "- Calculate the pnl.\n",
460 |     "\n",
461 |     "(For simplicity we calculate the residuals first, thus introducing a forward-looking bias. This is to make the calcs faster and it is rectified later.)"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": null,
467 |    "metadata": {},
468 |    "outputs": [],
469 |    "source": [
470 |     "inpos = np.zeros(R.shape[0]) # side: long=+1; short=-1\n",
471 |     "pnl = [0] # PnL vector\n",
472 |     "bw = 2 # z-score threshold (bandwidth)\n",
473 |     "op = {} # dict of entry prices\n",
474 |     "\n",
475 |     "# loop through time steps\n",
476 |     "for i in range(len(residuals[0])):\n",
477 |     "    p = 0 # initialise pnl-calc for a particular time step\n",
478 |     "    \n",
479 |     "    # loop through instruments\n",
480 |     "    for inst in range(R.shape[0]):\n",
481 |     "        \n",
482 |     "        # calculate the z-score of residuals\n",
483 |     "        zs = Zscore(residuals[inst])[i]\n",
484 |     "        \n",
485 |     "        # Entry condition: z-score above bandwith and no position on\n",
486 |     "        if np.abs(zs)>bw and inpos[inst] == 0:\n",
487 |     "            op[inst] = R[inst,i] # record the open price\n",
488 |     "            inpos[inst] = zs  # tell algo that we have a position\n",
489 |     "            \n",
490 |     "        # Exit condition: z-score has crossed zero and position on\n",
491 |     "        elif zs*np.sign(inpos[inst])<0:\n",
492 |     "            \n",
493 |     "            # Calculate pnl as (exit-entry)*side\n",
494 |     "            p+=((-R[inst,i]+op[inst])*np.sign(inpos[inst]))\n",
495 |     "            inpos[inst] = 0 # set side to zero\n",
496 |     "    \n",
497 |     "    # append the new pnl to vector\n",
498 |     "    pnl.append(p)\n",
499 |     "    \n",
500 |     "# Plot the results of the backtest\n",
501 |     "plt.plot(np.cumsum(pnl),'-')\n",
502 |     "plt.xlabel('time')\n",
503 |     "plt.ylabel('realised PnL')\n",
504 |     "plt.show()"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "markdown",
509 |    "metadata": {},
510 |    "source": [
511 |     "### Simulate sector cointegration"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "markdown",
516 |    "metadata": {},
517 |    "source": [
518 |     "- Simulated time series provide understandable results. \n",
519 |     "- Stocks within a sector are often cointegrated.\n",
520 |     "- Maths of \"Drunk and her dog\" generalisation shown below:\n",
521 |     "\n",
522 |     "\\begin{align}\n",
523 |     "c_{ij} = \\Bigg\\{ \n",
524 |     "\\begin{split}\n",
525 |     "-a_{ij} \\quad for \\quad i \\leq j \\\\ \n",
526 |     "a_{ij} \\quad for \\quad i \\geq j \\\\ \n",
527 |     "-a_{ij} \\quad for \\quad i = j\n",
528 |     "\\end{split}\n",
529 |     "\\end{align}\n",
530 |     "\n",
531 |     "\\begin{align}\n",
532 |     "X_{t}^{(i)}-X_{t-1}^{(i)} = \\sum_{j} c_{ij} X_{t-1}^{(j)} + \\epsilon_{i} \\quad with \\quad a_{ij} \\geq 0\n",
533 |     "\\end{align}\n",
534 |     "\n",
535 |     "- *X* denotes the time series, *c* is the causality matrix.\n",
536 |     "- *a* are the positive elements of the causality matrix. \n",
537 |     "\n",
538 |     "(Note that the *a's* denote the relationships between different series. We can simply use random numbers to start with. As we increase the number of series, we need to keep *a* small to avoid positive feedback scenarios.) \n",
539 |     "\n",
540 |     "Below the code that implements the above equations."
541 |    ]
542 |   },
543 |   {
544 |    "cell_type": "code",
545 |    "execution_count": null,
546 |    "metadata": {},
547 |    "outputs": [],
548 |    "source": [
549 |     "def make_coint_1(N,steps,a=0.1):\n",
550 |     "    X = [np.zeros(N)]\n",
551 |     "    \n",
552 |     "    # Create the causality matrix\n",
553 |     "    c = (np.tril(np.ones(N))-np.triu(np.ones(N))-np.diag(np.ones(N),0))*a #c = np.random.rand(N,N)*0.1\n",
554 |     "\n",
555 |     "    # loop through time steps\n",
556 |     "    for i in range(steps):\n",
557 |     "        \n",
558 |     "        # Calculate the returns for each time series\n",
559 |     "        rx = (np.sum(c*X[-1],axis=1)+np.random.randn(N))\n",
560 |     "        \n",
561 |     "        # Add the new return to the last price of the time series\n",
562 |     "        X.append(X[-1]+rx)\n",
563 |     "        \n",
564 |     "    # return array of all series\n",
565 |     "    return np.array(X).T"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "markdown",
570 |    "metadata": {},
571 |    "source": [
572 |     "Let's create a cointegrated pair with this technique.\n",
573 |     "\n",
574 |     "__Play with this by varying *a* and observe the results.__"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": null,
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "np.random.seed(21)\n",
584 |     "N = 3\n",
585 |     "a1 = 0.1 # general case\n",
586 |     "a2=[[0.02,0.1],[0.1,0.02]] # for N = 2\n",
587 |     "a3=[[0.06,0.04,0.08],[0.06,0.06,0.04],[0.06,0.08,0.04]] # for N = 3\n",
588 |     "X1 = make_coint_1(N,200,a=a1).T\n",
589 |     "\n",
590 |     "for i in range(N):\n",
591 |     "    plt.plot(X1[:,i])"
592 |    ]
593 |   },
594 |   {
595 |    "cell_type": "markdown",
596 |    "metadata": {},
597 |    "source": [
598 |     "We can see we have produced a set of stationary time series, testing for cointegration we see that most of them are below the critical values."
599 |    ]
600 |   },
601 |   {
602 |    "cell_type": "code",
603 |    "execution_count": null,
604 |    "metadata": {},
605 |    "outputs": [],
606 |    "source": [
607 |     "print('Critical values:',coint(X1.T[0],X1.T[1])[2])\n",
608 |     "for i in range(X1.T.shape[0]):\n",
609 |     "    for k in range(i,X1.T.shape[0]):\n",
610 |     "        if not i==k:\n",
611 |     "            print('t-stats for coint of series %s and %s:'%(i,k), coint(X1.T[i],X1.T[k])[0])\n",
612 |     "        "
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "markdown",
617 |    "metadata": {},
618 |    "source": [
619 |     "### Sector-portfolio backtest\n",
620 |     "\n",
621 |     "There are many strategies we can deploy based on our techniques such as:\n",
622 |     "- sort the z-scores of our factors and go long the lowest and short the N assets with the highest z-scores. \n",
623 |     "- scale the position size of each instrument according to z-score.\n",
624 |     "- only rebalance portfolio when sum of z-scores exceeds a threshold.\n",
625 |     "\n",
626 |     "All of them have their uses and they need to be tested on a case-by-case basis. Here, we choose the first example as shown below. This time we eliminate the forward-looking bias by recalculating the residuals at every time step."
627 |    ]
628 |   },
629 |   {
630 |    "cell_type": "code",
631 |    "execution_count": null,
632 |    "metadata": {},
633 |    "outputs": [],
634 |    "source": [
635 |     "def backtest(pr,max_pos=2,num_factors=1,initial_cash=1e6):\n",
636 |     "    entry = {} # create a vector of entry prices\n",
637 |     "    pnls = [] # create a pnl vector\n",
638 |     "    \n",
639 |     "    # Exit if we specified too large long/short position size\n",
640 |     "    if max_pos > pr.shape[0]/2:\n",
641 |     "        print('max_pos too large!')\n",
642 |     "        return\n",
643 |     "\n",
644 |     "    # loop through the prices\n",
645 |     "    for i,pri in enumerate(pr.T):\n",
646 |     "        \n",
647 |     "        # Make sure you have enough data points for PCA\n",
648 |     "        if i < 50: continue\n",
649 |     "            \n",
650 |     "        # Run the PCA, only on the past prices\n",
651 |     "        resids, factors = run_pca(pr[:i],num_factors,log_prices=False)\n",
652 |     "        zs = {}\n",
653 |     "        \n",
654 |     "        # Calculate the z-scores for each instrument. \n",
655 |     "        for inst in range(len(pri)):\n",
656 |     "            zs[inst] = Zscore(resids[inst])[-1]\n",
657 |     "\n",
658 |     "        pnl = 0\n",
659 |     "        # Calculate the Pnl for each position over the prevoius period\n",
660 |     "        for j,idx in enumerate(entry):\n",
661 |     "            \n",
662 |     "            # Calculate the position size\n",
663 |     "            pos = np.round((initial_cash/len(pri))/entry[idx])\n",
664 |     "            \n",
665 |     "            # Add up the pnls for all positions for the last period\n",
666 |     "            pnl += (pri[idx]-np.abs(entry[idx]))*pos\n",
667 |     "        pnls.append(pnl)\n",
668 |     "        \n",
669 |     "        # Reset the portfolio\n",
670 |     "        entry = {}\n",
671 |     "        \n",
672 |     "        # Find the new instruments to be traded based on their z-scores\n",
673 |     "        idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n",
674 |     "        idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n",
675 |     "        \n",
676 |     "        # Add them to the entry list\n",
677 |     "        for idx in idx_long:\n",
678 |     "            entry[idx] = pri[idx]\n",
679 |     "        for idx in idx_short:\n",
680 |     "            entry[idx] = -pri[idx]\n",
681 |     "        \n",
682 |     " \n",
683 |     "    return(pnls)"
684 |    ]
685 |   },
686 |   {
687 |    "cell_type": "markdown",
688 |    "metadata": {},
689 |    "source": [
690 |     "Let's now apply our statistical arbitrage system to a simple long/short pair in order to test if our system is working properly. First, let's produce a cointegrated time series and plot it to confirm its properties."
691 |    ]
692 |   },
693 |   {
694 |    "cell_type": "code",
695 |    "execution_count": null,
696 |    "metadata": {},
697 |    "outputs": [],
698 |    "source": [
699 |     "# Create a seed for consistency\n",
700 |     "np.random.seed(27)\n",
701 |     "N = 2 # Number of assets\n",
702 |     "alpha = 0.1 # causality factor\n",
703 |     "X1 = make_coint_1(N,500,a=np.random.rand(N,N)*alpha) + 50 "
704 |    ]
705 |   },
706 |   {
707 |    "cell_type": "code",
708 |    "execution_count": null,
709 |    "metadata": {},
710 |    "outputs": [],
711 |    "source": [
712 |     "plt.plot(X1.T[:,0])\n",
713 |     "plt.plot(X1.T[:,1])\n",
714 |     "coint(X1.T[:,0],X1.T[:,1])"
715 |    ]
716 |   },
717 |   {
718 |    "cell_type": "markdown",
719 |    "metadata": {},
720 |    "source": [
721 |     "- series is highly cointegrated, we expect to see a very good result.\n",
722 |     "\n",
723 |     "__Please run the same simulation with a less cointegrate series by lowering *alpha* to 0.01.__"
724 |    ]
725 |   },
726 |   {
727 |    "cell_type": "code",
728 |    "execution_count": null,
729 |    "metadata": {},
730 |    "outputs": [],
731 |    "source": [
732 |     "# run the backtest\n",
733 |     "pnls = backtest(X1,max_pos=1,num_factors=1,initial_cash=1e6)\n",
734 |     "\n",
735 |     "# plot the result\n",
736 |     "plt.plot(np.cumsum(pnls));"
737 |    ]
738 |   },
739 |   {
740 |    "cell_type": "code",
741 |    "execution_count": null,
742 |    "metadata": {},
743 |    "outputs": [],
744 |    "source": [
745 |     "pnls"
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "markdown",
750 |    "metadata": {},
751 |    "source": [
752 |     "### PCA Portfolio Trading\n",
753 |     "\n",
754 |     "- Application of strategy to larger portfolio.\n",
755 |     "- Careful with the causality coefficients as large numbers of strong cross-dependencies can create positive feedback loops between the series. \n",
756 |     "- Larger N - higher to probability of the feedack loops for a given *alpha*.\n",
757 |     "\n",
758 |     "__Please vary alpha in this exercise and observe how the behaviour of our time series changes.__"
759 |    ]
760 |   },
761 |   {
762 |    "cell_type": "code",
763 |    "execution_count": null,
764 |    "metadata": {},
765 |    "outputs": [],
766 |    "source": [
767 |     "#np.random.seed(231)\n",
768 |     "N = 10\n",
769 |     "alpha = 0.03\n",
770 |     "X2 = make_coint_1(N,100,a=np.random.rand(N,N)*alpha) + 50"
771 |    ]
772 |   },
773 |   {
774 |    "cell_type": "code",
775 |    "execution_count": null,
776 |    "metadata": {},
777 |    "outputs": [],
778 |    "source": [
779 |     "for i in X2:\n",
780 |     "    plt.plot(i)"
781 |    ]
782 |   },
783 |   {
784 |    "cell_type": "markdown",
785 |    "metadata": {},
786 |    "source": [
787 |     "### Cumulative explained variance\n",
788 |     "\n",
789 |     "- 3 principle components explain 75% of variance\n",
790 |     "- Too many components lead to very high correlation and very small PnL/trade\n",
791 |     "- Enough PnL/trade to overcome trading costs"
792 |    ]
793 |   },
794 |   {
795 |    "cell_type": "code",
796 |    "execution_count": null,
797 |    "metadata": {},
798 |    "outputs": [],
799 |    "source": [
800 |     "pca = PCA(n_components=10)\n",
801 |     "pca.fit(np.log(X2))\n",
802 |     "\n",
803 |     "# Plot cumulative explained variance\n",
804 |     "plt.plot(np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_),'-o')\n",
805 |     "plt.grid()\n",
806 |     "plt.xlabel('Component')\n",
807 |     "plt.ylabel('Explained Variance')"
808 |    ]
809 |   },
810 |   {
811 |    "cell_type": "markdown",
812 |    "metadata": {},
813 |    "source": [
814 |     "- Useful to look at the average t-statistics between all possible pairs.\n",
815 |     "- High average t-stats - good probability of strategy success.\n",
816 |     "- Johansen test tends to perform poorly out-of-sample."
817 |    ]
818 |   },
819 |   {
820 |    "cell_type": "code",
821 |    "execution_count": null,
822 |    "metadata": {},
823 |    "outputs": [],
824 |    "source": [
825 |     "# Cointegration test:\n",
826 |     "coints = []\n",
827 |     "print('Critical values:',coint(X2[0],X2[1])[2])\n",
828 |     "for i in range(X2.shape[0]):\n",
829 |     "    for k in range(i,X2.shape[0]):\n",
830 |     "        if not i==k:\n",
831 |     "            coints.append(coint(X2[i],X2[k])[0])\n",
832 |     "            \n",
833 |     "print('Average coint t-stats:',np.mean(coints))"
834 |    ]
835 |   },
836 |   {
837 |    "cell_type": "markdown",
838 |    "metadata": {},
839 |    "source": [
840 |     "- Simulated data can give us an understanding how well our backtest performs under idealised conditions. \n",
841 |     "\n",
842 |     "- In the next part we are looking at applying this algorithm to real market data using Quantopian."
843 |    ]
844 |   },
845 |   {
846 |    "cell_type": "code",
847 |    "execution_count": null,
848 |    "metadata": {},
849 |    "outputs": [],
850 |    "source": [
851 |     "pnls = backtest(X2,max_pos=1,num_factors=1,initial_cash=1e6)\n",
852 |     "plt.plot(np.cumsum(pnls));\n",
853 |     "plt.show()"
854 |    ]
855 |   },
856 |   {
857 |    "cell_type": "code",
858 |    "execution_count": null,
859 |    "metadata": {},
860 |    "outputs": [],
861 |    "source": []
862 |   }
863 |  ],
864 |  "metadata": {
865 |   "kernelspec": {
866 |    "display_name": "Python 3",
867 |    "language": "python",
868 |    "name": "tribo"
869 |   },
870 |   "language_info": {
871 |    "codemirror_mode": {
872 |     "name": "ipython",
873 |     "version": 3
874 |    },
875 |    "file_extension": ".py",
876 |    "mimetype": "text/x-python",
877 |    "name": "python",
878 |    "nbconvert_exporter": "python",
879 |    "pygments_lexer": "ipython3",
880 |    "version": "3.6.1"
881 |   }
882 |  },
883 |  "nbformat": 4,
884 |  "nbformat_minor": 2
885 | }
886 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/PCA_StatArb-Old-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# PCA Statistical Arbitrage\n",
  8 |     "\n",
  9 |     "(This notebook can be found on GitHub: https://github.com/rodler/quantinsti_statarb)\n",
 10 |     "\n",
 11 |     "### Dr Tom Starke \n",
 12 |     "\n",
 13 |     "*Homepage: www.aaaquants.com *\n",
 14 |     "\n",
 15 |     "*Email: tom@aaaquants.com *\n",
 16 |     "\n",
 17 |     "*Linkedin: Dr Tom Starke *\n",
 18 |     "\n",
 19 |     "### What we will learn:\n",
 20 |     "- Building a PCA manually\n",
 21 |     "- Conduct a pairs-trading backtest using PCA\n",
 22 |     "- Simulation of multiple cointegrated assets\n",
 23 |     "- Sector statistical arbitrage using PCA "
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "### Factor Investing\n",
 31 |     "\n",
 32 |     "In factor investing we aim to find time-dependent signals that are correlated to future returns. These factors could be:\n",
 33 |     "- technical indicators (e.g. difference of two moving averages)\n",
 34 |     "- fundamental factors (e.g. company data such as P/E ratio)\n",
 35 |     "- macro factors (e.g. interest rates)\n",
 36 |     "- abstract factors (e.g. PCA)\n",
 37 |     "\n",
 38 |     "Abstract factors derive from other factors. Assume, we have N factors, which have some degree of correlation, we can construct M<N factors that are orthogonal (uncorrelated)."
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "### Pairs trading as factor investing\n",
 46 |     "\n",
 47 |     "- Pairs trading can be seen as a type of factor investing\n",
 48 |     "- The factor is the net asset value\n",
 49 |     "- The constraint is to be cash neutral"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import numpy as np\n",
 59 |     "import matplotlib.pyplot as plt\n",
 60 |     "from scipy import stats as stats\n",
 61 |     "from sklearn.linear_model import LinearRegression\n",
 62 |     "from sklearn.decomposition import PCA\n",
 63 |     "import scipy\n",
 64 |     "import statsmodels.api as sm\n",
 65 |     "import pandas as pd\n",
 66 |     "from statsmodels.tsa.stattools import coint\n",
 67 |     "import matplotlib\n",
 68 |     "from itertools import groupby, count\n",
 69 |     "%matplotlib inline"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "### Short recap: cointegration versus correlation\n",
 77 |     "\n",
 78 |     "- Cointegrated price series do not necessarily have to be correlated and vice-versa \n",
 79 |     "\n",
 80 |     "- Drunk and her dog: https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n",
 81 |     "\n",
 82 |     "- Same behaviour in the markets and causality relationships change. For more information please refer to Engle-Granger causality (https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=2ahUKEwjcvKPcsqDeAhXTbCsKHTmNAe8QFjAAegQIBxAB&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FGranger_causality&usg=AOvVaw1mYq3HhcjsVNJ9zJ6zgqdV)\n",
 83 |     "\n",
 84 |     "\n",
 85 |     "Engel-Granger Causality"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "### Principal Component Analysis (PCA) to construct abstract factors\n",
 93 |     "https://systematicedge.wordpress.com/2013/06/02/principal-component-analysis-in-portfolio-management/\n",
 94 |     "\n",
 95 |     "- \"Principal components\" are \"modes\" of a system, similar to vibrational overtones of a guitar string.\n",
 96 |     "- They are eigenvectors of the covariance matrix.\n",
 97 |     "- Number of eigenvectors equals the number of features.\n",
 98 |     "- \"Dimensionality reduction\": reducing the number of eigenvectors.\n",
 99 |     "- Eigenvectors are \"orthogonal\" - uncorrelated.\n",
100 |     "\n",
101 |     "To illustrate this, let's produce two correlated time series:"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "# Create a random seed for consistency\n",
111 |     "np.random.seed(20)\n",
112 |     "\n",
113 |     "# Produce a series as the cumulative sum of normally distributed random numbers.\n",
114 |     "x = np.cumsum(np.random.randn(200)) + 100\n",
115 |     "\n",
116 |     "# Produce a second series with the same behaviour but higher standard deviation.\n",
117 |     "y = x*2 + np.random.randn(200) - 100\n",
118 |     "\n",
119 |     "# Generate an array to be used in our PCA calculation.\n",
120 |     "# Note that we have to de-mean our values first.\n",
121 |     "R = np.array([x-np.mean(x),y-np.mean(y)])\n",
122 |     "\n",
123 |     "# Plot x and y\n",
124 |     "plt.plot(x)\n",
125 |     "plt.plot(y)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "For PCA we calculate the covariance matrix of R and subsequently the eigenvectors and eigenvalues. Here, the eigenvectors tell us the axis of the largest variance and the eigenvalues tell us the magnitude of the variance along each axis."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "# Calculate the covariance matrix\n",
142 |     "S = np.cov(R)\n",
143 |     "print('Cov Matrix:', S)\n",
144 |     "\n",
145 |     "# Calculate the eigenvalues and eigenvectors\n",
146 |     "EIG = np.linalg.eig(S)\n",
147 |     "print('Eigenvalues: ',EIG[0])\n",
148 |     "print('Eigenvectors: ',EIG[1])"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "Now we create the vectors in space and plot our x and y values. We can see that we are capturing the direction of the largest variance. These vectors are called \"principal components\"."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "# Extract the eigenvectors from EIG\n",
165 |     "EV = EIG[1]\n",
166 |     "\n",
167 |     "# Create x-values for plotting eigenvectors\n",
168 |     "xx = np.linspace(min(R[0,:]),max(R[0,:]),200)\n",
169 |     "\n",
170 |     "# Create y-values for plotting eigenvectors\n",
171 |     "\n",
172 |     "yy1 = (EV[1][0]/EV[0][0])*xx\n",
173 |     "yy2 = (EV[1][1]/EV[0][1])*xx"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "The most significant eigenvector is the one with the highest eigenvalue. Here we extract its row number."
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": [
189 |     "most_significant_factor = np.argmax(EIG[0])\n",
190 |     "print('Most significant factor: ',most_significant_factor)"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "Now we plot our de-meaned price values along with their priciple components."
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "# Plot de-meaned x - y correlation\n",
207 |     "plt.plot(R[0,:],R[1,:],'o')\n",
208 |     "\n",
209 |     "# Plot the first principal component\n",
210 |     "plt.plot(xx,yy1,label='first')\n",
211 |     "\n",
212 |     "# Plot the second principal component\n",
213 |     "plt.plot(xx,yy2,label='second')\n",
214 |     "\n",
215 |     "plt.legend()\n",
216 |     "\n",
217 |     "# Make sure axis are equal to illustrate orthogonality\n",
218 |     "plt.axis('equal');"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "We can see that for the two-dimensional case we get exactly two principal components. With the priciple components we can now calculate our abstract factors:"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "# Calculating the factor values from the eigenvector\n",
235 |     "factors = np.dot(EV.T,R)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "The plot below shows how well our factors are correlated with x and y. Note that for each return curve we have two factors.\n",
243 |     "We can see that component 1 with an eigenvalue of 84.46 has a strong correlation with x and y, component 0 with an eigenvalue of only 0.18 does not."
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "# Correlations between factors\n",
253 |     "plt.plot(factors[0],R[0,:],'bo',label='x, component 0')\n",
254 |     "plt.plot(factors[1],R[1,:],'ro',label='y, component 1')\n",
255 |     "plt.plot(factors[0],R[1,:],'go',label='y, component 0')\n",
256 |     "plt.plot(factors[1],R[0,:],'ko',label='x, component 1')\n",
257 |     "plt.axis('equal')\n",
258 |     "plt.legend();"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "markdown",
263 |    "metadata": {},
264 |    "source": [
265 |     "### PCA Pairs Trade\n",
266 |     "\n",
267 |     "- The use of PCA for pairs trading and subsequent generalisation of the technique to large portfolios.\n",
268 |     "- Produce pair of cointegrated price series.\n",
269 |     "- Build a backtest.\n",
270 |     "- Analyse the results.\n",
271 |     "\n",
272 |     "First we create an algorithm for \"drunk and dog\" cointegration. Here,\n",
273 |     "- T are the starting values\n",
274 |     "- Sigma are the standard deviations of each path.\n",
275 |     "- c is a variable that determines how strongly both returns are connected.\n",
276 |     "- if c[0] = 0 and c[1] = 0 -> both are random walks (drunk does not own dog).\n",
277 |     "- if one c is zero and the other non-zero there is a one way causality (drunk owns dog).\n",
278 |     "- if both c are non-zero there is two-way causality (dog sometimes pulls drunk)."
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "def make_coint_0(N,T0=[0,0],sigma=[1,1],c=[0.1,0.1]):\n",
288 |     "    '''\n",
289 |     "    Algorithm from:\n",
290 |     "    https://www.researchgate.net/publication/254330798_A_Drunk_and_Her_Dog_An_Illustration_of_Cointegration_and_Error_Correction\n",
291 |     "    '''\n",
292 |     "    X = [0]\n",
293 |     "    Y = [0]\n",
294 |     "    for i in range(N):\n",
295 |     "        rx = np.random.randn()*sigma[0] - c[0]*(X[-1] - Y[-1])\n",
296 |     "        ry = np.random.randn()*sigma[1] + c[1]*(X[-1] - Y[-1])\n",
297 |     "        X.append(X[-1]+rx)\n",
298 |     "        Y.append(Y[-1]+ry)\n",
299 |     "    return np.array(X)+T0[0],np.array(Y)+T0[1]"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "metadata": {},
305 |    "source": [
306 |     "When plotting X and Y we can see that they follow each other closely.\n",
307 |     "Now, vary c as follows and observe what happens:\n",
308 |     "- c = [ 0.9, 0.0 ]\n",
309 |     "- c = [ 0.1, 0.1 ]\n",
310 |     "- c = [ 0.1, 0.9 ]\n",
311 |     "- c = [ 0.0 , 0.0]"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "np.random.seed(452)\n",
321 |     "X,Y = make_coint_0(200,T0=[50,50],c=[0.1,0.1])\n",
322 |     "plt.plot(X,'r-',Y,'b-');"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "markdown",
327 |    "metadata": {},
328 |    "source": [
329 |     "Assessing the quality of our cointegration:\n",
330 |     "- Critical values for 0.1, 0.05 and 0.01.\n",
331 |     "- T-statistic should be below crit."
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "crit = coint(X,Y)\n",
341 |     "print('Critical Values:',crit[2])\n",
342 |     "print('T-statistic:',crit[0])\n",
343 |     "print('P-value:',crit[1])"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "metadata": {},
349 |    "source": [
350 |     "### Application of PCA to pairs trading:\n",
351 |     "- Use the sklearn PCA package to generate components.\n",
352 |     "- Linear regression with the price data.\n",
353 |     "- Z-score the residual to normalise for varying price levels and volatility.\n",
354 |     "- Trade when residual sufficiently deviates from mean. \n",
355 |     "- Use of log prices can help to mitigate large price swings (e.g. in penny stocks)\n",
356 |     "\n",
357 |     "Below the code for the PCA pairs trade:"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": null,
363 |    "metadata": {},
364 |    "outputs": [],
365 |    "source": [
366 |     "def Zscore(X):\n",
367 |     "    return np.array((X - np.mean(X)) / np.std(X))\n",
368 |     "\n",
369 |     "def run_pca(pr,components=1,log_prices=True):\n",
370 |     "    \n",
371 |     "    # Instanciate PCA \n",
372 |     "    pca = PCA(n_components=components)\n",
373 |     "    px = pr.T-np.mean(pr.T)\n",
374 |     "    \n",
375 |     "    if log_prices:\n",
376 |     "        \n",
377 |     "        # Calculate the priciple components using log prices\n",
378 |     "        comps = pca.fit(np.log(pr.T)).components_.T\n",
379 |     "        \n",
380 |     "        # Create the factors from the pricinple components\n",
381 |     "        factors = sm.add_constant(pr.T.dot(comps))\n",
382 |     "    else:\n",
383 |     "        \n",
384 |     "        # Calculate the N priciple components using normal prices\n",
385 |     "        comps = pca.fit(px).components_.T\n",
386 |     "        \n",
387 |     "         # Create the factors from the pricinple components\n",
388 |     "        factors = sm.add_constant(px.dot(comps))       \n",
389 |     "\n",
390 |     "    \n",
391 |     "    # Regress the factors with the actual prices\n",
392 |     "    mm = [sm.OLS(s.T, factors).fit() for s in pr]\n",
393 |     "    \n",
394 |     "    # Calculate the residuals\n",
395 |     "    resids = list(map(lambda x: x.resid, mm))\n",
396 |     "    \n",
397 |     "    return resids, factors"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "markdown",
402 |    "metadata": {},
403 |    "source": [
404 |     "- Running the PCA we can now see the factors with equal values of opposite sign.\n",
405 |     "- Same as in \"regular\" pairs trade where opposite sign is expressed by long/short.\n",
406 |     "- PCA gives reversible results when X and Y are switched, linear regression does not."
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": [
415 |     "# Create input array from cointegrated price series\n",
416 |     "R = np.array([X,Y])\n",
417 |     "\n",
418 |     "# Run the PCA calculation\n",
419 |     "residuals, factors = run_pca(R,log_prices=True)\n",
420 |     "\n",
421 |     "# Plot the residuals\n",
422 |     "plt.plot(residuals[0],label='resid X')\n",
423 |     "plt.plot(residuals[1],label='resid Y')\n",
424 |     "plt.xlabel('time')\n",
425 |     "plt.ylabel('residuals')\n",
426 |     "plt.legend()\n",
427 |     "plt.grid()"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "markdown",
432 |    "metadata": {},
433 |    "source": [
434 |     "As a side-node, observe that linear regression is not reversible.\n",
435 |     "- Residuals are calculated as distances to fitting line along to y-axis.\n",
436 |     "- In PCA residuals are calculated orthogonal to principal component."
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "metadata": {},
443 |    "outputs": [],
444 |    "source": [
445 |     "m = np.polyfit(X,Y,1)[0]\n",
446 |     "m_rev = np.polyfit(Y,X,1)[0]\n",
447 |     "print('Slope of regression:',m)\n",
448 |     "print('Inverse slope of reverse regression:',1/m_rev)"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "markdown",
453 |    "metadata": {},
454 |    "source": [
455 |     "### Pairs-Trade Backtest\n",
456 |     "- Sequentially step through time and instruments.\n",
457 |     "- Calculate if z-score of residuals is large enough to trade.\n",
458 |     "- If in trade, see if residuals have mean reverted enough to exit.\n",
459 |     "- Calculate the pnl.\n",
460 |     "\n",
461 |     "(For simplicity we calculate the residuals first, thus introducing a forward-looking bias. This is to make the calcs faster and it is rectified later.)"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": null,
467 |    "metadata": {},
468 |    "outputs": [],
469 |    "source": [
470 |     "inpos = np.zeros(R.shape[0]) # side: long=+1; short=-1\n",
471 |     "pnl = [0] # PnL vector\n",
472 |     "bw = 2 # z-score threshold (bandwidth)\n",
473 |     "op = {} # dict of entry prices\n",
474 |     "\n",
475 |     "# loop through time steps\n",
476 |     "for i in range(len(residuals[0])):\n",
477 |     "    p = 0 # initialise pnl-calc for a particular time step\n",
478 |     "    \n",
479 |     "    # loop through instruments\n",
480 |     "    for inst in range(R.shape[0]):\n",
481 |     "        \n",
482 |     "        # calculate the z-score of residuals\n",
483 |     "        zs = Zscore(residuals[inst])[i]\n",
484 |     "        \n",
485 |     "        # Entry condition: z-score above bandwith and no position on\n",
486 |     "        if np.abs(zs)>bw and inpos[inst] == 0:\n",
487 |     "            op[inst] = R[inst,i] # record the open price\n",
488 |     "            inpos[inst] = zs  # tell algo that we have a position\n",
489 |     "            \n",
490 |     "        # Exit condition: z-score has crossed zero and position on\n",
491 |     "        elif zs*np.sign(inpos[inst])<0:\n",
492 |     "            \n",
493 |     "            # Calculate pnl as (exit-entry)*side\n",
494 |     "            p+=((-R[inst,i]+op[inst])*np.sign(inpos[inst]))\n",
495 |     "            inpos[inst] = 0 # set side to zero\n",
496 |     "    \n",
497 |     "    # append the new pnl to vector\n",
498 |     "    pnl.append(p)\n",
499 |     "    \n",
500 |     "# Plot the results of the backtest\n",
501 |     "plt.plot(np.cumsum(pnl),'-')\n",
502 |     "plt.xlabel('time')\n",
503 |     "plt.ylabel('realised PnL')\n",
504 |     "plt.show()"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "markdown",
509 |    "metadata": {},
510 |    "source": [
511 |     "### Simulate sector cointegration"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "markdown",
516 |    "metadata": {},
517 |    "source": [
518 |     "- Simulated time series provide understandable results. \n",
519 |     "- Stocks within a sector are often cointegrated.\n",
520 |     "- Maths of \"Drunk and her dog\" generalisation shown below:\n",
521 |     "\n",
522 |     "\\begin{align}\n",
523 |     "c_{ij} = \\Bigg\\{ \n",
524 |     "\\begin{split}\n",
525 |     "-a_{ij} \\quad for \\quad i \\leq j \\\\ \n",
526 |     "a_{ij} \\quad for \\quad i \\geq j \\\\ \n",
527 |     "-a_{ij} \\quad for \\quad i = j\n",
528 |     "\\end{split}\n",
529 |     "\\end{align}\n",
530 |     "\n",
531 |     "\\begin{align}\n",
532 |     "X_{t}^{(i)}-X_{t-1}^{(i)} = \\sum_{j} c_{ij} X_{t-1}^{(j)} + \\epsilon_{i} \\quad with \\quad a_{ij} \\geq 0\n",
533 |     "\\end{align}\n",
534 |     "\n",
535 |     "- *X* denotes the time series, *c* is the causality matrix.\n",
536 |     "- *a* are the positive elements of the causality matrix. \n",
537 |     "\n",
538 |     "(Note that the *a's* denote the relationships between different series. We can simply use random numbers to start with. As we increase the number of series, we need to keep *a* small to avoid positive feedback scenarios.) \n",
539 |     "\n",
540 |     "Below the code that implements the above equations."
541 |    ]
542 |   },
543 |   {
544 |    "cell_type": "code",
545 |    "execution_count": null,
546 |    "metadata": {},
547 |    "outputs": [],
548 |    "source": [
549 |     "def make_coint_1(N,steps,a=0.1):\n",
550 |     "    X = [np.zeros(N)]\n",
551 |     "    \n",
552 |     "    # Create the causality matrix\n",
553 |     "    c = (np.tril(np.ones(N))-np.triu(np.ones(N))-np.diag(np.ones(N),0))*a #c = np.random.rand(N,N)*0.1\n",
554 |     "\n",
555 |     "    # loop through time steps\n",
556 |     "    for i in range(steps):\n",
557 |     "        \n",
558 |     "        # Calculate the returns for each time series\n",
559 |     "        rx = (np.sum(c*X[-1],axis=1)+np.random.randn(N))\n",
560 |     "        \n",
561 |     "        # Add the new return to the last price of the time series\n",
562 |     "        X.append(X[-1]+rx)\n",
563 |     "        \n",
564 |     "    # return array of all series\n",
565 |     "    return np.array(X).T"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "markdown",
570 |    "metadata": {},
571 |    "source": [
572 |     "Let's create a cointegrated pair with this technique.\n",
573 |     "\n",
574 |     "__Play with this by varying *a* and observe the results.__"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": null,
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "np.random.seed(21)\n",
584 |     "N = 3\n",
585 |     "a1 = 0.1 # general case\n",
586 |     "a2=[[0.02,0.1],[0.1,0.02]] # for N = 2\n",
587 |     "a3=[[0.06,0.04,0.08],[0.06,0.06,0.04],[0.06,0.08,0.04]] # for N = 3\n",
588 |     "X1 = make_coint_1(N,200,a=a1).T\n",
589 |     "\n",
590 |     "for i in range(N):\n",
591 |     "    plt.plot(X1[:,i])"
592 |    ]
593 |   },
594 |   {
595 |    "cell_type": "markdown",
596 |    "metadata": {},
597 |    "source": [
598 |     "We can see we have produced a set of stationary time series, testing for cointegration we see that most of them are below the critical values."
599 |    ]
600 |   },
601 |   {
602 |    "cell_type": "code",
603 |    "execution_count": null,
604 |    "metadata": {},
605 |    "outputs": [],
606 |    "source": [
607 |     "print('Critical values:',coint(X1.T[0],X1.T[1])[2])\n",
608 |     "for i in range(X1.T.shape[0]):\n",
609 |     "    for k in range(i,X1.T.shape[0]):\n",
610 |     "        if not i==k:\n",
611 |     "            print('t-stats for coint of series %s and %s:'%(i,k), coint(X1.T[i],X1.T[k])[0])\n",
612 |     "        "
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "markdown",
617 |    "metadata": {},
618 |    "source": [
619 |     "### Sector-portfolio backtest\n",
620 |     "\n",
621 |     "There are many strategies we can deploy based on our techniques such as:\n",
622 |     "- sort the z-scores of our factors and go long the lowest and short the N assets with the highest z-scores. \n",
623 |     "- scale the position size of each instrument according to z-score.\n",
624 |     "- only rebalance portfolio when sum of z-scores exceeds a threshold.\n",
625 |     "\n",
626 |     "All of them have their uses and they need to be tested on a case-by-case basis. Here, we choose the first example as shown below. This time we eliminate the forward-looking bias by recalculating the residuals at every time step."
627 |    ]
628 |   },
629 |   {
630 |    "cell_type": "code",
631 |    "execution_count": null,
632 |    "metadata": {},
633 |    "outputs": [],
634 |    "source": [
635 |     "def backtest(pr,max_pos=2,num_factors=1,initial_cash=1e6):\n",
636 |     "    entry = {} # create a vector of entry prices\n",
637 |     "    pnls = [] # create a pnl vector\n",
638 |     "    \n",
639 |     "    # Exit if we specified too large long/short position size\n",
640 |     "    if max_pos > pr.shape[0]/2:\n",
641 |     "        print('max_pos too large!')\n",
642 |     "        return\n",
643 |     "\n",
644 |     "    # loop through the prices\n",
645 |     "    for i,pri in enumerate(pr.T):\n",
646 |     "        \n",
647 |     "        # Make sure you have enough data points for PCA\n",
648 |     "        if i < 50: continue\n",
649 |     "            \n",
650 |     "        # Run the PCA, only on the past prices\n",
651 |     "        resids, factors = run_pca(pr[:i],num_factors,log_prices=False)\n",
652 |     "        zs = {}\n",
653 |     "        \n",
654 |     "        # Calculate the z-scores for each instrument. \n",
655 |     "        for inst in range(len(pri)):\n",
656 |     "            zs[inst] = Zscore(resids[inst])[-1]\n",
657 |     "\n",
658 |     "        pnl = 0\n",
659 |     "        # Calculate the Pnl for each position over the prevoius period\n",
660 |     "        for j,idx in enumerate(entry):\n",
661 |     "            \n",
662 |     "            # Calculate the position size\n",
663 |     "            pos = np.round((initial_cash/len(pri))/entry[idx])\n",
664 |     "            \n",
665 |     "            # Add up the pnls for all positions for the last period\n",
666 |     "            pnl += (pri[idx]-np.abs(entry[idx]))*pos\n",
667 |     "        pnls.append(pnl)\n",
668 |     "        \n",
669 |     "        # Reset the portfolio\n",
670 |     "        entry = {}\n",
671 |     "        \n",
672 |     "        # Find the new instruments to be traded based on their z-scores\n",
673 |     "        idx_long = (np.argsort([zs[j] for j in zs])[:max_pos])\n",
674 |     "        idx_short = (np.argsort([zs[j] for j in zs])[-max_pos:])\n",
675 |     "        \n",
676 |     "        # Add them to the entry list\n",
677 |     "        for idx in idx_long:\n",
678 |     "            entry[idx] = pri[idx]\n",
679 |     "        for idx in idx_short:\n",
680 |     "            entry[idx] = -pri[idx]\n",
681 |     "        \n",
682 |     " \n",
683 |     "    return(pnls)"
684 |    ]
685 |   },
686 |   {
687 |    "cell_type": "markdown",
688 |    "metadata": {},
689 |    "source": [
690 |     "Let's now apply our statistical arbitrage system to a simple long/short pair in order to test if our system is working properly. First, let's produce a cointegrated time series and plot it to confirm its properties."
691 |    ]
692 |   },
693 |   {
694 |    "cell_type": "code",
695 |    "execution_count": null,
696 |    "metadata": {},
697 |    "outputs": [],
698 |    "source": [
699 |     "# Create a seed for consistency\n",
700 |     "np.random.seed(27)\n",
701 |     "N = 2 # Number of assets\n",
702 |     "alpha = 0.1 # causality factor\n",
703 |     "X1 = make_coint_1(N,500,a=np.random.rand(N,N)*alpha) + 50 "
704 |    ]
705 |   },
706 |   {
707 |    "cell_type": "code",
708 |    "execution_count": null,
709 |    "metadata": {},
710 |    "outputs": [],
711 |    "source": [
712 |     "plt.plot(X1.T[:,0])\n",
713 |     "plt.plot(X1.T[:,1])\n",
714 |     "coint(X1.T[:,0],X1.T[:,1])"
715 |    ]
716 |   },
717 |   {
718 |    "cell_type": "markdown",
719 |    "metadata": {},
720 |    "source": [
721 |     "- series is highly cointegrated, we expect to see a very good result.\n",
722 |     "\n",
723 |     "__Please run the same simulation with a less cointegrate series by lowering *alpha* to 0.01.__"
724 |    ]
725 |   },
726 |   {
727 |    "cell_type": "code",
728 |    "execution_count": null,
729 |    "metadata": {},
730 |    "outputs": [],
731 |    "source": [
732 |     "# run the backtest\n",
733 |     "pnls = backtest(X1,max_pos=1,num_factors=1,initial_cash=1e6)\n",
734 |     "\n",
735 |     "# plot the result\n",
736 |     "plt.plot(np.cumsum(pnls));"
737 |    ]
738 |   },
739 |   {
740 |    "cell_type": "code",
741 |    "execution_count": null,
742 |    "metadata": {},
743 |    "outputs": [],
744 |    "source": [
745 |     "pnls"
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "markdown",
750 |    "metadata": {},
751 |    "source": [
752 |     "### PCA Portfolio Trading\n",
753 |     "\n",
754 |     "- Application of strategy to larger portfolio.\n",
755 |     "- Careful with the causality coefficients as large numbers of strong cross-dependencies can create positive feedback loops between the series. \n",
756 |     "- Larger N - higher to probability of the feedack loops for a given *alpha*.\n",
757 |     "\n",
758 |     "__Please vary alpha in this exercise and observe how the behaviour of our time series changes.__"
759 |    ]
760 |   },
761 |   {
762 |    "cell_type": "code",
763 |    "execution_count": null,
764 |    "metadata": {},
765 |    "outputs": [],
766 |    "source": [
767 |     "#np.random.seed(231)\n",
768 |     "N = 10\n",
769 |     "alpha = 0.03\n",
770 |     "X2 = make_coint_1(N,100,a=np.random.rand(N,N)*alpha) + 50"
771 |    ]
772 |   },
773 |   {
774 |    "cell_type": "code",
775 |    "execution_count": null,
776 |    "metadata": {},
777 |    "outputs": [],
778 |    "source": [
779 |     "for i in X2:\n",
780 |     "    plt.plot(i)"
781 |    ]
782 |   },
783 |   {
784 |    "cell_type": "markdown",
785 |    "metadata": {},
786 |    "source": [
787 |     "### Cumulative explained variance\n",
788 |     "\n",
789 |     "- 3 principle components explain 75% of variance\n",
790 |     "- Too many components lead to very high correlation and very small PnL/trade\n",
791 |     "- Enough PnL/trade to overcome trading costs"
792 |    ]
793 |   },
794 |   {
795 |    "cell_type": "code",
796 |    "execution_count": null,
797 |    "metadata": {},
798 |    "outputs": [],
799 |    "source": [
800 |     "pca = PCA(n_components=10)\n",
801 |     "pca.fit(np.log(X2))\n",
802 |     "\n",
803 |     "# Plot cumulative explained variance\n",
804 |     "plt.plot(np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_),'-o')\n",
805 |     "plt.grid()\n",
806 |     "plt.xlabel('Component')\n",
807 |     "plt.ylabel('Explained Variance')"
808 |    ]
809 |   },
810 |   {
811 |    "cell_type": "markdown",
812 |    "metadata": {},
813 |    "source": [
814 |     "- Useful to look at the average t-statistics between all possible pairs.\n",
815 |     "- High average t-stats - good probability of strategy success.\n",
816 |     "- Johansen test tends to perform poorly out-of-sample."
817 |    ]
818 |   },
819 |   {
820 |    "cell_type": "code",
821 |    "execution_count": null,
822 |    "metadata": {},
823 |    "outputs": [],
824 |    "source": [
825 |     "# Cointegration test:\n",
826 |     "coints = []\n",
827 |     "print('Critical values:',coint(X2[0],X2[1])[2])\n",
828 |     "for i in range(X2.shape[0]):\n",
829 |     "    for k in range(i,X2.shape[0]):\n",
830 |     "        if not i==k:\n",
831 |     "            coints.append(coint(X2[i],X2[k])[0])\n",
832 |     "            \n",
833 |     "print('Average coint t-stats:',np.mean(coints))"
834 |    ]
835 |   },
836 |   {
837 |    "cell_type": "markdown",
838 |    "metadata": {},
839 |    "source": [
840 |     "- Simulated data can give us an understanding how well our backtest performs under idealised conditions. \n",
841 |     "\n",
842 |     "- In the next part we are looking at applying this algorithm to real market data using Quantopian."
843 |    ]
844 |   },
845 |   {
846 |    "cell_type": "code",
847 |    "execution_count": null,
848 |    "metadata": {},
849 |    "outputs": [],
850 |    "source": [
851 |     "pnls = backtest(X2,max_pos=1,num_factors=1,initial_cash=1e6)\n",
852 |     "plt.plot(np.cumsum(pnls));\n",
853 |     "plt.show()"
854 |    ]
855 |   },
856 |   {
857 |    "cell_type": "code",
858 |    "execution_count": null,
859 |    "metadata": {},
860 |    "outputs": [],
861 |    "source": []
862 |   }
863 |  ],
864 |  "metadata": {
865 |   "kernelspec": {
866 |    "display_name": "Python 3",
867 |    "language": "python",
868 |    "name": "tribo"
869 |   },
870 |   "language_info": {
871 |    "codemirror_mode": {
872 |     "name": "ipython",
873 |     "version": 3
874 |    },
875 |    "file_extension": ".py",
876 |    "mimetype": "text/x-python",
877 |    "name": "python",
878 |    "nbconvert_exporter": "python",
879 |    "pygments_lexer": "ipython3",
880 |    "version": "3.6.1"
881 |   }
882 |  },
883 |  "nbformat": 4,
884 |  "nbformat_minor": 2
885 | }
886 | 


--------------------------------------------------------------------------------