├── README.md
├── WIKI_PRICES.csv
├── eda.py
└── technicaltools
    ├── __init__.py
    ├── __init__.pyc
    ├── utils.py
    └── utils.pyc

/README.md:
--------------------------------------------------------------------------------
# trading_pairs

This repo supplements the blog post I wrote here.
It discusses a pairs trading strategy I explored.

Kyle Franz

--------------------------------------------------------------------------------
/WIKI_PRICES.csv:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:dd5127aae478d270150904fcbad6e96a42e461e13c3d48a1587edb9b89cea43e
size 235562224

--------------------------------------------------------------------------------
/eda.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
#from matplotlib.finance import candlestick  #unused; matplotlib.finance was removed in later matplotlib releases
from matplotlib.dates import DateFormatter, WeekdayLocator, DayLocator, MONDAY, date2num
import seaborn as sns
import time
import random
from sklearn import linear_model
from statsmodels.tsa.stattools import adfuller as adf

from technicaltools import utils

all_data = pd.read_csv('WIKI_PRICES.csv', delimiter=',')
all_data['date_num'] = all_data['date'].apply(lambda d: date2num(datetime.datetime.strptime(d, "%Y-%m-%d")))

#create the required normalization columns
#this could live in its own function... some day (see the sketch below)
t0 = all_data.groupby('ticker').first()[['adj_close']]
t0.columns = ['adj_close_t0']
t0_df = pd.DataFrame(t0)
t0_df.reset_index(inplace = True)

#left join t0 on ticker
all_data = pd.merge(all_data, t0_df, on = 'ticker', how = 'left')

#Normalize each ticker's prices relative to its first adjusted close
all_data['norm_close'] = all_data['adj_close'] / all_data['adj_close_t0']
all_data['ln_close'] = np.log(all_data['norm_close'])

tickers = all_data['ticker'].unique()
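
# --- Added sketch, not part of the original script: the "its own function"
# refactor hinted at above. It only wraps the normalization steps already
# applied to all_data; the name normalize_prices is an assumption and the
# function is not wired into the rest of the analysis.
def normalize_prices(df):
    first = df.groupby('ticker').first()[['adj_close']]
    first.columns = ['adj_close_t0']
    first = first.reset_index()

    out = pd.merge(df, first, on = 'ticker', how = 'left')
    out['norm_close'] = out['adj_close'] / out['adj_close_t0']
    out['ln_close'] = np.log(out['norm_close'])
    return out
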


#plot stocks across the provided tickers for a chosen column, i.e. close, adj_close, or normalized close
def plot_stocks(stock_data, tickers, column, title = "Stock Performance"):
    for ticker in tickers:
        stock_points = stock_data.loc[stock_data['ticker'] == ticker][['date_num', column]]
        plt.plot(stock_points['date_num'], stock_points[column], label = ticker)

    plt.gca().xaxis.set_major_formatter(DateFormatter('%m/%d/%Y'))
    #plt.gca().xaxis.set_major_locator(DayLocator())


    #This was used specifically to annotate the graph for my blog post.
    #The test was run with the GOOGL, MSFT pair.
    #Flip the flag below to True to add the arrows to the chart.
    if False:
        ##PLOT ANNOTATIONS FOR MICROSOFT SPECIFICALLY
        plt.annotate('Divergence', xy = (735536, 1.36), xytext = (735294, 1.65), arrowprops=dict(facecolor='yellow', shrink=0.05))
        plt.annotate('', xy = (735522, 0.92), xytext = (735408, 1.65), arrowprops=dict(facecolor='yellow', shrink=0.05))

        plt.annotate('Converge', xy = (735631, 1.10), xytext = (735713, 0.85), arrowprops=dict(facecolor='pink', shrink=0.05))
        #plt.annotate('', xy = (735522, 0.92), xytext = (735408, 1.65), arrowprops=dict(facecolor='pink', shrink=0.05))

        ##PLOT ANNOTATIONS FOR MICROSOFT SPECIFICALLY
        plt.annotate('Divergence', xy = (735715, 1.36), xytext = (735744, 1.6), arrowprops=dict(facecolor='yellow', shrink=0.05))
        #plt.annotate('', xy = (735727, 0.92), xytext = (735798, 1.45), arrowprops=dict(facecolor='yellow', shrink=0.05))

        plt.annotate('Converge', xy = (735795, 1.22), xytext = (735991, 1.00), arrowprops=dict(facecolor='pink', shrink=0.05))
        #plt.annotate('', xy = (735522, 0.92), xytext = (735408, 1.65), arrowprops=dict(facecolor='pink', shrink=0.05))


    plt.xlabel('Date')
    plt.ylabel('% Gains')
    plt.title(title)
    plt.legend()
    plt.show()

def run_ADF_regression(stock_data, t1, t2, column = 'adj_close'):
    x_points = stock_data.loc[stock_data['ticker'] == t1][column]
    y_points = stock_data.loc[stock_data['ticker'] == t2][column]

    reg = linear_model.LinearRegression()
    reg.fit(x_points.reshape((len(x_points), 1)), y_points)

    return reg

def plot_scatter_compare(stock_data, t1, t2, column = 'adj_close', with_regression = False, plot_title = 'Scatter Regression'):
    x_points = stock_data.loc[stock_data['ticker'] == t1][column]
    y_points = stock_data.loc[stock_data['ticker'] == t2][column]

    if with_regression:
        reg = run_ADF_regression(stock_data, t1, t2, column)

        # print "Coefficients: \n" , reg.coef_
        # print "Intercept: \n" , reg.intercept_
        # print "Residues: \n" , reg.residues_
        plt.plot(x_points, reg.predict(x_points.reshape((len(x_points), 1))), c = 'red')

    plt.xlabel(t1)
    plt.ylabel(t2)
    plt.scatter(x_points, y_points)
    plt.title(plot_title)
    plt.show()

def get_residuals(stock_data, t1, t2, column = 'adj_close'):
    #residual spread between the two price series using the fitted regression
    stock_a = stock_data.loc[stock_data['ticker'] == t1][['date_num', column]]
    stock_b = stock_data.loc[stock_data['ticker'] == t2][['date_num', column]]

    stock_a.reset_index(inplace = True)
    stock_b.reset_index(inplace = True)

    reg = run_ADF_regression(stock_data, t1, t2, column)
    residuals = reg.intercept_ + reg.coef_[0] * stock_b[column] - stock_a[column]

    res_df = residuals.to_frame()
    res_df.columns = ['residuals']

    res_df['date_num'] = stock_a['date_num']

    return res_df


def ADF_test(residuals, output_log = False, title = "ADF Test Results"):
    #first-difference the residual series, then run the ADF test on the differences
    t0 = residuals
    t1 = residuals.shift()

    shifted = t1 - t0
    shifted.dropna(inplace = True)

    plt.plot(shifted, c='green')
    plt.show()

    adf_value = adf(shifted, regression = 'nc')

    test_statistic = adf_value[0]
    pvalue = adf_value[1]
    usedlags = adf_value[2]
    nobs = adf_value[3]


    if output_log:
        #output on a figure eventually, that would look really professional
        print title
        print "Test Statistic: %.4f\nP-Value: %.4f\nLags Used: %d\nObservations: %d" % (test_statistic, pvalue, usedlags, nobs)

        for crit in adf_value[4]:
            print crit, adf_value[4][crit]
            #print "Critical Value (%s): %.3f" % (crit, adf_value[4][crit])

    return adf_value
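
# --- Added sketch, not the author's method: the more common Engle-Granger
# style check runs the ADF test on the residual series itself rather than on
# its first difference. ADF_test above differences first, which (as noted in
# the comments further down) makes almost any series look stationary. The
# name adf_on_levels is an assumption and the helper is not used below.
def adf_on_levels(res_df, column = 'residuals'):
    series = res_df[column].dropna()
    adf_value = adf(series, regression = 'c')  #include a constant term for the spread
    return adf_value[0], adf_value[1]  #test statistic, p-value
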


def plot_residuals(stock_data, t1, t2, column = 'adj_close', plot_title = 'residuals'):

    residuals = get_residuals(stock_data, t1, t2, column)

    x = range(0, len(residuals))
    plt.plot(residuals['date_num'], residuals['residuals'], c = 'red')

    #mean, top, bot = utils.boilerbands(residuals, 75, 1.5)
    #x_boiler = range(0, len(mean))

    res_mean = residuals['residuals'].mean()
    res_std = residuals['residuals'].std()

    #plt.annotate('MSFT Overvalued', xy = (735544, 0.59), xytext = (735225, 0.675), arrowprops=dict(facecolor='green', shrink=0.05))
    #plt.annotate('', xy = (735720, 0.56), xytext = (735490, 0.675), arrowprops=dict(facecolor='green', shrink=0.05))

    #Overlay the mean and standard-deviation bands on top of the residuals chart
    #comment these out if you don't want the bands
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean), c = '#1E293D', linestyle = '--', label = 'Mean (%.3f)' % res_mean)
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean + .5*res_std), c = '#445E76', linestyle = '--', label = '+0.5 Std (%.3f)' % (res_mean + .5*res_std))
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean - .5*res_std), c = '#445E76', linestyle = '--', label = '-0.5 Std (%.3f)' % (res_mean - .5*res_std))
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean + res_std), c = '#7498B7', linestyle = '--', label = '+1 Std (%.3f)' % (res_mean + res_std))
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean - res_std), c = '#7498B7', linestyle = '--', label = '-1 Std (%.3f)' % (res_mean - res_std))
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean + 1.5*res_std), c = '#B1CCE0', linestyle = '--', label = '+1.5 Std (%.3f)' % (res_mean + 1.5*res_std))
    plt.plot(residuals['date_num'], np.full((len(residuals), 1), res_mean - 1.5*res_std), c = '#B1CCE0', linestyle = '--', label = '-1.5 Std (%.3f)' % (res_mean - 1.5*res_std))

    offset_y_frac = 0.012
    offset_y = 0.01

    plt.text(736342, res_mean - offset_y, "$\mu$")
    plt.text(736342, res_mean + 0.5*res_std - offset_y_frac, '$\\frac{1}{2} \sigma$')
    plt.text(736342, res_mean - 0.5*res_std - offset_y_frac, '$\\frac{1}{2} \sigma$')
    plt.text(736342, res_mean + res_std - offset_y, '$\sigma$')
    plt.text(736342, res_mean - res_std - offset_y, '$\sigma$')
    plt.text(736342, res_mean + 1.5*res_std - offset_y_frac, '$\\frac{3}{2} \sigma$')
    plt.text(736342, res_mean - 1.5*res_std - offset_y_frac, '$\\frac{3}{2} \sigma$')
    ##END STD OVERLAY

    plt.xlabel('Date')
    plt.ylabel('$S_t$')
    plt.gca().xaxis.set_major_formatter(DateFormatter('%m/%d/%Y'))

    #plt.plot(x_boiler, mean, c='cyan')
    #plt.plot(x_boiler, top, c='pink')
    #plt.plot(x_boiler, bot, c='pink')
    #plt.fill_between(x_boiler, top, bot, alpha = 0.5)
    #plt.legend(frameon = True).get_frame().set_facecolor('white')
    plt.title(plot_title)
    plt.show()
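
# --- Added sketch, an assumption rather than original code: the numeric
# counterpart of the standard-deviation bands drawn above. An entry/exit rule
# would threshold on this z-score instead of eyeballing the chart.
def residual_zscore(res_df):
    r = res_df['residuals']
    return (r - r.mean()) / r.std()
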



#get a tuple list of all correlations that meet the threshold found in the corr matrix
#currently it returns each pair twice, once per orientation; we may want to address this
#(one way to do so is sketched after get_correlation below)
#
#Also, we need to filter out and account for low volume stocks as well.
#Probably do that later
def correlation_threshold(corr_matrix, threshold):
    top_correlations = []

    for index, row in corr_matrix.iterrows():

        for ind, element in enumerate(row):
            if element > threshold and index != corr_matrix.columns.values[ind]:
                correlation_element = (index, corr_matrix.columns.values[ind], element)
                top_correlations.append(correlation_element)

    print top_correlations
    return top_correlations


def get_correlation(stock_data, tickers, plot_title = 'Correlation Matrix', plot = True, output_log = False, annot = False):
    start_time = time.time()
    index_frames = []
    for symbol in tickers:
        a_s = stock_data.loc[stock_data['ticker'] == symbol][['adj_close', 'date_num']]
        as_index = a_s.set_index(['date_num'])
        as_index.columns = [symbol]
        index_frames.append(as_index)

    joined_data = pd.concat(index_frames, axis = 1)

    #return the matrix, and optionally plot it with seaborn
    corr_matrix = joined_data.corr()

    if plot:
        f, ax = plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(220, 10, as_cmap = True)

        # Generate a mask for the upper triangle
        #mask = np.zeros_like(corr_matrix, dtype=np.bool)
        #mask[np.triu_indices_from(mask)] = True

        sns.heatmap(corr_matrix, cmap = cmap, ax = ax, vmax = 1.0, vmin = -1.0, linewidths = 0.5, annot = annot)
        plt.title(plot_title)
        plt.show()


    if output_log:
        print corr_matrix
        print "Run Time: %s seconds" % (time.time() - start_time)

    return corr_matrix
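
# --- Added sketch addressing the duplicate-pair note above; this helper is an
# assumption and is not called anywhere in the original analysis. It keeps a
# single orientation of each pair by ordering the tickers alphabetically.
def top_correlated_pairs(corr_matrix, threshold):
    pairs = []
    for a in corr_matrix.columns:
        for b in corr_matrix.columns:
            if a < b and corr_matrix.loc[a, b] > threshold:
                pairs.append((a, b, corr_matrix.loc[a, b]))
    return sorted(pairs, key = lambda p: -p[2])

#e.g. top_correlated_pairs(get_correlation(all_data, some_tickers, plot = False), 0.9),
#where some_tickers is any list of symbols
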



tech_stocks = ['AAPL', 'GOOGL', 'GOOG', 'MSFT', 'FB', 'IBM', 'CSCO']
# auto_stocks = ['F', 'GM', 'MMM', 'ABT', 'ABBV', 'FOX', 'FOXA']
consumer = ['AN', 'AZO', 'CCL', 'CBS', 'CMG', 'COH', 'CMCSA']
portfolio = ['MAS', 'AZO', 'CSCO', 'SJM']

consumer.extend(tech_stocks)

#get_correlation shows the full annotated table with annot = True
#get_correlation(all_data, consumer, annot = True, plot = True)



stock_set = ['CMCSA', 'CSCO']
plot_stocks(all_data, stock_set, column = 'norm_close', title = "%s / %s" % (stock_set[0], stock_set[1]))
plot_residuals(all_data, stock_set[0], stock_set[1], column = 'norm_close', plot_title = '$S_t$, Stock A: %s Stock B: %s' % (stock_set[0], stock_set[1]))


print(len(stock_set[1]), len(stock_set[0]))

#plot_scatter_compare(all_data, pairs_test[0], pairs_test[1], 'ln_close', with_regression = True, plot_title="Lognormal Scatter")

#adf_results = ADF_test(get_residuals(all_data, pairs_test[0], pairs_test[1], column = 'ln_close'), output_log = True, title = '%s, %s Lognormal ADF Stationarity Test' % (pairs_test[0], pairs_test[1]))
#plot_residuals(all_data, pairs_test[0], pairs_test[1], column = 'ln_close', plot_title = "Residuals, Stock A: %s, Stock B: %s" % (pairs_test[0], pairs_test[1]))



#testing for similarity
#with this method you will pretty much always get a stationary result, especially because of the drift factor.
#This means that the results from this test tell you very little about the underlying data.
#It can be used in tandem with our correlation calculations to hand-pick pairs, but I don't think there is an automated system based on this ADF testing.

#The thesis paper mentions Vidyamurthy's paper, in which he analyzes the number of times the residuals cross the mean (a rough version is sketched below).
#I wonder how that performed; I would guess it is much better at detecting similarities than this ADF, at least for stock moves,
#because they are inherently normally distributed, and you will miss the drift if you include a drift factor.

#next step is to create the buy signals, and maybe move some of this code into functions or even modules. I think right now it's fine.
#eventually maybe have a module for this cointegration test and another for the various other types of statistical tests.
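
# --- Added sketch of the mean-crossing count mentioned above; this is a rough
# assumption of what such a measure could look like, not Vidyamurthy's (or the
# thesis') actual implementation.
def count_mean_crossings(res_df):
    r = res_df['residuals'].dropna()
    above = r > r.mean()
    #count sign changes of (residual - mean); subtract 1 for the initial comparison against the shifted NaN
    return int((above != above.shift()).sum()) - 1
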




#benchmark against market

def benchmark(stock_data, t1, t2):
    stock_a = stock_data.loc[stock_data['ticker'] == t1][['date_num', 'adj_close']]
    stock_b = stock_data.loc[stock_data['ticker'] == t2][['date_num', 'adj_close']]

    stock_a.reset_index(inplace = True)
    stock_b.reset_index(inplace = True)

    #stock_a and stock_b contain all price information

    #next, calculate residuals
    residuals = get_residuals(stock_data, t1, t2, column='norm_close')



    #using Bollinger bands (utils.boilerbands) on the residual spread as entry signals
    #pass the residual column so the band comparisons below are scalar
    bb_mean, bb_top, bb_bot = utils.boilerbands(residuals['residuals'], 75, 1.5)

    #implement stop losses
    #stop loss on 1.5% down, 2.5% (hyperparam)

    #we have to check if close < stop loss from the day prior; if so, sell.


    long_pos = ()
    short_pos = ()
    pnl = 0
    entry_points = []
    exit_points = []
    #begin iterating through the days to backtest
    for i, row in enumerate(bb_mean):

        if long_pos:
            if long_pos[0] == t2:
                long_pnl = (stock_b.loc[i]['adj_close'] - long_pos[1])
                long_returns = (stock_b.loc[i]['adj_close'] - long_pos[1]) / long_pos[1]

                short_pnl = (short_pos[1] - stock_a.loc[i]['adj_close'])
                #short_returns

                if long_returns > 0.0225:
                    pnl += long_pnl
                    pnl += short_pnl
                    print pnl
                    long_pos = ()
                    short_pos = ()
                    exit_points.append(i)
                elif long_returns < -0.0225:
                    pnl += long_pnl
                    pnl += short_pnl
                    print pnl
                    long_pos = ()
                    short_pos = ()
                    exit_points.append(i)
                else:
                    pass

        elif residuals['residuals'].loc[i] < bb_bot.loc[i] and not short_pos:
            short_pos = (t1, stock_a.loc[i]['adj_close'], i)
            long_pos = (t2, stock_b.loc[i]['adj_close'], i)
            entry_points.append(i)
            print "BUY %s @ %.3f / SELL %s @ %.3f" % (t2, stock_b.loc[i]['adj_close'], t1, stock_a.loc[i]['adj_close'])
        #elif long_pos[1]
        #if residual < bottom Bollinger band and not short stock A
            #short stock A
            #long stock B
        #else if residual > top Bollinger band and not long stock A
            #short stock B
            #long stock A
        #else if long PNL < 2%, close all positions #sell signals
            #wait 3 days
        #print 'BBTOP: %.3f, BBBOT: %.3f, Residual:%.3f, Stock A: %.3f, Stock B: %.3f' % (bb_top.loc[i], bb_bot.loc[i], residuals.loc[i], stock_a.loc[i]['norm_close'], stock_b.loc[i]['norm_close'])


    plt.plot(range(0, len(bb_top)), stock_a['adj_close'], label = t1)
    plt.plot(range(0, len(bb_top)), stock_b['adj_close'], label = t2)


    for e in exit_points:
        plt.axvline(x=e, c = 'red')
    for b in entry_points:
        plt.axvline(x=b, c = 'green')
    plt.show()


#benchmark(all_data, stock_set[0], stock_set[1])

#stop losses



--------------------------------------------------------------------------------
/technicaltools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmfranz/trading_pairs/aff4c4f3b677b0434bfedbc12b4137facaf7a0bb/technicaltools/__init__.py
--------------------------------------------------------------------------------
/technicaltools/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmfranz/trading_pairs/aff4c4f3b677b0434bfedbc12b4137facaf7a0bb/technicaltools/__init__.pyc
--------------------------------------------------------------------------------
/technicaltools/utils.py:
--------------------------------------------------------------------------------
import pandas as pd


def boilerbands(stock_data, k, n_adj, moving_average = 'normal'):
    #rolling mean and std over a k-period window; n_adj scales the band width
    mean = pd.stats.moments.rolling_mean(stock_data, k)
    std = pd.stats.moments.rolling_std(stock_data, k)

    top_band = mean + (std*n_adj)
    bot_band = mean - (std*n_adj)


    return mean, top_band, bot_band


#moving_average = 'normal', 'exponential' (only the simple moving average exists so far)
#returns a new frame with the rolling mean used by the bands

def moving_average(stock_data, k):
    #the original called stock_data.rolling_mean(k), which is not a DataFrame method;
    #use the same rolling helper as boilerbands above
    return pd.stats.moments.rolling_mean(stock_data, k)
--------------------------------------------------------------------------------
/technicaltools/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmfranz/trading_pairs/aff4c4f3b677b0434bfedbc12b4137facaf7a0bb/technicaltools/utils.pyc
--------------------------------------------------------------------------------