├── Images
│   ├── buySell.png
│   ├── FittingWindow.png
│   ├── cointegration.png
│   └── trade_performance.png
├── Report+Poster
│   ├── Pairs_Trading_Doc.pdf
│   └── Pairs_Trading_Poster.pdf
├── Code
│   ├── CointHurst.py
│   ├── PairsTrade.py
│   └── slib.py
└── README.md
/Images/buySell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/HEAD/Images/buySell.png
--------------------------------------------------------------------------------
/Images/FittingWindow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/HEAD/Images/FittingWindow.png
--------------------------------------------------------------------------------
/Images/cointegration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/HEAD/Images/cointegration.png
--------------------------------------------------------------------------------
/Images/trade_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/HEAD/Images/trade_performance.png
--------------------------------------------------------------------------------
/Report+Poster/Pairs_Trading_Doc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/HEAD/Report+Poster/Pairs_Trading_Doc.pdf
--------------------------------------------------------------------------------
/Report+Poster/Pairs_Trading_Poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/HEAD/Report+Poster/Pairs_Trading_Poster.pdf
--------------------------------------------------------------------------------
/Code/CointHurst.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 14:57:23 2020

@author: alber
"""

import numpy as np
import matplotlib.pyplot as plt

import slib as slb

plt.close('all')
#%% Load data (produced by slib.stockvals and saved to disk)
start_year = 2010; end_year = 2020
names = np.load('names.npy'); symbols = np.load('symbols.npy')
variation = np.load('variation.npy'); close_val = np.load('close_val.npy'); open_val = np.load('open_val.npy')

#%% Cointegration values
# =============================================================================
# scores, pvalues, pairs = slb.find_cointegrated_pairs(close_val)
# # Save data
# np.save('scores', scores)
# np.save('pvalues', pvalues)
# np.save('pairs', pairs)
# =============================================================================
pairs = np.load('pairs.npy'); pvalues = np.load('pvalues.npy'); scores = np.load('scores.npy')
slb.covplot(pvalues, symbols, 'Cointegration')

pvalue = slb.cointplt(close_val, pairs[1], names, start_year, end_year, plot=True)  # candidate pairs: 1, 5, 9
#%% Hurst coefficient of the pair ratio
S1 = pairs[1, 0]; S2 = pairs[1, 1]
ratio = close_val[S1]/close_val[S2]
H = slb.hurstcoeff(close_val, pairs[1], names, plot=True)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pairs-Trading-as-application-of-the-Ornstein-Uhlenbeck-Process
A model simulation shows how pairs trading can be applied to two S&P 500 stocks. It demonstrates that the strategy is successful on real data, which is downloaded via the
[pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/). The [report and poster section](https://github.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/tree/master/Report%2BPoster) provides an in-depth analysis of the Ornstein-Uhlenbeck process underlying pairs trading, together with rigorous mathematical formulations.

Keywords: Stochastic differential equations, Ornstein-Uhlenbeck Process, Pairs Trading, S&P500

### Work Flow & Manual:
1. Load stock data from [slib.py](https://github.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/blob/master/Code/slib.py) with **stockvals()**.
2. Determine two stocks suitable for the pairs trading strategy with **find_cointegrated_pairs()** in [CointHurst.py](https://github.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/blob/master/Code/CointHurst.py). Note that the output `pairs` already suggests pairs of stocks whose cointegration test yields a significance level below 0.05. A full map of p-values for pairwise cointegration can be displayed with **covplot()**.
3. Use **pairstrade()** in [PairsTrade.py](https://github.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/blob/master/Code/PairsTrade.py) to execute the pairs trading strategy on the two stocks (id0, id1) under consideration. The parameters that determine the success of the strategy are passed via the **pairstrade()** input *param: f8[d, buy_signal, sell_signal]*: threshold parameters for the fitting window and the buying and selling signals. A graph that shows the buy and sell signals and the cumulative profit of the strategy can be displayed by setting *plot = True*.

### Getting started
```python
import pandas as pd
import pandas_datareader.data as web
import numpy as np
import matplotlib.pyplot as plt
import slib as slb  # custom library

#%% Load data
# df: DataFrame of S&P 500 constituents (columns Symbol, Security, ..., date added);
# start_date, end_date: datetime objects delimiting the price history.
names, symbols, variation, close_val, open_val = slb.stockvals(df, start_date, end_date)
data = pd.DataFrame.from_records(close_val)

#%% Find suitable stocks for pairs trading
scores, pvalues, pairs = slb.find_cointegrated_pairs(close_val)
slb.covplot(pvalues, symbols, 'Cointegration')

#%% Perform trading strategy
start_year = 2010; end_year = 2020
id0, id1 = pairs[1]  # one of the suggested cointegrated pairs
d, buy_signal, sell_signal = 252, 0.05, 0.01
param = [d, buy_signal, sell_signal]

money, buy, sell, k, Y_tilde, t, S0, S1 = slb.pairstrade(param, data, id0, id1, names, start_year, end_year, plot=True)
```
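
The snippet above assumes a constituents table `df` with the layout **stockvals()** expects (columns `Symbol`, `Security`, and a date-added column at position 6), plus `datetime` bounds `start_date` and `end_date`. The repository ships no loader for these inputs; one possible sketch, assuming the Wikipedia list of S&P 500 companies still matches that layout, is:
```python
import datetime
import pandas as pd

# Hypothetical input construction, not part of the original code.
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
start_date = datetime.datetime(2010, 1, 1)
end_date = datetime.datetime(2020, 1, 1)
```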

### Map of pairwise cointegrated stocks

Cointegration |
:-------------------------:|
![cointegration](Images/cointegration.png) |

Map of p-values for pairwise cointegration. The statistical test for cointegration checks how likely it is that the closing values of two stocks are mean reverting. The null hypothesis is no cointegration. Hence low p-values suggest that the stochastic processes under consideration are mean reverting and can be described as Ornstein-Uhlenbeck processes. Such pairs are therefore suitable for the pairs trading strategy.
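
As a minimal, self-contained illustration of the test (a sketch on synthetic data, not on the downloaded stock prices), the Engle-Granger test from statsmodels returns a small p-value for two series that share a common random-walk component:
```python
import numpy as np
from statsmodels.tsa.stattools import coint

rng = np.random.default_rng(42)
x = np.cumsum(rng.normal(size=1000))  # non-stationary random walk
y = 0.5 * x + rng.normal(size=1000)   # cointegrated with x: y - 0.5*x is stationary
score, pvalue, _ = coint(x, y)
print(f'p-value = {pvalue:.4f}')      # small p-value -> reject the "no cointegration" null
```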

### Buying-Selling Signal & Trading Performance

Buying-Selling Signal | Trade Performance
:-------------------------:|:-------------------------:
![buySell](Images/buySell.png) | ![trade_performance](Images/trade_performance.png)

Buying and selling signals for two suitable S&P 500 stocks (Adobe Inc. and Cintas Corporation) over a time period of 10 years (2010-2020).
The strategy starts with 0 EUR and ends with 3000 EUR of hypothetical profit.
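
For intuition about the model behind these signals, the Ornstein-Uhlenbeck dynamics dY_t = θ(μ - Y_t)dt + σ dW_t can be simulated in a few lines (a sketch with arbitrarily chosen parameters; they are not the values fitted in the report):
```python
import numpy as np
import matplotlib.pyplot as plt

theta, mu, sigma = 2.0, 0.0, 0.3  # mean-reversion speed, long-run mean, volatility (arbitrary)
dt, n = 1/252, 10*252             # daily steps over ten years
rng = np.random.default_rng(0)
Y = np.zeros(n)
for i in range(1, n):             # Euler-Maruyama discretization
    Y[i] = Y[i-1] + theta*(mu - Y[i-1])*dt + sigma*np.sqrt(dt)*rng.normal()
plt.plot(np.linspace(2010, 2020, n), Y)
plt.xlabel('Time [years]'); plt.title('Simulated Ornstein-Uhlenbeck path')
plt.show()
```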

### How to contribute
Fork from the `Developer` branch and open a pull request to merge back into the original `Developer` branch.
Working updates and improvements will then be merged into the `Master` branch, which always contains the latest working version.

With:
* [Daniel Brus](https://www.linkedin.com/in/daniel-brus)

### Dependencies
[Numpy](https://numpy.org/),
[Matplotlib](https://matplotlib.org/),
[Pandas](https://pandas.pydata.org/), [pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/), [Statsmodels](https://www.statsmodels.org/stable/index.html),
[slib (custom)](https://github.com/david-alber/Pairs-Trading-as-application-to-the-Ornstein-Uhlenbeck-Process/blob/master/Code/slib.py)

--------------------------------------------------------------------------------
/Code/PairsTrade.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy as sp
import scipy.optimize  # make sp.optimize.minimize available
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import slib as slb

plt.close('all')
#%% Load data and split into training and test sets
start_year = 2010; end_year = 2020
end_yearTe = end_year
names = np.load('names.npy'); symbols = np.load('symbols.npy')
variation = np.load('variation.npy'); close_val = np.load('close_val.npy'); open_val = np.load('open_val.npy')
pairs = np.load('pairs.npy'); pvalues = np.load('pvalues.npy'); scores = np.load('scores.npy')

data = pd.DataFrame.from_records(close_val)
days = np.shape(data)[1]
train = 0.7  # fraction of trading days used for training
data_train = data.loc[:, 0:int(days*train)]
data_test = data.loc[:, int(days*train):]
end_year = int(start_year + (end_year-start_year)*train)  # redefine end year according to training samples
start_yearTe = end_year
id0 = pairs[1, 0]; id1 = pairs[1, 1]

#%% Optimizer: random-restart Nelder-Mead over the trading thresholds
def returnMaximizer(random_walks, function, n_param, data, id0, id1, start_year, end_year, d):
    """
    random_walks : f8; number of random restarts, i.e. different random initial guesses
    function : function handle; objective passed to sp.optimize.minimize(..., method='Nelder-Mead')
    n_param : f8; number of free parameters to optimize

    Returns: opt_object: .x -> optimal parameters; .fun -> objective evaluated at the optimal parameters
    """
    # Function specific definitions
    d = int(d)

    minimum = 9e15
    for i in tqdm(range(0, random_walks), position=0, desc='Return Maximizer'):  # minimize for different initial guesses
        initial_guess = np.zeros(n_param)
        initial_guess[0] = np.random.uniform(0, 3)                  # buy threshold
        initial_guess[1] = np.random.uniform(0, initial_guess[0])   # sell threshold below the buy threshold
        optimizer_object = sp.optimize.minimize(function, initial_guess, method='Nelder-Mead')
        y = optimizer_object.fun
        if (y < minimum):  # keep parameters if the objective is smaller than before
            minimum = y
            opt_object = optimizer_object
    print(f'Parameters that maximize returns: buy,sell = {opt_object.x}')
    return opt_object

def pairstrade_maxi(param):
    # Objective for the optimizer. Uses the module-level variables
    # data, id0, id1, d, start_year, end_year as fixed inputs.
    buy_signal, sell_signal = param[0], param[1]
    S0 = data.iloc[id0][:]; S1 = data.iloc[id1][:]

    k = (S0/S1).rolling(window=int(d), center=False).mean()
    Y_tilde = np.log(S0/(k*S1))[d:-1]  # model implied OU value based on the deviation from k

    # TRADE THE PAIR BASED ON OU IMPLIED OVER-/UNDERVALUATION.
    t = np.linspace(start_year, end_year, int(len(S0)))
    S0 = S0[d:-1]; S1 = S1[d:-1]; k = k[d:-1]; t = t[d:-1]

    money = np.zeros(len(Y_tilde))  # start trading with no positions
    S0_held = np.zeros(len(Y_tilde))
    S1_held = np.zeros(len(Y_tilde))

    buy = []; sell = []
    for i in range(len(S0)):
        if Y_tilde.iloc[i] > buy_signal:  # short the pair when OU up
            S0_held[i] = S0_held[i-1] - 1
            S1_held[i] = S1_held[i-1] + S0.iloc[i]/S1.iloc[i]
            money[i] = money[i-1] - 0.005 * 2 * S0.iloc[i]  # 50 bps tx costs

            buy.append((t[i], S1.iloc[i]))
            sell.append((t[i], S0.iloc[i]))

        elif Y_tilde.iloc[i] < -buy_signal:  # long the pair when OU down
            S0_held[i] = S0_held[i-1] + 1
            S1_held[i] = S1_held[i-1] - S0.iloc[i]/S1.iloc[i]
            money[i] = money[i-1] - 0.005 * 2 * S0.iloc[i]

            buy.append((t[i], S0.iloc[i]))
            sell.append((t[i], S1.iloc[i]))

        elif abs(Y_tilde.iloc[i]) < sell_signal and S0_held[i-1] != 0:  # clear position when ratio normal
            money[i] = money[i-1] + (S0_held[i-1]*S0.iloc[i] + S1_held[i-1]*S1.iloc[i]) * 0.995
            S0_held[i], S1_held[i] = 0, 0

            if S0_held[i-1] < 0:
                buy.append((t[i], S0.iloc[i]))
                sell.append((t[i], S1.iloc[i]))
            else:
                buy.append((t[i], S1.iloc[i]))
                sell.append((t[i], S0.iloc[i]))

        else:
            money[i] = money[i-1]
            S0_held[i] = S0_held[i-1]
            S1_held[i] = S1_held[i-1]
    ret = 9e10 - money[-1]  # flip sign: minimizing this maximizes the final money
    return ret

#%% Optimize the buy/sell thresholds for a fixed window length
d = 252
n_param = 2
random_walks = 20
opt_object = returnMaximizer(random_walks, pairstrade_maxi, n_param, data, id0, id1, start_year, end_year, d)

#%% Window-length analysis on training and test data
buy_signal, sell_signal = opt_object.x[0], opt_object.x[1]
buy_signal, sell_signal = 0.05, 0.01  # override the optimizer output with hand-picked thresholds

windows = np.linspace(10, 700, 100)
train_return = np.zeros(len(windows)); test_return = np.zeros(len(windows))
for i in range(0, len(windows)):
    d = int(windows[i])
    param = np.array([d, buy_signal, sell_signal])  # d, buy_signal, sell_signal
    # training data evaluation
    moneyTr, _, _, _, _, _, _, _ = slb.pairstrade(param, data_train, id0, id1, names, start_year, end_year, plot=False)
    # test data evaluation
    moneyTe, _, _, _, _, _, _, _ = slb.pairstrade(param, data_test, id0, id1, names, start_yearTe, end_yearTe, plot=False)

    train_return[i] = moneyTr[-1]
    test_return[i] = moneyTe[-1]

#%% Plot window fit
plt.figure('FittingWindow')
plt.plot(windows, train_return, color='g', label='Training data')
plt.plot(windows, test_return, color='r', label='Test data')
plt.title('Window analysis')
plt.xlabel('Window length [days]'); plt.ylabel('Final money [EUR]')
plt.legend(); plt.show()
#%% Trade with the best window found on the training data
buy_signal, sell_signal = opt_object.x[0], opt_object.x[1]
best_window = windows[np.argmax(train_return)]
param = np.array([best_window, buy_signal, sell_signal])
moneybest, _, _, _, _, _, _, _ = slb.pairstrade(param, data_train, id0, id1, names, start_year, end_year, plot=True)
#%% Trade with hand-picked parameters for comparison
best_window = 252
buy_signal, sell_signal = 0.05, 0.01
param = np.array([best_window, buy_signal, sell_signal])
moneybest, _, _, _, _, _, _, _ = slb.pairstrade(param, data_train, id0, id1, names, start_year, end_year, plot=True)
--------------------------------------------------------------------------------
/Code/slib.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 27 18:37:42 2020

@author: alber
"""


import numpy as np
import matplotlib.pyplot as plt
import pandas_datareader.data as web
from matplotlib.collections import LineCollection
from sklearn import cluster, covariance, manifold, metrics
from sklearn.cluster import KMeans
import matplotlib.cm as cm
from tqdm import tqdm  # progress bar

from statsmodels.tsa.stattools import coint

def stockvals(df, start_date, end_date):
    """
    Converts opening/closing values of stocks from a pd dataframe to np arrays
    of floats and converts the names to arrays of strings
    IN:
        df : pandas dataframe of stocks
        start_date, end_date : datetime; start and end date
    RETURNS:
        names, symbols : str; stock names and symbols
        variation : f8[n_samples, n_timesteps]; difference between opening and closing price
    """
    # convert pd dataframes to strings
    symbols, names = df.Symbol, df.Security
    symbols = symbols.to_numpy()
    symbols = symbols.astype(str)
    names = names.to_numpy()
    names = names.astype(str)
    start_date_int = datetime_to_integer(start_date)
    # Stocks under consideration (from S&P500)
    n_stocks = len(symbols)
    # Open - closing value of stocks (as float)
    indices = []; open_val = []; close_val = []
    for j in tqdm(range(0, n_stocks), position=0, desc='Loading Stock Data'):
        if j == 91:  # hard-coded skip of one problematic ticker
            continue
        date_string = (df.iloc[j][6]).replace('-', '')  # column 6: date the stock was added to the index
        date_added = int(date_string[:8])
        if (date_added <= start_date_int):  # only keep stocks listed over the whole period
            index = j
            indices = np.append(indices, index)
            quotes = web.DataReader(symbols[j], 'yahoo', start_date, end_date)
            opening = quotes.Open
            closing = quotes.Close
            open_val = np.append(open_val, opening, axis=0)
            close_val = np.append(close_val, closing, axis=0)
    open_val = open_val.reshape(len(indices), -1)
    close_val = close_val.reshape(len(indices), -1)
    variation = open_val - close_val
    return names[indices.astype(int)], symbols[indices.astype(int)], variation, close_val, open_val

def datetime_to_integer(dt_time):
    # encode a datetime as a YYYYMMDD integer
    return 10000*dt_time.year + 100*dt_time.month + dt_time.day
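# Example: datetime_to_integer(datetime.datetime(2010, 1, 1)) returns 20100101.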

def blocking(tt, vector, blocks):
    """
    Helper to process big data files more easily by the method of block averaging.
    The first argument is a time grid, the second a data vector
    (e.g. averaged temperature or a stock price), the third the number of blocks.
    The more blocks, the more data points are taken into consideration;
    with fewer blocks, more averaging takes place.
    OUT: blockvec - block-averaged vector
         blocktt - timesteps according to the block-averaged data
         error - standard error of the block means
         sigmaB - standard deviation of the block means
         bdata - number of data points combined in one block
    """
    blockvec = np.zeros(blocks)
    elements = len(vector)
    rest = elements % blocks
    if rest != 0:  # truncate vector if the blocks don't fit in the vector evenly
        vector = vector[0:-rest]
        tt = tt[0:-rest]
        elements = len(vector)
    meanA = np.mean(vector)
    bdata = int(elements/blocks)  # how many points per block
    sigBsq = 0
    for k in range(0, blocks):
        blockvec[k] = np.average(vector[k*bdata : (k+1)*bdata])
        sigBsq = sigBsq + (blockvec[k]-meanA)**2
    sigBsq *= 1/(blocks-1)
    sigmaB = np.sqrt(sigBsq)
    error = 1/np.sqrt(blocks)*sigmaB
    blocktt = tt[0:-1:bdata]
    return (blockvec, blocktt, error, sigmaB, bdata)
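# Usage sketch (hypothetical numbers): block-average 1000 noisy samples into 10 blocks.
#   tt = np.arange(1000); vec = np.random.normal(size=1000)
#   blockvec, blocktt, error, sigmaB, bdata = blocking(tt, vec, 10)
#   # bdata == 100 points per block; error estimates the standard error of the mean.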

def covplot(cov, symbols, title):
    """
    Plots a covariance matrix cov as a heatmap. The x-/y-tick labels are the input symbols.
    cov : f8[n_samples, n_samples]; covariance matrix
    symbols : str[n_samples]; names of the x and y tick labels
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    im = ax.imshow(cov)
    plt.colorbar(im, spacing='proportional')
    # We want to show all ticks...
    ax.set_xticks(np.arange(len(symbols)))
    ax.set_yticks(np.arange(len(symbols)))
    # ... and label them with the respective list entries
    ax.set_xticklabels(symbols)
    ax.set_yticklabels(symbols)
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=60, ha="right",
             rotation_mode="anchor")

    ax.set_title(title)
    fig.tight_layout()
    plt.show()

def stockplot(X, stock, names, start_year, end_year, ylabel):
    """
    Plots stock values from a portfolio X from start_year to end_year
    X : f8[n_stock, n_timestep]; portfolio: open_value/close_value/variation of stocks
    stock : i4; stock under consideration from the portfolio X
    names : str[n_stocks]; names of stocks in the portfolio
    start_year, end_year : f8; start and end year under consideration
    ylabel : str; label of what is plotted (open_value/close_value/variation)
    """
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    major_ticks = np.arange(start_year, end_year+1, 1)
    ax.set_xticks(major_ticks)
    ax.grid(which='both')
    plt.plot(np.linspace(start_year, end_year, len(X[:, stock])), X[:, stock])
    plt.xlabel('Time [years]'); plt.ylabel(ylabel + ' [USD]')
    plt.title(names[stock])
    plt.show()

def k_means_ana(data, max_n_clusters, plot):
    """
    Analyses the best number of clusters according to the average silhouette score.
    Silhouette coefficients (as these values are referred to) near +1 indicate
    that the sample is far away from the neighboring clusters. A value of 0 indicates
    that the sample is on or very close to the decision boundary between two neighboring clusters,
    and negative values indicate that those samples might have been assigned to the wrong cluster.
    IN: data[n_samples, m_characteristics] - data to analyse
        max_n_clusters - integer; maximum number of clusters
        plot - boolean
    """
    # The range of clusters is 2..max_n_clusters
    range_n_clusters = np.arange(2, max_n_clusters+1)
    range_silhouette_avg = np.zeros(max_n_clusters-1)

    for n_clusters in range_n_clusters:  # loop over different numbers of clusters
        clusterer = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        cluster_labels = clusterer.fit_predict(data)
        silhouette_avg = metrics.silhouette_score(data, cluster_labels)
        range_silhouette_avg[n_clusters-2] = silhouette_avg

    if plot == True:
        plt.figure()
        plt.plot(range_n_clusters, range_silhouette_avg, '*-')
        plt.title(f'Cluster Analysis for data with {len(data[1]):d} characteristics', fontsize=18)
        plt.xlabel('Number of clusters', fontsize=16)
        plt.ylabel('Average Silhouette Number', fontsize=16)

def k_means_trafo(data, symbols, n_clusters, plot):
    """
    Computes the centers of K-Means clusters.
    If m_characteristics <= 3 the clusters can be visualized.
    IN: data[n_samples, m_characteristics] - data to analyse
        n_clusters - integer; number of clusters to find in the given dataset
    OUT: centers[n_centers, m_characteristics] - cluster centers for the given data
         cluster_labels[n_samples] - cluster labels for the data
    """
    # init: initialization method ('k-means++', 'random'); n_clusters: integer - number of cluster centroids;
    # n_init: integer - number of random restarts used to optimize the result
    clusterer = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    cluster_labels = clusterer.fit_predict(data)
    centers = clusterer.cluster_centers_

    if (plot == True and len(data[0]) <= 3):
        silhouette_avg = metrics.silhouette_score(data, cluster_labels)
        # Compute the silhouette scores for each sample
        sample_silhouette_values = metrics.silhouette_samples(data, cluster_labels)

        # Create a subplot with 1 row and 2 columns
        fig = plt.figure()
        ax1 = fig.add_subplot(1, 2, 1)  # plot: silhouette score
        fig.set_size_inches(13, 7)  # format size of subplots
        archColors = cm.plasma(np.arange(n_clusters).astype(float) / n_clusters)  # shape: (n_clusters, 4)
        dataColors = cm.plasma(cluster_labels.astype(float) / n_clusters)  # shape: (n_samples, 4)
        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1 to 1, but here the range is -0.2 to 1
        ax1.set_xlim([-0.2, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            #color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values, facecolors=archColors[i, :], alpha=1)
            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Compute the new y_lower for the next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        # Labeling for subplot 1: silhouette scores
        ax1.set_title(f"Silhouette plot for various clusters. Silhouette Average = {silhouette_avg:.3f}",
                      fontsize=18)
        ax1.set_xlabel("Silhouette coefficient values", fontsize=16)
        ax1.set_ylabel("Cluster label", fontsize=16)
        # The vertical line for the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        # 2nd plot showing the actual clusters formed
        if (len(data[0]) == 2):
            ax2 = fig.add_subplot(1, 2, 2)
            # Plot data
            ax2.scatter(data[:, 0], data[:, 1], c=dataColors, alpha=0.6, s=20)
            # Plot cluster centers
            #ax2.scatter(centers[:, 0], centers[:, 1], c=archColors, marker='X', edgecolor='k', alpha=1, s=200)
            ax2.set_title("K-Means clusters", fontsize=18)
            ax2.set_xlabel('1st component', fontsize=16)
            ax2.set_ylabel('2nd component', fontsize=16)
            for i, txt in enumerate(symbols):
                ax2.annotate(txt, (data[i, 0], data[i, 1]))
            plt.tight_layout()
            plt.show()

        elif len(data[0]) == 3:
            ax2 = fig.add_subplot(1, 2, 2, projection='3d')
            # Plot data
            ax2.scatter(data[:, 0], data[:, 1], data[:, 2], c=dataColors, alpha=0.6, s=5)
            # Plot cluster centers
            ax2.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c=archColors, marker='X', edgecolor='k', alpha=1, s=200)
            ax2.set_title("K-Means clusters of PCA reduced data.", fontsize=18)
            ax2.set_xlabel('1st pca component', fontsize=16)
            ax2.set_ylabel('2nd pca component', fontsize=16)
            ax2.set_zlabel('3rd pca component', fontsize=16)
            plt.tight_layout()
            plt.show()
    return centers, cluster_labels

def nodeplt(embedding, labels, symbols, partial_correlations, threshold):
    """
    Plots a map of the samples. The node positions are the 2D embedding of the covariance of
    the samples. The colour indicates the group to which a sample belongs;
    the edges represent the connectivity to the other nodes (partial correlation).
    IN:
        embedding : f8[2, n_samples]; a 2D embedding of the covariance matrix
        labels : labels[n_samples] - cluster labels for the data
        symbols : str[n_samples]; names of stocks in the portfolio
        partial_correlations : f8[n_samples, n_samples]; partial correlation between samples (connectivity)
        threshold : f8; cut-off value for the partial correlation
    """
    x = embedding[0, :]; y = embedding[1, :]
    fig, ax = plt.subplots()
    ax.scatter(x, y, c=labels)
    for i, txt in enumerate(symbols):
        ax.annotate(txt, (x[i], y[i]))

    # Display a graph of the partial correlations
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > threshold)  # upper-triangular entries where partial corr > threshold
    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    # a sequence of (*line0*, *line1*, *line2*), where::
    #   linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)
    axcb = fig.colorbar(lc)
    axcb.set_label('Partial Correlation')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    plt.show()

def find_cointegrated_pairs(data):
    """
    Runs the Engle-Granger cointegration test on every pair of rows in data.
    RETURNS: score_matrix, pvalue_matrix and the list of index pairs with p-value < 0.05.
    """
    n = data.shape[0]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    pairs = []
    for i in range(n):
        print(i)  # progress indicator
        for j in range(i+1, n):
            S1 = data[i]
            S2 = data[j]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.05:
                pairs.append((i, j))
    return score_matrix, pvalue_matrix, pairs
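# Usage sketch: screen the saved closing prices for cointegrated pairs.
#   scores, pvalues, pairs = find_cointegrated_pairs(close_val)
#   # pairs lists index tuples (i, j) whose cointegration p-value is below 0.05.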

def zscore(X):
    # standard score: deviation from the mean in units of the standard deviation
    return (X - X.mean()) / np.std(X)

def cointplt(data, pair, names, start_year, end_year, plot):
    index_s1 = pair[0]; index_s2 = pair[1]

    score, pvalue, _ = coint(data[index_s1], data[index_s2])
    ratio = data[index_s1] / data[index_s2]
    z_ratio = zscore(ratio)
    if plot == True:
        X = z_ratio

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        major_ticks = np.arange(start_year, end_year+1, 1)
        ax.set_xticks(major_ticks)
        ax.grid(which='both')

        plt.plot(np.linspace(start_year, end_year, len(X[:])), X[:])
        plt.axhline(z_ratio.mean(), color='black', ls='--')
        plt.axhline(1.0, color='red', ls='--')
        plt.axhline(-1.0, color='green', ls='--')

        plt.xlabel('Time [years]'); plt.ylabel('Ratio')
        plt.suptitle(names[pair[0]] + ' / ' + names[pair[1]])
        plt.title(f'Coint-Pvalue = {pvalue:.3} ')
        plt.show()
    return pvalue

def hurstcoeff(close_val, pairs, names, plot):
    S1 = pairs[0]; S2 = pairs[1]
    ts = close_val[S1]/close_val[S2]
    lags = np.arange(2, 400, 1)
    tauvec = []
    for i in range(0, len(lags)):
        lag = lags[i]
        tau = np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag])))
        tauvec = np.append(tauvec, tau)

    poly = np.polyfit(np.log(lags), np.log(tauvec), 1)
    H = poly[0]*2  # coefficients ordered s.t. the highest power comes first
    if plot == True:
        plt.figure()
        plt.scatter(np.log(lags), np.log(tauvec), color='black', s=2, label=r'$<|ln(S_{t+\tau})-ln(S_t)|^2>$')
        plt.plot(np.log(lags), poly[1]+np.log(lags)*0.5*H, color='green',
                 label='Linear fit: '+r'$f(\tau)$= '+f'{poly[1]:.2} + {poly[0]:.2}'+r'$\tau$')
        plt.xlabel(r'$ln(\tau)$')
        plt.suptitle(f'Estimation of the Hurst coeff. H = {H:.2}')
        plt.title(names[pairs[0]] + '/' + names[pairs[1]])
        plt.legend(); plt.show()
    return H
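# Interpretation (standard result, not specific to this repository):
# H < 0.5 indicates an anti-persistent, mean-reverting series, H = 0.5 a random
# walk, and H > 0.5 a trending series. Pairs trading relies on the price ratio
# having a Hurst coefficient well below 0.5.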

def pairstrade(param, data, id0, id1, names, start_year, end_year, plot):
    S0 = data.iloc[id0][:]; S1 = data.iloc[id1][:]
    d, buy_signal, sell_signal = param[0], param[1], param[2]
    d = int(d)
    k = (S0/S1).rolling(window=d, center=False).mean()
    Y_tilde = np.log(S0/(k*S1))[d:-1]  # model implied OU value based on the deviation from k

    # TRADE THE PAIR BASED ON OU IMPLIED OVER-/UNDERVALUATION.
    t = np.linspace(start_year, end_year, int(len(S0)))
    S0 = S0[d:-1]; S1 = S1[d:-1]; k = k[d:-1]; t = t[d:-1]

    money = np.zeros(len(Y_tilde))  # start trading with no positions
    S0_held = np.zeros(len(Y_tilde))
    S1_held = np.zeros(len(Y_tilde))

    buy = []; sell = []
    for i in range(len(S0)):
        if Y_tilde.iloc[i] > buy_signal:  # short the pair when OU up
            S0_held[i] = S0_held[i-1] - 1
            S1_held[i] = S1_held[i-1] + S0.iloc[i]/S1.iloc[i]
            money[i] = money[i-1] - 0.005 * 2 * S0.iloc[i]  # 50 bps tx costs

            buy.append((t[i], S1.iloc[i]))
            sell.append((t[i], S0.iloc[i]))

        elif Y_tilde.iloc[i] < -buy_signal:  # long the pair when OU down
            S0_held[i] = S0_held[i-1] + 1
            S1_held[i] = S1_held[i-1] - S0.iloc[i]/S1.iloc[i]
            money[i] = money[i-1] - 0.005 * 2 * S0.iloc[i]

            buy.append((t[i], S0.iloc[i]))
            sell.append((t[i], S1.iloc[i]))

        elif abs(Y_tilde.iloc[i]) < sell_signal and S0_held[i-1] != 0:  # clear position when ratio normal
            money[i] = money[i-1] + (S0_held[i-1]*S0.iloc[i] + S1_held[i-1]*S1.iloc[i]) * 0.995
            S0_held[i], S1_held[i] = 0, 0

            if S0_held[i-1] < 0:
                buy.append((t[i], S0.iloc[i]))
                sell.append((t[i], S1.iloc[i]))
            else:
                buy.append((t[i], S1.iloc[i]))
                sell.append((t[i], S0.iloc[i]))

        else:
            money[i] = money[i-1]
            S0_held[i] = S0_held[i-1]
            S1_held[i] = S1_held[i-1]
    buy, sell = np.asarray(buy), np.asarray(sell)
    if plot == True and len(buy) != 0:
        size = (16, 8)
        # Stock pair ratio and its moving average
        fig = plt.figure(figsize=size)
        ax = fig.add_subplot(1, 1, 1)
        major_ticks = np.arange(start_year, end_year+1, 1)
        ax.set_xticks(major_ticks)
        ax.grid(which='both')
        plt.plot(t, S0/S1, 'black', label='current k')
        plt.plot(t, k, 'green', label=f'{int(param[0])}-day moving average')
        plt.title('Stock Pair Ratio')
        plt.xlabel('t [years]'); plt.ylabel('k')
        plt.legend(); plt.show()

        # Buy / sell signals
        fig = plt.figure(figsize=size)
        ax = fig.add_subplot(1, 1, 1)
        major_ticks = np.arange(start_year, end_year+1, 1)
        ax.set_xticks(major_ticks)
        ax.grid(which='both')
        plt.plot(t, S0, 'black', label=names[id0]); plt.plot(t, S1, 'black', linestyle='dotted', label=names[id1])
        plt.scatter(buy[:, 0], buy[:, 1], color='g', linestyle='None', marker='^', label='Buy Signal')
        plt.scatter(sell[:, 0], sell[:, 1], color='r', linestyle='None', marker='^', label='Sell Signal')
        plt.title('Buy / Sell signal'); plt.xlabel('Time [years]'); plt.ylabel('Stock value [EUR]')
        plt.legend(); plt.show()

        # Money over time
        fig = plt.figure(figsize=size)
        ax = fig.add_subplot(1, 1, 1)
        major_ticks = np.arange(start_year, end_year+1, 1)
        ax.set_xticks(major_ticks)
        ax.grid(which='both')
        plt.plot(t, money, 'black')
        plt.title('Trade performance')
        plt.xlabel('Time [years]'); plt.ylabel('Money [EUR]')
        plt.show()

    return money, buy, sell, k, Y_tilde, t, S0, S1
--------------------------------------------------------------------------------