├── .DS_Store ├── README.md ├── .gitignore ├── denoise.py ├── CovMatrix.py ├── 3_Distance_Metrics.ipynb ├── 5_financial_lables.ipynb └── 4_Optimal_Clustering.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuangology/Machine-Learning-for-Asset-Managers/HEAD/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-for-Asset-Managers 2 | Implementation of code snippets and exercises in the book Machine Learning for Asset Managers written by Prof. Marcos López de Prado. 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /denoise.py: -------------------------------------------------------------------------------- 1 | import numpy as np,pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | #%% md 5 | 6 | ## The Marcenko-Pastur Theorem 7 | 8 | 9 | #%% md 10 | 11 | ### SNIPPET 2.1 12 | 13 | #%% 14 | 15 | 16 | #--------------------------------------------------- 17 | 18 | def mpPDF(var,q,pts): 19 | # Marcenko-Pastur pdf 20 | # q=T/N 21 | # when var= 1, C = T^-1 X'X is the correlation matrix associated with X 22 | # lambda+ =,lambda- = eMax, eMin 23 | eMin,eMax=var*(1-(1./q)**.5)**2,var*(1+(1./q)**.5)**2 24 | eVal=np.linspace(eMin,eMax,pts) 25 | pdf=q/(2*np.pi*var*eVal)*((eMax-eVal)*(eVal-eMin))**.5 26 | #pdf = pdf.ravel() 27 | pdf=pd.Series(pdf,index=eVal) 28 | return pdf 29 | 30 | 31 | #%% md 32 | 33 | ### SNIPPET 2.2 34 | 35 | #%% 36 | 37 | from sklearn.neighbors.kde import KernelDensity 38 | #--------------------------------------------------- 39 | def getPCA(matrix): 40 | # Get eVal,eVec from a Hermitian matrix 41 | eVal,eVec=np.linalg.eigh(matrix) 42 | indices=eVal.argsort()[::-1] # arguments for sorting eVal desc 43 | eVal,eVec=eVal[indices],eVec[:,indices] 44 | eVal=np.diagflat(eVal) 45 | return eVal,eVec 46 | #--------------------------------------------------- 47 | def fitKDE(obs,bWidth=.25,kernel='gaussian',x=None): 48 | # Fit kernel to a series of obs, and derive the prob of obs 49 | # x is the array of values on which the fit KDE will be evaluated 50 | if len(obs.shape)==1: 51 | obs=obs.reshape(-1,1) 52 | kde=KernelDensity(kernel=kernel,bandwidth=bWidth).fit(obs) 53 | if x is None: 54 | x=np.unique(obs).reshape(-1,1) 55 | if len(x.shape)==1: 56 | x=x.reshape(-1,1) 57 | logProb=kde.score_samples(x) # log(density) 58 | pdf=pd.Series(np.exp(logProb),index=x.flatten()) 59 | return pdf 60 | 61 | 62 | #%% 63 | 64 | #--------------------------------------------------- 65 | x=np.random.normal(size=(10000,1000)) 66 | eVal0,eVec0=getPCA(np.corrcoef(x,rowvar=False)) # each column is a variable 67 | pdf0=mpPDF(1.,q=x.shape[0]/float(x.shape[1]),pts=1000) 68 | pdf1=fitKDE(np.diag(eVal0),bWidth=.01) # empirical pdf 69 | ax = plt.figure().add_subplot(111) 70 | ax.plot(pdf0,label= 'Marcenko-Pastur') 71 | ax.plot(pdf1,linestyle = '--',label= 'Empirical:KDE') 72 | ax.set_xlabel(r'$\lambda$') 73 | ax.set_ylabel(r'prob[$\lambda$]') 74 | ax.legend() 75 | 76 | #%% md 77 | 78 | 79 | ## Random Matrix with Signal (not perfectly random) 80 | 81 | #%% 82 | 83 | #SNIPPET 2.3 ADD SIGNAL TO A RANDOM COVARIANCE MATRIX 84 | def getRndCov(nCols,nFacts): 85 | w=np.random.normal(size=(nCols,nFacts)) 86 | cov=np.dot(w,w.T) # random cov matrix, however not full rank 87 | cov+=np.diag(np.random.uniform(size=nCols)) # full rank cov 88 | return cov 89 | #--------------------------------------------------- 90 | def cov2corr(cov): 91 | # Derive the correlation matrix from a covariance matrix 92 | std=np.sqrt(np.diag(cov)) 93 | corr=cov/np.outer(std,std) 94 | corr[corr<-1],corr[corr>1]=-1,1 # numerical error 95 | return corr 96 | #--------------------------------------------------- 97 | alpha,nCols,nFact,q=.995,1000,100,10 98 | cov=np.cov(np.random.normal(size=(nCols*q,nCols)),rowvar=False) 99 | cov=alpha*cov+(1-alpha)*getRndCov(nCols,nFact) # noise+signal 100 | corr0=cov2corr(cov) 101 | eVal0,eVec0=getPCA(corr0) 102 | 103 | #%% 104 | 105 | #SNIPPET 2.4 FITTING THE MARCENKO–PASTUR PDF 106 | from scipy.optimize import minimize 107 | #--------------------------------------------------- 108 | def errPDFs(var,eVal,q,bWidth,pts=1000): 109 | # Fit error 110 | var = var[0] 111 | pdf0=mpPDF(var,q,pts) # theoretical pdf 112 | pdf1=fitKDE(eVal,bWidth,x=pdf0.index.values) # empirical pdf 113 | #import pdb; pdb.set_trace() 114 | sse=np.sum((pdf1-pdf0)**2) 115 | return sse 116 | #--------------------------------------------------- 117 | def findMaxEval(eVal,q,bWidth): 118 | # Find max random eVal by fitting Marcenko’s dist 119 | out=minimize(lambda *x: errPDFs(*x),.5,args=(eVal,q,bWidth),bounds=((1E-5,1-1E-5),)) 120 | if out['success']: 121 | var=out['x'][0] 122 | else: 123 | var=1 124 | eMax=var*(1+(1./q)**.5)**2 125 | return eMax,var 126 | #--------------------------------------------------- 127 | eMax0,var0=findMaxEval(np.diag(eVal0),q,bWidth=.01) 128 | nFacts0=eVal0.shape[0]-np.diag(eVal0)[::-1].searchsorted(eMax0) 129 | 130 | # nFacts0 gives the number of the eigenvalue is assumed to be important (cutoff level lambda+ adjusted for the presence of nonrandom eigenvectors) 131 | 132 | #%% 133 | 134 | #--------------------------------------------------- 135 | # Fitting the Marcenko–Pastur PDF on a noisy covariance matrix. 136 | # estimate the sigma for Marcenko-Pastur dist 137 | bWidth=0.01 138 | out=minimize(lambda *x: errPDFs(*x),.5,args=(np.diag(eVal0),q,bWidth),bounds=((1E-5,1-1E-5),)) 139 | if out['success']: 140 | var=out['x'][0] 141 | else: 142 | var=1 143 | 144 | pdf0=mpPDF(var,q,pts=1000) # Marcenko-Pastur dist 145 | pdf1=fitKDE(np.diag(eVal0),bWidth=.01) # empirical pdf 146 | ax = plt.figure().add_subplot(111) 147 | ax.plot(pdf0,label= 'Marcenko-Pastur dist') 148 | ax.bar(pdf1.index,pdf1.values,width = bWidth,label= 'Empirical dist',color = 'darkorange') 149 | ax.set_xlabel(r'$\lambda$') 150 | ax.set_ylabel(r'prob[$\lambda$]') 151 | ax.legend() 152 | 153 | #%% md 154 | 155 | ## 2.5 Denoising 156 | 157 | #%% md 158 | 159 | ### 2.5.1 Constant Residual Eigenvalue Method 160 | 161 | setting a constant eigenvalue for all random eigenvectors. 162 | 163 | #%% 164 | 165 | def denoisedCorr(eVal,eVec,nFacts): 166 | # Remove noise from corr by fixing random eigenvalues 167 | eVal_=np.diag(eVal).copy() 168 | eVal_[nFacts:]=eVal_[nFacts:].sum()/float(eVal_.shape[0]-nFacts) # average the rest 169 | eVal_=np.diag(eVal_) 170 | corr1=np.dot(eVec,eVal_).dot(eVec.T) 171 | corr1=cov2corr(corr1) 172 | return corr1 173 | #--------------------------------------------------- 174 | corr1=denoisedCorr(eVal0,eVec0,nFacts0) 175 | eVal1,eVec1=getPCA(corr1) 176 | 177 | #%% 178 | 179 | # A comparison of eigenvalues before and after applying the residual eigenvalue method. 180 | ax = plt.figure().add_subplot(111) 181 | ax.plot(np.diagonal(eVal0),label = 'Original eigen-function') 182 | ax.plot(np.diagonal(eVal1),label = 'Denoised eigen-function (Constant Residual)',linestyle = '--') 183 | ax.legend() 184 | ax.set_yscale('log') 185 | ax.set_xlabel('Eigenvalue number') 186 | ax.set_ylabel('Eigenvalue (log-scale)') 187 | 188 | #%% md 189 | 190 | ### 2.5.2 Targeted Shrinkage 191 | $\alpha$ regulates the amount fo shrinkage among the eigen vectors 192 | 193 | #%% 194 | 195 | #SNIPPET 2.6 DENOISING BY TARGETED SHRINKAGE 196 | def denoisedCorr2(eVal,eVec,nFacts,alpha=0): 197 | # Remove noise from corr through targeted shrinkage 198 | eValL,eVecL=eVal[:nFacts,:nFacts],eVec[:,:nFacts] 199 | eValR,eVecR=eVal[nFacts:,nFacts:],eVec[:,nFacts:] 200 | corr0=np.dot(eVecL,eValL).dot(eVecL.T) 201 | corr1=np.dot(eVecR,eValR).dot(eVecR.T) 202 | corr2=corr0+alpha*corr1+(1-alpha)*np.diag(np.diag(corr1)) 203 | return corr2 204 | #--------------------------------------------------- 205 | corr1=denoisedCorr2(eVal0,eVec0,nFacts0,alpha=.5) 206 | eVal1,eVec1=getPCA(corr1) 207 | 208 | #%% 209 | 210 | # A comparison of eigenvalues before and after applying the residual eigenvalue method. 211 | ax = plt.figure().add_subplot(111) 212 | ax.plot(np.diagonal(eVal0),label = 'Original eigen-function') 213 | ax.plot(np.diagonal(eVal1),label = 'Denoised eigen-function (targeted shrinkage)',linestyle = '--') 214 | ax.legend() 215 | ax.set_yscale('log') 216 | ax.set_xlabel('Eigenvalue number') 217 | ax.set_ylabel('Eigenvalue (log-scale)') 218 | 219 | #%% md 220 | 221 | # Experimental Results 222 | ## 2.7.1 Minimum Variance Portfolio 223 | 224 | #%% 225 | 226 | def corr2cov(corr,std): 227 | # Derive the covariance matrix from a correlation matrix 228 | corr[corr<-1],corr[corr>1]=-1,1 # numerical error 229 | cov = np.outer(std,std)*corr 230 | return cov 231 | 232 | #%% 233 | 234 | 235 | #SNIPPET 2.7 GENERATING A BLOCK-DIAGONAL COVARIANCE MATRIX AND A VECTOR OF MEANS 236 | def formBlockMatrix(nBlocks,bSize,bCorr): 237 | block=np.ones((bSize,bSize))*bCorr 238 | block[range(bSize),range(bSize)]=1 239 | corr=block_diag(*([block]*nBlocks)) 240 | return corr 241 | #--------------------------------------------------- 242 | def formTrueMatrix(nBlocks,bSize,bCorr): 243 | #In each block, the variances are drawn from a uniform distribution bounded between 5% and 20%; the vector of means is drawn from a Normal distribution with mean and standard deviation equal to the standard deviation from the covariance matrix 244 | corr0=formBlockMatrix(nBlocks,bSize,bCorr) 245 | corr0=pd.DataFrame(corr0) 246 | cols=corr0.columns.tolist() 247 | np.random.shuffle(cols) 248 | corr0=corr0[cols].loc[cols].copy(deep=True) 249 | std0=np.random.uniform(.05,.2,corr0.shape[0]) 250 | cov0=corr2cov(corr0,std0) 251 | mu0=np.random.normal(std0,std0,cov0.shape[0]).reshape(-1,1) 252 | return mu0,cov0 253 | #--------------------------------------------------- 254 | from scipy.linalg import block_diag 255 | from sklearn.covariance import LedoitWolf 256 | nBlocks,bSize,bCorr=10,50,.5 257 | np.random.seed(0) 258 | mu0,cov0=formTrueMatrix(nBlocks,bSize,bCorr) 259 | 260 | #%% 261 | 262 | #SNIPPET 2.8 GENERATING THE EMPIRICAL COVARIANCE MATRIX 263 | def simCovMu(mu0,cov0,nObs,shrink=False): 264 | x=np.random.multivariate_normal(mu0.flatten(),cov0,size=nObs) 265 | mu1=x.mean(axis=0).reshape(-1,1) 266 | if shrink: 267 | cov1=LedoitWolf().fit(x).covariance_ 268 | else: 269 | cov1=np.cov(x,rowvar=0) 270 | return mu1,cov1 271 | 272 | #%% 273 | 274 | # SNIPPET 2.9 DENOISING OF THE EMPIRICAL COVARIANCE MATRIX 275 | def deNoiseCov(cov0,q,bWidth): 276 | corr0=cov2corr(cov0) 277 | eVal0,eVec0=getPCA(corr0) 278 | eMax0,var0=findMaxEval(np.diag(eVal0),q,bWidth) 279 | nFacts0=eVal0.shape[0]-np.diag(eVal0)[::-1].searchsorted(eMax0) 280 | corr1=denoisedCorr(eVal0,eVec0,nFacts0) 281 | cov1=corr2cov(corr1,np.diag(cov0)**.5) 282 | return cov1 283 | 284 | #%% 285 | 286 | #SNIPPET 2.10 DENOISING OF THE EMPIRICAL COVARIANCE MATRIX 287 | def optPort(cov,mu=None): # optimal portfolio for minimum variance 288 | inv=np.linalg.inv(cov) 289 | ones=np.ones(shape=(inv.shape[0],1)) 290 | if mu is None: 291 | mu=ones 292 | w=np.dot(inv,mu) 293 | w/=np.dot(ones.T,w) 294 | return w -------------------------------------------------------------------------------- /CovMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Chapter 2 Denoising and Detoning 5 | # 6 | # reduce the noise and enhance the signal included in an empirical covariance matrix. 7 | 8 | # In[1]: 9 | 10 | 11 | import numpy as np,pandas as pd 12 | import matplotlib.pyplot as plt 13 | from tqdm import tqdm 14 | 15 | # ## The Marcenko-Pastur Theorem 16 | # 17 | 18 | # ### SNIPPET 2.1 19 | 20 | # In[2]: 21 | 22 | 23 | 24 | #--------------------------------------------------- 25 | 26 | def mpPDF(var,q,pts): 27 | # Marcenko-Pastur pdf 28 | # q=T/N 29 | # when var= 1, C = T^-1 X'X is the correlation matrix associated with X 30 | # lambda+ =,lambda- = eMax, eMin 31 | eMin,eMax=var*(1-(1./q)**.5)**2,var*(1+(1./q)**.5)**2 32 | eVal=np.linspace(eMin,eMax,pts) 33 | pdf=q/(2*np.pi*var*eVal)*((eMax-eVal)*(eVal-eMin))**.5 34 | #pdf = pdf.ravel() 35 | pdf=pd.Series(pdf,index=eVal) 36 | return pdf 37 | 38 | 39 | # ### SNIPPET 2.2 40 | 41 | # In[3]: 42 | 43 | 44 | from sklearn.neighbors.kde import KernelDensity 45 | #--------------------------------------------------- 46 | def getPCA(matrix): 47 | # Get eVal,eVec from a Hermitian matrix 48 | eVal,eVec=np.linalg.eigh(matrix) 49 | indices=eVal.argsort()[::-1] # arguments for sorting eVal desc 50 | eVal,eVec=eVal[indices],eVec[:,indices] 51 | eVal=np.diagflat(eVal) 52 | return eVal,eVec 53 | #--------------------------------------------------- 54 | def fitKDE(obs,bWidth=.25,kernel='gaussian',x=None): 55 | # Fit kernel to a series of obs, and derive the prob of obs 56 | # x is the array of values on which the fit KDE will be evaluated 57 | if len(obs.shape)==1: 58 | obs=obs.reshape(-1,1) 59 | kde=KernelDensity(kernel=kernel,bandwidth=bWidth).fit(obs) 60 | if x is None: 61 | x=np.unique(obs).reshape(-1,1) 62 | if len(x.shape)==1: 63 | x=x.reshape(-1,1) 64 | logProb=kde.score_samples(x) # log(density) 65 | pdf=pd.Series(np.exp(logProb),index=x.flatten()) 66 | return pdf 67 | 68 | 69 | 70 | 71 | # 72 | # ## Random Matrix with Signal (not perfectly random) 73 | 74 | # In[5]: 75 | 76 | 77 | #SNIPPET 2.3 ADD SIGNAL TO A RANDOM COVARIANCE MATRIX 78 | def getRndCov(nCols,nFacts): 79 | w=np.random.normal(size=(nCols,nFacts)) 80 | cov=np.dot(w,w.T) # random cov matrix, however not full rank 81 | cov+=np.diag(np.random.uniform(size=nCols)) # full rank cov 82 | return cov 83 | #--------------------------------------------------- 84 | def cov2corr(cov): 85 | # Derive the correlation matrix from a covariance matrix 86 | std=np.sqrt(np.diag(cov)) 87 | corr=cov/np.outer(std,std) 88 | corr[corr<-1],corr[corr>1]=-1,1 # numerical error 89 | return corr 90 | 91 | 92 | 93 | #SNIPPET 2.4 FITTING THE MARCENKO–PASTUR PDF 94 | from scipy.optimize import minimize 95 | #--------------------------------------------------- 96 | def errPDFs(var,eVal,q,bWidth,pts=1000): 97 | # Fit error 98 | var = var[0] 99 | pdf0=mpPDF(var,q,pts) # theoretical pdf 100 | pdf1=fitKDE(eVal,bWidth,x=pdf0.index.values) # empirical pdf 101 | #import pdb; pdb.set_trace() 102 | sse=np.sum((pdf1-pdf0)**2) 103 | return sse 104 | #--------------------------------------------------- 105 | def findMaxEval(eVal,q,bWidth): 106 | # Find max random eVal by fitting Marcenko’s dist 107 | out=minimize(lambda *x: errPDFs(*x),.5,args=(eVal,q,bWidth),bounds=((1E-5,1-1E-5),)) 108 | if out['success']: 109 | var=out['x'][0] 110 | else: 111 | var=1 112 | eMax=var*(1+(1./q)**.5)**2 113 | return eMax,var 114 | 115 | # ax = plt.figure().add_subplot(111) 116 | # ax.plot(pdf0,label= 'Marcenko-Pastur') 117 | # ax.plot(pdf1,linestyle = '--',label= 'Empirical:KDE') 118 | # ax.set_xlabel(r'$\lambda$') 119 | # ax.set_ylabel(r'prob[$\lambda$]') 120 | # ax.legend() 121 | 122 | 123 | #--------------------------------------------------- 124 | 125 | #--------------------------------------------------- 126 | 127 | 128 | # nFacts0 gives the number of the eigenvalue is assumed to be important (cutoff level lambda+ adjusted for the presence of nonrandom eigenvectors) 129 | 130 | 131 | # In[7]: 132 | 133 | 134 | #--------------------------------------------------- 135 | # Fitting the Marcenko–Pastur PDF on a noisy covariance matrix. 136 | # estimate the sigma for Marcenko-Pastur dist 137 | # bWidth=0.01 138 | # out=minimize(lambda *x: errPDFs(*x),.5,args=(np.diag(eVal0),q,bWidth),bounds=((1E-5,1-1E-5),)) 139 | # if out['success']: 140 | # var=out['x'][0] 141 | # else: 142 | # var=1 143 | # 144 | # pdf0=mpPDF(var,q,pts=1000) # Marcenko-Pastur dist 145 | # pdf1=fitKDE(np.diag(eVal0),bWidth=.01) # empirical pdf 146 | # ax = plt.figure().add_subplot(111) 147 | # ax.plot(pdf0,label= 'Marcenko-Pastur dist') 148 | # ax.bar(pdf1.index,pdf1.values,width = bWidth,label= 'Empirical dist',color = 'darkorange') 149 | # ax.set_xlabel(r'$\lambda$') 150 | # ax.set_ylabel(r'prob[$\lambda$]') 151 | # ax.legend() 152 | 153 | 154 | # ## 2.5 Denoising 155 | 156 | # ### 2.5.1 Constant Residual Eigenvalue Method 157 | # 158 | # setting a constant eigenvalue for all random eigenvectors. 159 | 160 | # In[8]: 161 | 162 | 163 | def denoisedCorr(eVal,eVec,nFacts): 164 | # Remove noise from corr by fixing random eigenvalues 165 | eVal_=np.diag(eVal).copy() 166 | eVal_[nFacts:]=eVal_[nFacts:].sum()/float(eVal_.shape[0]-nFacts) # average the rest 167 | eVal_=np.diag(eVal_) 168 | corr1=np.dot(eVec,eVal_).dot(eVec.T) 169 | corr1=cov2corr(corr1) 170 | return corr1 171 | #--------------------------------------------------- 172 | 173 | 174 | 175 | # In[9]: 176 | 177 | 178 | # # A comparison of eigenvalues before and after applying the residual eigenvalue method. 179 | # ax = plt.figure().add_subplot(111) 180 | # ax.plot(np.diagonal(eVal0),label = 'Original eigen-function') 181 | # ax.plot(np.diagonal(eVal1),label = 'Denoised eigen-function (Constant Residual)',linestyle = '--') 182 | # ax.legend() 183 | # ax.set_yscale('log') 184 | # ax.set_xlabel('Eigenvalue number') 185 | # ax.set_ylabel('Eigenvalue (log-scale)') 186 | 187 | 188 | # ### 2.5.2 Targeted Shrinkage 189 | # $\alpha$ regulates the amount fo shrinkage among the eigen vectors 190 | 191 | # In[10]: 192 | 193 | 194 | #SNIPPET 2.6 DENOISING BY TARGETED SHRINKAGE 195 | def denoisedCorr2(eVal,eVec,nFacts,alpha=0.0): 196 | # Remove noise from corr through targeted shrinkage 197 | eValL,eVecL=eVal[:nFacts,:nFacts],eVec[:,:nFacts] 198 | eValR,eVecR=eVal[nFacts:,nFacts:],eVec[:,nFacts:] 199 | corr0=np.dot(eVecL,eValL).dot(eVecL.T) 200 | corr1=np.dot(eVecR,eValR).dot(eVecR.T) 201 | corr2=corr0+alpha*corr1+(1-alpha)*np.diag(np.diag(corr1)) 202 | return corr2 203 | #--------------------------------------------------- 204 | 205 | 206 | 207 | # In[11]: 208 | 209 | 210 | # # A comparison of eigenvalues before and after applying the residual eigenvalue method. 211 | # ax = plt.figure().add_subplot(111) 212 | # ax.plot(np.diagonal(eVal0),label = 'Original eigen-function') 213 | # ax.plot(np.diagonal(eVal1),label = 'Denoised eigen-function (targeted shrinkage)',linestyle = '--') 214 | # ax.legend() 215 | # ax.set_yscale('log') 216 | # ax.set_xlabel('Eigenvalue number') 217 | # ax.set_ylabel('Eigenvalue (log-scale)') 218 | 219 | 220 | # # Experimental Results 221 | # ## 2.7.1 Minimum Variance Portfolio 222 | 223 | # In[12]: 224 | 225 | 226 | def corr2cov(corr,std): 227 | # Derive the covariance matrix from a correlation matrix 228 | corr[corr<-1],corr[corr>1]=-1,1 # numerical error 229 | cov = np.outer(std,std)*corr 230 | return cov 231 | 232 | 233 | # In[13]: 234 | 235 | 236 | 237 | #SNIPPET 2.7 GENERATING A BLOCK-DIAGONAL COVARIANCE MATRIX AND A VECTOR OF MEANS 238 | def formBlockMatrix(nBlocks,bSize,bCorr): 239 | block=np.ones((bSize,bSize))*bCorr 240 | block[range(bSize),range(bSize)]=1 241 | corr=block_diag(*([block]*nBlocks)) 242 | return corr 243 | #--------------------------------------------------- 244 | def formTrueMatrix(nBlocks,bSize,bCorr): 245 | #In each block, the variances are drawn from a uniform distribution bounded between 5% and 20%; the vector of means is drawn from a Normal distribution with mean and standard deviation equal to the standard deviation from the covariance matrix 246 | corr0=formBlockMatrix(nBlocks,bSize,bCorr) 247 | corr0=pd.DataFrame(corr0) 248 | cols=corr0.columns.tolist() 249 | np.random.shuffle(cols) 250 | corr0=corr0[cols].loc[cols].copy(deep=True) 251 | std0=np.random.uniform(.05,.2,corr0.shape[0]) 252 | cov0=corr2cov(corr0,std0) 253 | mu0=np.random.normal(std0,std0,cov0.shape[0]).reshape(-1,1) 254 | return mu0,cov0 255 | #--------------------------------------------------- 256 | from scipy.linalg import block_diag 257 | from sklearn.covariance import LedoitWolf 258 | nBlocks,bSize,bCorr=10,50,.5 259 | np.random.seed(0) 260 | mu0,cov0=formTrueMatrix(nBlocks,bSize,bCorr) 261 | 262 | 263 | # In[14]: 264 | 265 | 266 | #SNIPPET 2.8 GENERATING THE EMPIRICAL COVARIANCE MATRIX 267 | def simCovMu(mu0,cov0,nObs,shrink=False): 268 | x=np.random.multivariate_normal(mu0.flatten(),cov0,size=nObs) 269 | mu1=x.mean(axis=0).reshape(-1,1) 270 | if shrink: 271 | cov1=LedoitWolf().fit(x).covariance_ 272 | else: 273 | cov1=np.cov(x,rowvar=0) 274 | return mu1,cov1 275 | 276 | 277 | # In[15]: 278 | 279 | 280 | # SNIPPET 2.9 DENOISING OF THE EMPIRICAL COVARIANCE MATRIX 281 | def deNoiseCov(cov0,q,bWidth): 282 | corr0=cov2corr(cov0) 283 | eVal0,eVec0=getPCA(corr0) 284 | eMax0,var0=findMaxEval(np.diag(eVal0),q,bWidth) 285 | nFacts0=eVal0.shape[0]-np.diag(eVal0)[::-1].searchsorted(eMax0) 286 | corr1=denoisedCorr(eVal0,eVec0,nFacts0) 287 | cov1=corr2cov(corr1,np.diag(cov0)**.5) 288 | return cov1 289 | 290 | 291 | # In[16]: 292 | 293 | 294 | #SNIPPET 2.10 DENOISING OF THE EMPIRICAL COVARIANCE MATRIX 295 | def optPort(cov,mu=None): # optimal portfolio for minimum variance 296 | inv=np.linalg.inv(cov) 297 | ones=np.ones(shape=(inv.shape[0],1)) 298 | if mu is None: 299 | mu=ones 300 | w=np.dot(inv,mu) 301 | w/=np.dot(ones.T,w) 302 | return w 303 | # #--------------------------------------------------- 304 | # nObs,nTrials,bWidth,shrink,minVarPortf=1000,100,.01,False,True 305 | # w1= w1_s=pd.DataFrame(columns=range(cov0.shape[0]), 306 | # index=range(nTrials),dtype=float) 307 | # w1_d=w1.copy(deep=True) 308 | # w1_s_d = w1_s.copy(deep = True) 309 | # np.random.seed(0) 310 | # for i in tqdm(range(nTrials)): 311 | # mu1,cov1=simCovMu(mu0,cov0,nObs,shrink=True) 312 | # if minVarPortf: 313 | # mu1=None 314 | # cov1_d=deNoiseCov(cov1,nObs*1./cov1.shape[1],bWidth) 315 | # w1_s.loc[i]=optPort(cov1,mu1).flatten() 316 | # w1_s_d.loc[i]=optPort(cov1_d,mu1).flatten() 317 | # 318 | # 319 | # for i in tqdm(range(nTrials)): 320 | # mu1,cov1=simCovMu(mu0,cov0,nObs,shrink=False) 321 | # if minVarPortf: 322 | # mu1=None 323 | # cov1_d=deNoiseCov(cov1,nObs*1./cov1.shape[1],bWidth) 324 | # w1.loc[i]=optPort(cov1,mu1).flatten() 325 | # w1_d.loc[i]=optPort(cov1_d,mu1).flatten() 326 | 327 | 328 | # In[18]: 329 | 330 | 331 | # #SNIPPET 2.11 ROOT-MEAN-SQUARE ERRORS 332 | # w0=optPort(cov0,None if minVarPortf else mu0) 333 | # w0=np.repeat(w0.T,w1.shape[0],axis=0) 334 | # rmsd=np.mean((w1-w0).values.flatten()**2)**.5 # RMSE not shrunk not denoised 335 | # rmsd_d=np.mean((w1_d-w0).values.flatten()**2)**.5 # RMSE not shrunk denoised 336 | # rmsd_s=np.mean((w1_s-w0).values.flatten()**2)**.5 # RMSE shrunk not denoised 337 | # rmsd_s_d=np.mean((w1_s_d-w0).values.flatten()**2)**.5 # RMSE shrunk denoised 338 | # 339 | # res_tab = pd.DataFrame(columns = ['Note denoised','Denoised'],index = ['Not shrunk','Shrunk'],data = np.array([[rmsd,rmsd_d],[rmsd_s,rmsd_s_d]])) 340 | # 341 | 342 | # #-------------------------------------------------------- 343 | 344 | alpha, nCols, nFact, q = .995, 1000, 100, 10 345 | def init_para(): 346 | 347 | cov=np.cov(np.random.normal(size=(nCols*q,nCols)),rowvar=False) 348 | cov=alpha*cov+(1-alpha)*getRndCov(nCols,nFact) # noise+signal 349 | corr0=cov2corr(cov) 350 | eVal0,eVec0=getPCA(corr0) 351 | return corr0,eVal0,eVec0 352 | 353 | def denoise_init(method = 1): 354 | corr0, eVal0, eVec0 = init_para() 355 | eMax0,var0=findMaxEval(np.diag(eVal0),q,bWidth=.01) 356 | nFacts0=eVal0.shape[0]-np.diag(eVal0)[::-1].searchsorted(eMax0) 357 | 358 | if method==1: 359 | corr1=denoisedCorr(eVal0,eVec0,nFacts0) 360 | eVal1,eVec1=getPCA(corr1) 361 | else: 362 | corr1=denoisedCorr2(eVal0,eVec0,nFacts0,alpha=.5) 363 | eVal1,eVec1=getPCA(corr1) 364 | return corr0,eVal1,eVec1 365 | 366 | 367 | 368 | 369 | 370 | -------------------------------------------------------------------------------- /3_Distance_Metrics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter 3 Distance Metrics\n", 8 | "\n", 9 | "Look beyond correlations to understand codependency" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "ExecuteTime": { 17 | "end_time": "2020-08-27T21:32:53.539199Z", 18 | "start_time": "2020-08-27T21:32:52.544815Z" 19 | } 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np,pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "from tqdm.notebook import tqdm" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "ExecuteTime": { 33 | "end_time": "2020-08-27T21:33:16.310384Z", 34 | "start_time": "2020-08-27T21:33:16.295300Z" 35 | } 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "****************\n", 43 | "\n", 44 | "hX marginal entropy: 4.074017112668638\n", 45 | "hY marginal entropy: 4.0747093273590576\n", 46 | "iXY mutual info score: 3.5435562540396055\n", 47 | "iXYn normalized mutual information: 0.8697941505990483\n", 48 | "hX_Y cross entropy between x and y : 0.5304608586290325\n", 49 | "hY_X cross entropy between y and x : 0.5311530733194516\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "#SNIPPET 3.1 MARGINAL, JOINT, CONDITIONAL ENTROPIES, AND MUTUAL INFORMATION\n", 55 | "import numpy as np,scipy.stats as ss\n", 56 | "from sklearn.metrics import mutual_info_score\n", 57 | "\n", 58 | "x = np.random.random(100)\n", 59 | "y = np.random.random(100)\n", 60 | "bins = 100\n", 61 | "cXY=np.histogram2d(x,y,bins)[0] # The bi-dimensional histogram of samples x and y. Values in x are histogrammed along the first dimension and values in y are histogrammed along the second dimension.\n", 62 | "hX=ss.entropy(np.histogram(x,bins)[0]) # marginal \n", 63 | "hY=ss.entropy(np.histogram(y,bins)[0]) # marginal \n", 64 | "iXY=mutual_info_score(None,None,contingency=cXY) \n", 65 | "iXYn=iXY/min(hX,hY) # normalized mutual information \n", 66 | "hXY=hX+hY-iXY # joint\n", 67 | "hX_Y=hXY-hY # conditional\n", 68 | "hY_X=hXY-hX # conditional\n", 69 | "\n", 70 | "print('**'*8+'\\n')\n", 71 | "print('hX marginal entropy: {}'.format(hX))\n", 72 | "print('hY marginal entropy: {}'.format(hY))\n", 73 | "print('iXY mutual info score: {}'.format(iXY))\n", 74 | "print('iXYn normalized mutual information: {}'.format(iXYn))\n", 75 | "print('hX_Y cross entropy between x and y : {}'.format(hX_Y))\n", 76 | "print('hY_X cross entropy between y and x : {}'.format(hY_X))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": { 83 | "ExecuteTime": { 84 | "end_time": "2020-08-27T21:33:19.716512Z", 85 | "start_time": "2020-08-27T21:33:19.703179Z" 86 | } 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "#SNIPPET 3.2 MUTUAL INFORMATION, VARIATION OF INFORMATION, AND NORMALIZED VARIATION OF INFORMATION\n", 91 | "def varInfo(x,y,bins,norm=False):\n", 92 | " # variation of information\n", 93 | " cXY=np.histogram2d(x,y,bins)[0] \n", 94 | " iXY=mutual_info_score(None,None,contingency=cXY) \n", 95 | " hX=ss.entropy(np.histogram(x,bins)[0]) # marginal \n", 96 | " hY=ss.entropy(np.histogram(y,bins)[0]) # marginal \n", 97 | " vXY=hX+hY-2*iXY # variation of information\n", 98 | " if norm:\n", 99 | " hXY=hX+hY-iXY # joint\n", 100 | " vXY/=hXY # normalized variation of information\n", 101 | " return vXY" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 6, 107 | "metadata": { 108 | "ExecuteTime": { 109 | "end_time": "2020-08-27T21:33:20.703804Z", 110 | "start_time": "2020-08-27T21:33:20.689116Z" 111 | } 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "#SNIPPET 3.3 VARIATION OF INFORMATION ON DISCRETIZED CONTINUOUS RANDOM VARIABLES\n", 116 | "def numBins(nObs,corr=None):\n", 117 | "# Optimal number of bins for discretization \n", 118 | " if corr is None: # univariate case\n", 119 | " z=(8+324*nObs+12*(36*nObs+729*nObs**2)**.5)**(1/3.)\n", 120 | " b=round(z/6.+2./(3*z)+1./3) \n", 121 | " else: # bivariate case\n", 122 | " if (1.-corr**2)==0:\n", 123 | " corr = np.sign(corr)*(np.abs(corr)-1e-5) \n", 124 | " b=round(2**-.5*(1+(1+24*nObs/(1.-corr**2))**.5)**.5) \n", 125 | " return int(b)\n", 126 | "#--------------------------------------------------- \n", 127 | "def varInfo_optBIn(x,y,norm=False): # Discretized and with optimal bin value\n", 128 | " # variation of information\n", 129 | " bXY=numBins(x.shape[0],corr=np.corrcoef(x,y)[0,1]) \n", 130 | " cXY=np.histogram2d(x,y,bXY)[0] \n", 131 | " iXY=mutual_info_score(None,None,contingency=cXY) \n", 132 | " hX=ss.entropy(np.histogram(x,bXY)[0]) # marginal \n", 133 | " hY=ss.entropy(np.histogram(y,bXY)[0]) # marginal \n", 134 | " vXY=hX+hY-2*iXY # variation of information\n", 135 | " if norm:\n", 136 | " hXY=hX+hY-iXY # joint\n", 137 | " vXY/=hXY # normalized variation of information\n", 138 | " return vXY" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": { 145 | "ExecuteTime": { 146 | "end_time": "2020-08-27T21:33:21.627253Z", 147 | "start_time": "2020-08-27T21:33:21.586237Z" 148 | } 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "#SNIPPET 3.4 CORRELATION AND NORMALIZED MUTUAL INFORMATION OF TWO INDEPENDENT GAUSSIAN RANDOM VARIABLES\n", 153 | "def mutualInfo(x,y,norm=False):\n", 154 | " # mutual information\n", 155 | " bXY=numBins(x.shape[0],corr=np.corrcoef(x,y)[0,1]) \n", 156 | " cXY=np.histogram2d(x,y,bXY)[0] \n", 157 | " iXY=mutual_info_score(None,None,contingency=cXY) \n", 158 | " if norm:\n", 159 | " hX=ss.entropy(np.histogram(x,bXY)[0]) # marginal \n", 160 | " hY=ss.entropy(np.histogram(y,bXY)[0]) # marginal \n", 161 | " iXY/=min(hX,hY) # normalized mutual information\n", 162 | " return iXY \n", 163 | "#--------------------------------------------------- \n", 164 | "size,seed=5000,0\n", 165 | "np.random.seed(seed)\n", 166 | "x=np.random.normal(size=size)\n", 167 | "e=np.random.normal(size=size)\n", 168 | "y=0*x+e\n", 169 | "nmi=mutualInfo(x,y,True)\n", 170 | "corr=np.corrcoef(x,y)[0,1]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 18, 176 | "metadata": { 177 | "ExecuteTime": { 178 | "end_time": "2020-08-27T21:43:19.429289Z", 179 | "start_time": "2020-08-27T21:42:13.142102Z" 180 | } 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "application/vnd.jupyter.widget-view+json": { 186 | "model_id": "24994fcfd4b748ce8822bbec1a47195d", 187 | "version_major": 2, 188 | "version_minor": 0 189 | }, 190 | "text/plain": [ 191 | "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" 192 | ] 193 | }, 194 | "metadata": {}, 195 | "output_type": "display_data" 196 | }, 197 | { 198 | "ename": "KeyboardInterrupt", 199 | "evalue": "", 200 | "output_type": "error", 201 | "traceback": [ 202 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 203 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 204 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mhX_Y\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhXY\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mhY\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m# conditional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mviXY\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvarInfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mvi_t_XY\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvarInfo_optBIn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 205 | "\u001b[0;32m\u001b[0m in \u001b[0;36mvarInfo_optBIn\u001b[0;34m(x, y, norm)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# variation of information\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mbXY\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnumBins\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcorr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorrcoef\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mcXY\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistogram2d\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mbXY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0miXY\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutual_info_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcontingency\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcXY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mhX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mentropy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mbXY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# marginal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 206 | "\u001b[0;32m<__array_function__ internals>\u001b[0m in \u001b[0;36mhistogram2d\u001b[0;34m(*args, **kwargs)\u001b[0m\n", 207 | "\u001b[0;32m/anaconda3/lib/python3.7/site-packages/numpy/lib/twodim_base.py\u001b[0m in \u001b[0;36mhistogram2d\u001b[0;34m(x, y, bins, range, normed, weights, density)\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mxedges\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0myedges\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 712\u001b[0m \u001b[0mbins\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mxedges\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0myedges\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 713\u001b[0;31m \u001b[0mhist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medges\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhistogramdd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormed\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweights\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdensity\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 714\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mhist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medges\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medges\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 208 | "\u001b[0;32m<__array_function__ internals>\u001b[0m in \u001b[0;36mhistogramdd\u001b[0;34m(*args, **kwargs)\u001b[0m\n", 209 | "\u001b[0;32m/anaconda3/lib/python3.7/site-packages/numpy/lib/histograms.py\u001b[0m in \u001b[0;36mhistogramdd\u001b[0;34m(sample, bins, range, normed, weights, density)\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1099\u001b[0m \u001b[0;31m# This preserves the (bad) behavior observed in gh-7845, for now.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1100\u001b[0;31m \u001b[0mhist\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcasting\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'safe'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1101\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1102\u001b[0m \u001b[0;31m# Remove outliers (indices 0 and -1 for each dimension).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 210 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "# Exercise 3.13.1\n", 216 | "\n", 217 | "\n", 218 | "bins = 10\n", 219 | "\n", 220 | "rho_list = [-1, -0.5, 0, 0.5, 1]\n", 221 | "hX = hY = hXY = hX_Y = iXY = viXY = vi_t_XY = np.zeros(len(rho_list))\n", 222 | "for i in tqdm(range(len(rho_list))):\n", 223 | " rho = rho_list[i]\n", 224 | " mu, sigma = 0, 1\n", 225 | " rr = np.random.normal(mu, sigma, size=(2, 1000))\n", 226 | " x,y_ = rr[0,:],rr[1,:]\n", 227 | " y = rho * x+np.sqrt(1-rho**2)*y_\n", 228 | "\n", 229 | " # The bi-dimensional histogram of samples x and y. Values in x are histogrammed along the first dimension and values in y are histogrammed along the second dimension.\n", 230 | " cXY = np.histogram2d(x, y, bins)[0]\n", 231 | " hX[i] = ss.entropy(np.histogram(x, bins)[0]) # marginal\n", 232 | " hY[i] = ss.entropy(np.histogram(y, bins)[0]) # marginal\n", 233 | " iXY[i] = mutual_info_score(None, None, contingency=cXY)\n", 234 | " hXY[i] = hX[i]+hY[i]-iXY[i] # joint\n", 235 | " hX_Y[i] = hXY[i]-hY[i] # conditional\n", 236 | " viXY[i] = varInfo(x, y, bins)\n", 237 | " vi_t_XY[i] = varInfo_optBIn(x, y)\n", 238 | "\n", 239 | "fig = plt.figure()\n", 240 | "ax = fig.add_subplot(111)\n", 241 | "ax.plot(rho_list, hX, label='H[X]')\n", 242 | "ax.plot(rho_list, hY, label='H[Y]')\n", 243 | "# ax.plot(rho_list, hXY, label='H[X,Y]')\n", 244 | "# ax.plot(rho_list, hX_Y, label='H[X|Y]')\n", 245 | "# ax.plot(rho_list, iXY, label='I[X,Y]')\n", 246 | "# ax.plot(rho_list, viXY, label='VI[X,Y]')\n", 247 | "# ax.plot(rho_list, vi_t_XY, label=r'$\\tilde{VI}$[X,Y]')\n", 248 | "ax.legend()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 17, 254 | "metadata": { 255 | "ExecuteTime": { 256 | "end_time": "2020-08-27T21:42:04.009916Z", 257 | "start_time": "2020-08-27T21:42:03.994430Z" 258 | } 259 | }, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "array([[1., 0., 0., ..., 0., 0., 0.],\n", 265 | " [0., 1., 0., ..., 0., 0., 0.],\n", 266 | " [0., 0., 0., ..., 0., 0., 0.],\n", 267 | " ...,\n", 268 | " [0., 0., 0., ..., 0., 0., 0.],\n", 269 | " [0., 0., 0., ..., 0., 0., 0.],\n", 270 | " [0., 0., 0., ..., 0., 0., 1.]])" 271 | ] 272 | }, 273 | "execution_count": 17, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "cXY = np.histogram2d(x, y, bins)[0]\n", 280 | "cXY" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 20, 286 | "metadata": { 287 | "ExecuteTime": { 288 | "end_time": "2020-08-27T21:30:50.493228Z", 289 | "start_time": "2020-08-27T21:30:50.479832Z" 290 | } 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "(1000,)" 297 | ] 298 | }, 299 | "execution_count": 20, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "rr = np.random.normal(mu, sigma, size=(2, 1000))\n", 306 | "rr[0,:].shape" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [] 315 | } 316 | ], 317 | "metadata": { 318 | "kernelspec": { 319 | "display_name": "Python 3", 320 | "language": "python", 321 | "name": "python3" 322 | }, 323 | "language_info": { 324 | "codemirror_mode": { 325 | "name": "ipython", 326 | "version": 3 327 | }, 328 | "file_extension": ".py", 329 | "mimetype": "text/x-python", 330 | "name": "python", 331 | "nbconvert_exporter": "python", 332 | "pygments_lexer": "ipython3", 333 | "version": "3.7.1" 334 | }, 335 | "latex_envs": { 336 | "LaTeX_envs_menu_present": true, 337 | "autoclose": false, 338 | "autocomplete": true, 339 | "bibliofile": "biblio.bib", 340 | "cite_by": "apalike", 341 | "current_citInitial": 1, 342 | "eqLabelWithNumbers": true, 343 | "eqNumInitial": 1, 344 | "hotkeys": { 345 | "equation": "Ctrl-E", 346 | "itemize": "Ctrl-I" 347 | }, 348 | "labels_anchors": false, 349 | "latex_user_defs": false, 350 | "report_style_numbering": false, 351 | "user_envs_cfg": false 352 | }, 353 | "toc": { 354 | "base_numbering": 1, 355 | "nav_menu": {}, 356 | "number_sections": true, 357 | "sideBar": true, 358 | "skip_h1_title": false, 359 | "title_cell": "Table of Contents", 360 | "title_sidebar": "Contents", 361 | "toc_cell": false, 362 | "toc_position": {}, 363 | "toc_section_display": true, 364 | "toc_window_display": false 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 2 369 | } 370 | -------------------------------------------------------------------------------- /5_financial_lables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-09-10T16:09:55.457567Z", 9 | "start_time": "2020-09-10T16:09:55.452187Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "#SNIPPET 5.1 T-VALUE OF A LINEAR TREND \n", 15 | "import statsmodels.api as sm1\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "#--------------------------------------------------- \n", 19 | "def tValLinR(close):\n", 20 | " # tValue of the beta coefficient of the trend on time index from a linear trend (it shows the significance of )\n", 21 | " x=np.ones((close.shape[0],2)) \n", 22 | " x[:,1]=np.arange(close.shape[0]) \n", 23 | " ols=sm1.OLS(close,x).fit()\n", 24 | " return ols.tvalues[1]" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 7, 30 | "metadata": { 31 | "ExecuteTime": { 32 | "end_time": "2020-09-10T16:09:02.604212Z", 33 | "start_time": "2020-09-10T16:09:02.595360Z" 34 | } 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "#SNIPPET 5.2 IMPLEMENTATION OF THE TREND-SCANNING METHOD\n", 39 | "def getBinsFromTrend(molecule,close,span):\n", 40 | " '''\n", 41 | " Derive labels from the sign of t-value of linear trend\n", 42 | " Output includes:\n", 43 | " - t1: End time for the identified trend\n", 44 | " - tVal: t-value associated with the estimated trend coefficient \n", 45 | " - bin: Sign of the trend\n", 46 | " ''' \n", 47 | " out=pd.DataFrame(index=molecule,columns=['t1','tVal','bin']) \n", 48 | " hrzns=range(*span)\n", 49 | " for dt0 in molecule:\n", 50 | " df0=pd.Series() \n", 51 | " iloc0=close.index.get_loc(dt0)\n", 52 | " if iloc0+max(hrzns)>close.shape[0]:continue\n", 53 | " for hrzn in hrzns: \n", 54 | " dt1=close.index[iloc0+hrzn-1] \n", 55 | " df1=close.loc[dt0:dt1] \n", 56 | " df0.loc[dt1]=tValLinR(df1.values)\n", 57 | " dt1=df0.replace([-np.inf,np.inf,np.nan],0).abs().idxmax() \n", 58 | " out.loc[dt0,['t1','tVal','bin']]=df0.index[-1],df0[dt1],np.sign(df0[dt1]) # prevent leakage\n", 59 | " out['t1']=pd.to_datetime(out['t1']) \n", 60 | " out['bin']=pd.to_numeric(out['bin'],downcast='signed') \n", 61 | " return out.dropna(subset=['bin'])" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 25, 67 | "metadata": { 68 | "ExecuteTime": { 69 | "end_time": "2020-09-10T16:19:36.703676Z", 70 | "start_time": "2020-09-10T16:19:35.856896Z" 71 | } 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "" 78 | ] 79 | }, 80 | "execution_count": 25, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | }, 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "Text(0.5, 1.0, 'Simu without Sine trend')" 88 | ] 89 | }, 90 | "execution_count": 25, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | }, 94 | { 95 | "data": { 96 | "image/png": "\n", 97 | "text/plain": [ 98 | "
" 99 | ] 100 | }, 101 | "metadata": { 102 | "needs_background": "light" 103 | }, 104 | "output_type": "display_data" 105 | } 106 | ], 107 | "source": [ 108 | "#SNIPPET 5.3 TESTING THE TREND-SCANNING LABELING ALGORITHM\n", 109 | "import matplotlib.pyplot as plt\n", 110 | "df0=pd.Series(np.random.normal(0,.1,100)).cumsum() \n", 111 | "df1=getBinsFromTrend(df0.index,df0,[3,10,1])\n", 112 | "fig = plt.figure()\n", 113 | "ax = fig.add_subplot(111)\n", 114 | "ax.scatter(df1.index,df0.loc[df1.index].values,c=df1['bin'].values, cmap='viridis')\n", 115 | "ax.set_title('Simu without Sine trend')\n" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 27, 121 | "metadata": { 122 | "ExecuteTime": { 123 | "end_time": "2020-09-10T16:19:47.720841Z", 124 | "start_time": "2020-09-10T16:19:46.878522Z" 125 | } 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "" 132 | ] 133 | }, 134 | "execution_count": 27, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | }, 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "Text(0.5, 1.0, 'Simu with Sine trend')" 142 | ] 143 | }, 144 | "execution_count": 27, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | }, 148 | { 149 | "data": { 150 | "image/png": "\n", 151 | "text/plain": [ 152 | "
" 153 | ] 154 | }, 155 | "metadata": { 156 | "needs_background": "light" 157 | }, 158 | "output_type": "display_data" 159 | } 160 | ], 161 | "source": [ 162 | "df0=pd.Series(np.random.normal(0,.1,100)).cumsum() \n", 163 | "df0+=np.sin(np.linspace(0,10,df0.shape[0])) \n", 164 | "df1=getBinsFromTrend(df0.index,df0,[3,10,1]) \n", 165 | "fig = plt.figure()\n", 166 | "ax = fig.add_subplot(111)\n", 167 | "ax.scatter(df1.index,df0.loc[df1.index].values,c=df1['tVal'].values, cmap='viridis')\n", 168 | "ax.set_title('Simu with Sine trend')\n", 169 | "# mpl.savefig('fig 5.1.png');\n", 170 | "# mpl.clf();\n", 171 | "# mpl.close() \n", 172 | "#mpl.scatter(df1.index,df0.loc[df1.index].values,cmap='viridis')" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.7.1" 193 | }, 194 | "latex_envs": { 195 | "LaTeX_envs_menu_present": true, 196 | "autoclose": false, 197 | "autocomplete": true, 198 | "bibliofile": "biblio.bib", 199 | "cite_by": "apalike", 200 | "current_citInitial": 1, 201 | "eqLabelWithNumbers": true, 202 | "eqNumInitial": 1, 203 | "hotkeys": { 204 | "equation": "Ctrl-E", 205 | "itemize": "Ctrl-I" 206 | }, 207 | "labels_anchors": false, 208 | "latex_user_defs": false, 209 | "report_style_numbering": false, 210 | "user_envs_cfg": false 211 | }, 212 | "toc": { 213 | "base_numbering": 1, 214 | "nav_menu": {}, 215 | "number_sections": true, 216 | "sideBar": true, 217 | "skip_h1_title": false, 218 | "title_cell": "Table of Contents", 219 | "title_sidebar": "Contents", 220 | "toc_cell": false, 221 | "toc_position": {}, 222 | "toc_section_display": true, 223 | "toc_window_display": false 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 2 228 | } 229 | -------------------------------------------------------------------------------- /4_Optimal_Clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "distinguish several types of clustering algorithms, including the following:\n", 8 | "\n", 9 | "1 Connectivity: This clustering is based on distance connectivity, like hier- archical clustering. For an example in finance, see López de Prado (2016).\n", 10 | "\n", 11 | "2 Centroids: These algorithms perform a vector quantization, like k-means. For an example in finance, see López de Prado and Lewis (2018).\n", 12 | "\n", 13 | "3 Distribution: Clusters are formed using statistical distributions, e.g., a mixture of Gaussians.\n", 14 | "\n", 15 | "4 Density: These algorithms search for connected dense regions in the data space. Examples include DBSCAN and OPTICS.\n", 16 | "\n", 17 | "5 Subspace: Clusters are modeled on two dimensions, features and observa- tions. An example is biclustering (also known as coclustering). For instance, they can help identify similarities in subsets of instruments and time periods simultaneously.\n", 18 | "\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 5, 24 | "metadata": { 25 | "ExecuteTime": { 26 | "end_time": "2020-09-09T15:12:02.475159Z", 27 | "start_time": "2020-09-09T15:12:02.468896Z" 28 | } 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import os\n", 33 | "import sys\n", 34 | "nb_path = os.path.split(os.getcwd())[0]\n", 35 | "if nb_path not in sys.path:\n", 36 | " sys.path.append(nb_path)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 6, 42 | "metadata": { 43 | "ExecuteTime": { 44 | "end_time": "2020-09-09T15:15:56.924157Z", 45 | "start_time": "2020-09-09T15:15:55.283405Z" 46 | } 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stderr", 51 | "output_type": "stream", 52 | "text": [ 53 | "/anaconda3/lib/python3.7/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.neighbors.kde module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.\n", 54 | " warnings.warn(message, FutureWarning)\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "import CovMatrix\n", 60 | "corr0,eVal0,eVec0 = CovMatrix.init_para()\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 88, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2020-09-10T10:35:40.740566Z", 69 | "start_time": "2020-09-10T10:35:40.727381Z" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "#SNIPPET 4.1 BASE CLUSTERING\n", 75 | "import numpy as np,pandas as pd\n", 76 | "from sklearn.cluster import KMeans\n", 77 | "from sklearn.metrics import silhouette_samples \n", 78 | "#--------------------------------------------------- \n", 79 | "def clusterKMeansBase(corr0,maxNumClusters=10,n_init=10):\n", 80 | " x,silh=((1-corr0.fillna(0))/2.)**.5,pd.Series()# observations matrix \n", 81 | " for init in range(n_init):\n", 82 | " for i in range(2,maxNumClusters+1): \n", 83 | " kmeans_=KMeans(n_clusters=i,n_jobs=1,n_init=1) \n", 84 | " kmeans_=kmeans_.fit(x) \n", 85 | " silh_=silhouette_samples(x,kmeans_.labels_) \n", 86 | " stat=(silh_.mean()/silh_.std(),silh.mean()/silh.std())\n", 87 | " if np.isnan(stat[1]) or stat[0]>stat[1]: \n", 88 | " silh,kmeans=silh_,kmeans_\n", 89 | " newIdx=np.argsort(kmeans.labels_) \n", 90 | " corr1=corr0.iloc[newIdx] # reorder rows\n", 91 | "\n", 92 | " corr1=corr1.iloc[:,newIdx] # reorder columns \n", 93 | " clstrs={i:corr0.columns[np.where(kmeans.labels_==i)[0]].tolist() \\\n", 94 | " for i in np.unique(kmeans.labels_) } # cluster members \n", 95 | " silh=pd.Series(silh,index=x.index)\n", 96 | " return corr1,clstrs,silh" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 89, 102 | "metadata": { 103 | "ExecuteTime": { 104 | "end_time": "2020-09-10T10:36:10.396076Z", 105 | "start_time": "2020-09-10T10:35:56.398771Z" 106 | } 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "corr1,clstrs,silh = clusterKMeansBase(pd.DataFrame(corr0))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 90, 116 | "metadata": { 117 | "ExecuteTime": { 118 | "end_time": "2020-09-10T10:36:15.285575Z", 119 | "start_time": "2020-09-10T10:36:15.271596Z" 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "#a new (reduced) observations matrix out of the elements that compose the K1 clusters, and rerun the base clustering algorithm on that reduced correlation matrix. Doing so will return a, possibly new, clustering for those elements in K1. To check its efficacy, we compare the average cluster quality before and after reclustering those elements in K1. If the average cluster quality improves, we return the accepted clustering from the base clustering concate- nated with the new clustering for the redone nodes.\n", 125 | "\n", 126 | "#SNIPPET 4.2 TOP-LEVEL OF CLUSTERING\n", 127 | "from sklearn.metrics import silhouette_samples \n", 128 | "#--------------------------------------------------- \n", 129 | "def makeNewOutputs(corr0,clstrs,clstrs2):\n", 130 | " clstrsNew={}\n", 131 | " for i in clstrs.keys():\n", 132 | " clstrsNew[len(clstrsNew.keys())]=list(clstrs[i]) \n", 133 | " for i in clstrs2.keys():\n", 134 | " clstrsNew[len(clstrsNew.keys())]=list(clstrs2[i]) \n", 135 | " newIdx=[j for i in clstrsNew for j in clstrsNew[i]] \n", 136 | " corrNew=corr0.loc[newIdx,newIdx] \n", 137 | " x=((1-corr0.fillna(0))/2.)**.5 \n", 138 | " kmeans_labels=np.zeros(len(x.columns))\n", 139 | " for i in clstrsNew.keys(): \n", 140 | " idxs=[x.index.get_loc(k) for k in clstrsNew[i]] \n", 141 | " kmeans_labels[idxs]=i\n", 142 | " silhNew=pd.Series(silhouette_samples(x,kmeans_labels),index=x.index)\n", 143 | " return corrNew,clstrsNew,silhNew \n", 144 | "\n", 145 | "#--------------------------------------------------- \n", 146 | "def clusterKMeansTop(corr0,maxNumClusters=None,n_init=3): # for real stock data, n_init start from 10 ; for simulated data, n_init=3 would speed up the process\n", 147 | " if maxNumClusters==None:\n", 148 | " maxNumClusters=corr0.shape[1]-1 \n", 149 | " corr1,clstrs,silh=clusterKMeansBase(corr0,maxNumClusters= \\\n", 150 | " min(maxNumClusters,corr0.shape[1]-1),n_init=n_init) \n", 151 | " clusterTstats={i:np.mean(silh[clstrs[i]])/ \\\n", 152 | " np.std(silh[clstrs[i]]) for i in clstrs.keys()}\n", 153 | " tStatMean=sum(clusterTstats.values())/len(clusterTstats) \n", 154 | " redoClusters=[i for i in clusterTstats.keys() if \\\n", 155 | " clusterTstats[i]" 249 | ] 250 | }, 251 | "execution_count": 84, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | }, 255 | { 256 | "data": { 257 | "image/png": "\n", 258 | "text/plain": [ 259 | "
" 260 | ] 261 | }, 262 | "metadata": { 263 | "needs_background": "light" 264 | }, 265 | "output_type": "display_data" 266 | } 267 | ], 268 | "source": [ 269 | "# simulated a cov matraix with blocks\n", 270 | "import seaborn as sns\n", 271 | "corr_blk_simu = randomBlockCorr(100,10)\n", 272 | "sns.heatmap(corr_blk_simu)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 92, 278 | "metadata": { 279 | "ExecuteTime": { 280 | "end_time": "2020-09-10T10:36:45.718412Z", 281 | "start_time": "2020-09-10T10:36:39.518374Z" 282 | } 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "" 289 | ] 290 | }, 291 | "execution_count": 92, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | }, 295 | { 296 | "data": { 297 | "image/png": "\n", 298 | "text/plain": [ 299 | "
" 300 | ] 301 | }, 302 | "metadata": { 303 | "needs_background": "light" 304 | }, 305 | "output_type": "display_data" 306 | } 307 | ], 308 | "source": [ 309 | "# clustered using clusterKMeansTop\n", 310 | "sns.heatmap(clusterKMeansTop(corr_blk_simu)[0])" 311 | ] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 3", 317 | "language": "python", 318 | "name": "python3" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 3 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython3", 330 | "version": "3.7.1" 331 | }, 332 | "latex_envs": { 333 | "LaTeX_envs_menu_present": true, 334 | "autoclose": false, 335 | "autocomplete": true, 336 | "bibliofile": "biblio.bib", 337 | "cite_by": "apalike", 338 | "current_citInitial": 1, 339 | "eqLabelWithNumbers": true, 340 | "eqNumInitial": 1, 341 | "hotkeys": { 342 | "equation": "Ctrl-E", 343 | "itemize": "Ctrl-I" 344 | }, 345 | "labels_anchors": false, 346 | "latex_user_defs": false, 347 | "report_style_numbering": false, 348 | "user_envs_cfg": false 349 | }, 350 | "toc": { 351 | "base_numbering": 1, 352 | "nav_menu": {}, 353 | "number_sections": true, 354 | "sideBar": true, 355 | "skip_h1_title": false, 356 | "title_cell": "Table of Contents", 357 | "title_sidebar": "Contents", 358 | "toc_cell": false, 359 | "toc_position": {}, 360 | "toc_section_display": true, 361 | "toc_window_display": false 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 2 366 | } 367 | --------------------------------------------------------------------------------