├── .gitignore ├── LICENSE ├── Machine_Learning_for_Asset_Managers ├── __init__.py ├── ch2_fitKDE_find_best_bandwidth.py ├── ch2_marcenko_pastur_pdf.py ├── ch2_monte_carlo_experiment.py ├── ch3_metrics.py ├── ch4_optimal_clustering.py ├── ch5_financial_labels.py ├── ch6_feature_importance_analysis.py ├── ch7_portfolio_construction.py └── ch8_testing_set_overfitting.py ├── README.md ├── __init__.py ├── img ├── fig_2_3_mp_with_signal.png ├── fig_3_1_abs_squared_angular_distance.png ├── fig_3_1_angular_distance.png ├── fig_4_1_random_block_correlation_matrix.png ├── fig_4_1_random_block_correlation_matrix_mini.png ├── fig_4_1_random_block_correlation_matrix_onc.png ├── fig_4_1_random_block_correlation_matrix_onc_mini.png ├── fig_5_1_trend_scanning.png ├── fig_5_2_trend_scanning_t_values.png ├── fig_5_2_trend_scanning_t_values2.png ├── fig_5_3_distribution_t_values.png ├── fig_5_3_distribution_t_values_2.png ├── fig_6_1_p_values_explanatory_vars.png ├── fig_6_2_mdi_example.png ├── fig_6_3_mda_example.png ├── fig_6_4_feature_clustering.png ├── fig_6_5_clustered_MDI.png ├── fig_6_6_clustered_MDA.png ├── fig_7_1_block_diagonal.png ├── fig_7_2_block_diagonal.png ├── figure_2_3_eigenvalue_method.png ├── figure_2_3_eigenvalue_method_with_denoise.png ├── figure_2_3_eigenvalue_method_with_denoise_zoomed.png ├── gaussian_mp.png ├── gaussian_mp_excersize_2_7.png └── maxSR_across_uniform_strategies_8_1.png ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /venv/ 3 | /ml_for_am/ 4 | __pycache__ 5 | /Machine_Learning_for_Asset_Managers.egg-info 6 | /build 7 | /dist 8 | /.vs -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/Machine_Learning_for_Asset_Managers/__init__.py -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch2_fitKDE_find_best_bandwidth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from sklearn.neighbors import KernelDensity 4 | from sklearn.model_selection import learning_curve,GridSearchCV 5 | from sklearn.model_selection import LeaveOneOut 6 | 7 | ''' 8 | Selecting the bandwidth via cross-validation 9 | The choice of bandwidth within KDE is extremely important to finding a suitable density estimate, 10 | and is the knob that controls the bias–variance trade-off in the estimate of density: 11 | too narrow a bandwidth leads to a high-variance estimate (i.e., over-fitting), 12 | where the presence or absence of a single point makes a large difference. 13 | Too wide a bandwidth leads to a high-bias estimate (i.e., under-fitting) 14 | where the structure in the data is washed out by the wide kernel. 15 | 16 | There is a long history in statistics of methods to quickly estimate the best bandwidth 17 | based on rather stringent assumptions about the data: if you look up the KDE implementations 18 | in the SciPy and StatsModels packages, for example, you will see implementations 19 | based on some of these rules. 20 | 21 | In machine learning contexts, we've seen that such hyperparameter tuning often is 22 | done empirically via a cross-validation approach. With this in mind, the KernelDensity estimator 23 | in Scikit-Learn is designed such that it can be used directly within the Scikit-Learn's standard 24 | grid search tools. Here we will use GridSearchCV to optimize the 25 | bandwidth for the preceding dataset. 
Because we are looking at such a small dataset, 26 | we will use leave-one-out cross-validation, which minimizes the reduction in 27 | training set size for each cross-validation trial: 28 | 29 | https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html 30 | ''' 31 | 32 | ''' This is also exercise 2.7 in the book: 33 | "Extend function fitKDE in code snippet 2.2, so that it estimates through 34 | cross-validation the optimal value of bWidth (bandwidth)" 35 | ''' 36 | 37 | def findOptimalBWidth(eigenvalues): 38 | bandwidths = 10 ** np.linspace(-2, 1, 100) 39 | grid = GridSearchCV(KernelDensity(kernel='gaussian'), 40 | {'bandwidth': bandwidths}, 41 | cv=LeaveOneOut()) 42 | grid.fit(eigenvalues[:, None]) 43 | 44 | #Now we can find the choice of bandwidth which maximizes the score (which in this case defaults to the log-likelihood): 45 | 46 | return grid.best_params_ -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch2_marcenko_pastur_pdf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.neighbors import KernelDensity 5 | import matplotlib.pylab as plt 6 | from scipy.optimize import minimize 7 | from scipy.linalg import block_diag 8 | from sklearn.covariance import LedoitWolf 9 | 10 | #snippet 2.1 11 | #Marcenko-Pastur pdf 12 | #q=T/N 13 | def mpPDF(var, q, pts): 14 | eMin, eMax = var*(1-(1./q)**.5)**2, var*(1+(1./q)**.5)**2 # calc lambda_minus, lambda_plus 15 | eVal = np.linspace(eMin, eMax, pts) #Return evenly spaced numbers over a specified interval. eVal='lambda' 16 | #Note: 1.0/2*2 = 1.0 not 0.25=1.0/(2*2) 17 | pdf = q/(2*np.pi*var*eVal)*((eMax-eVal)*(eVal-eMin))**.5 #np.allclose(np.flip((eMax-eVal)), (eVal-eMin))==True 18 | pdf = pd.Series(pdf, index=eVal) 19 | return pdf 20 | 21 | #snippet 2.2 22 | #Test Marcenko-Pastur Thm 23 | def getPCA(matrix): 24 | # Get eVal, eVec from a Hermitian matrix 25 | eVal, eVec = np.linalg.eig(matrix) #complex Hermitian (conjugate symmetric) or a real symmetric matrix.
26 | indices = eVal.argsort()[::-1] #arguments for sorting eval desc 27 | eVal,eVec = eVal[indices],eVec[:,indices] 28 | eVal = np.diagflat(eVal) # diagonal matrix with eigenvalues on the diagonal 29 | return eVal,eVec 30 | 31 | def fitKDE(obs, bWidth=.15, kernel='gaussian', x=None): 32 | #Fit kernel to a series of obs, and derive the prob of obs 33 | # x is the array of values on which the fit KDE will be evaluated 34 | #print(len(obs.shape) == 1) 35 | if len(obs.shape) == 1: obs = obs.reshape(-1,1) 36 | kde = KernelDensity(kernel = kernel, bandwidth = bWidth).fit(obs) 37 | #print(x is None) 38 | if x is None: x = np.unique(obs).reshape(-1,1) 39 | #print(len(x.shape)) 40 | if len(x.shape) == 1: x = x.reshape(-1,1) 41 | logProb = kde.score_samples(x) # log(density) 42 | pdf = pd.Series(np.exp(logProb), index=x.flatten()) 43 | return pdf 44 | 45 | #snippet 2.3 46 | def getRndCov(nCols, nFacts): #nFacts - contains signal out of nCols 47 | w = np.random.normal(size=(nCols, nFacts)) 48 | cov = np.dot(w, w.T) #random cov matrix, however not full rank 49 | cov += np.diag(np.random.uniform(size=nCols)) #full rank cov 50 | return cov 51 | 52 | def cov2corr(cov): 53 | # Derive the correlation matrix from a covariance matrix 54 | std = np.sqrt(np.diag(cov)) 55 | corr = cov/np.outer(std,std) 56 | corr[corr<-1], corr[corr>1] = -1,1 #for numerical errors 57 | return corr 58 | 59 | def corr2cov(corr, std): 60 | cov = corr * np.outer(std, std) 61 | return cov 62 | 63 | #snippet 2.4 - fitting the marcenko-pastur pdf - find variance 64 | #Fit error 65 | def errPDFs(var, eVal, q, bWidth, pts=1000): 66 | var = var[0] 67 | pdf0 = mpPDF(var, q, pts) #theoretical pdf 68 | pdf1 = fitKDE(eVal, bWidth, x=pdf0.index.values) #empirical pdf 69 | sse = np.sum((pdf1-pdf0)**2) 70 | print("sse:"+str(sse)) 71 | return sse 72 | 73 | # find max random eVal by fitting Marcenko's dist 74 | # and return variance 75 | def findMaxEval(eVal, q, bWidth): 76 | out = minimize(lambda *x: errPDFs(*x), x0=np.array(0.5), args=(eVal, q, bWidth), bounds=((1E-5, 1-1E-5),)) 77 | print("found errPDFs"+str(out['x'][0])) 78 | if out['success']: var = out['x'][0] 79 | else: var=1 80 | eMax = var*(1+(1./q)**.5)**2 81 | return eMax, var 82 | 83 | # code snippet 2.5 - denoising by constant residual eigenvalue 84 | # Remove noise from corr by fixing the random eigenvalues 85 | # The operation is invariant to trace(Correlation) 86 | # The Trace of a square matrix is the _Sum_ of its eigenvalues 87 | # The Determinant of the matrix is the _Product_ of its eigenvalues 88 | def denoisedCorr(eVal, eVec, nFacts): 89 | eVal_ = np.diag(eVal).copy() 90 | eVal_[nFacts:] = eVal_[nFacts:].sum()/float(eVal_.shape[0] - nFacts) #eigenvalues nFacts..N are replaced by their average: (1/(N-nFacts))*sum(eVal_[nFacts..N]) 91 | eVal_ = np.diag(eVal_) #square matrix with eigenvalues as diagonal: eVal_.I 92 | corr1 = np.dot(eVec, eVal_).dot(eVec.T) #Eigendecomposition of a symmetric matrix: S = QΛQ^T 93 | corr1 = cov2corr(corr1) # Rescaling the correlation matrix to have 1s on the main diagonal 94 | return corr1 95 | 96 | # code snippet 2.6 - detoning 97 | # ref: mlfinlab/portfolio_optimization/risk_estimators.py 98 | # This method assumes a sorted set of eigenvalues and eigenvectors. 99 | # The market component is the first eigenvector, with the highest eigenvalue. 100 | # It returns a singular correlation matrix: 101 | # "the detoned correlation matrix is singular, as a result of eliminating (at least) one eigenvector."
102 | # Page 32 103 | def detoned_corr(corr, eigenvalues, eigenvectors, market_component=1): 104 | """ 105 | De-tones the de-noised correlation matrix by removing the market component. 106 | The inputs are the correlation matrix to detone, its eigenvalues and eigenvectors 107 | (sorted in descending order), and the number of leading eigenvectors that are 108 | treated as the market component. 109 | :param corr: (np.array) Correlation matrix to detone. 110 | :param eigenvalues: (np.array) Matrix with eigenvalues on the main diagonal. 111 | :param eigenvectors: (np.array) Eigenvectors array. 112 | :param market_component: (int) Number of first eigenvectors related to the market component. (1 by default) 113 | :return: (np.array) De-toned correlation matrix. 114 | """ 115 | 116 | # Getting the eigenvalues and eigenvectors related to the market component 117 | eigenvalues_mark = eigenvalues[:market_component, :market_component] 118 | eigenvectors_mark = eigenvectors[:, :market_component] 119 | 120 | # Calculating the market component correlation 121 | corr_mark = np.dot(eigenvectors_mark, eigenvalues_mark).dot(eigenvectors_mark.T) 122 | 123 | # Removing the market component from the de-noised correlation matrix 124 | corr = corr - corr_mark 125 | 126 | # Rescaling the correlation matrix to have 1s on the main diagonal 127 | corr = cov2corr(corr) 128 | 129 | return corr 130 | 131 | def test_detone(q=10): # q=T/N; the default mirrors the value used in __main__ (q was previously read from the global scope and was undefined when calling this function on its own) 132 | # ------ Test detone -------- 133 | cov_matrix = np.array([[0.01, 0.002, -0.001], 134 | [0.002, 0.04, -0.006], 135 | [-0.001, -0.006, 0.01]]) 136 | cor_test = np.corrcoef(cov_matrix, rowvar=0) 137 | eVal_test, eVec_test = getPCA(cor_test) 138 | eMax_test, var_test = findMaxEval(np.diag(eVal_test), q, bWidth=.01) 139 | nFacts_test = eVal_test.shape[0]-np.diag(eVal_test)[::-1].searchsorted(eMax_test) 140 | corr1_test = denoisedCorr(eVal_test, eVec_test, nFacts_test) 141 | eVal_denoised_test, eVec_denoised_test = getPCA(corr1_test) 142 | corr_detoned_denoised_test = detoned_corr(corr1_test, eVal_denoised_test, eVec_denoised_test) 143 | eVal_detoned_denoised_test, _ = getPCA(corr_detoned_denoised_test) 144 | np.diag(eVal_denoised_test) 145 | np.diag(eVal_detoned_denoised_test) 146 | 147 | expected_detoned_denoised_corr = np.array([ 1.56236229e+00, 1.43763771e+00, -2.22044605e-16]) 148 | 149 | np.testing.assert_almost_equal(np.diag(eVal_detoned_denoised_test), expected_detoned_denoised_corr, decimal=4) 150 | np.testing.assert_almost_equal(sum(np.diag(eVal_denoised_test)), sum(np.diag(eVal_detoned_denoised_test)), decimal=4 ) 151 | 152 | if __name__ == '__main__': 153 | # code snippet 2.2 - marcenko-pastur pdf explains eigenvalues of random matrix x 154 | N = 1000 155 | T = 10000 156 | x = np.random.normal(0, 1, size = (T, N)) 157 | cor = np.corrcoef(x, rowvar=0) # cor.shape = (1000,1000). If rowvar=1 - row represents a var, with observations in the columns.
158 | eVal0 , eVec0 = getPCA( cor ) 159 | pdf0 = mpPDF(1., q=x.shape[0]/float(x.shape[1]), pts=N) 160 | pdf1 = fitKDE(np.diag(eVal0), bWidth=.005) #empirical pdf 161 | 162 | # code snippet 2.3 - random matrix with signal 163 | alpha, nCols, nFact, q = .995, 1000, 100, 10 164 | pdf0 = mpPDF(1., q=x.shape[0]/float(x.shape[1]), pts=N) 165 | cov = np.cov(np.random.normal(size=(nCols*q, nCols)), rowvar=0) #size = (1000*10,1000) 166 | cov = alpha*cov+(1-alpha)*getRndCov(nCols, nFact) # noise + signal 167 | corr0 = cov2corr(cov) 168 | eVal01, eVec01 = getPCA(corr0) 169 | #pdf2 = fitKDE(np.diag(eVal01), bWidth=.15) #empirical pdf 170 | 171 | # Figure 2.1 Plot empirical:KDE and Marcenko-Pastur, and histogram 172 | fig = plt.figure() 173 | ax = fig.add_subplot(111) 174 | ax.hist(np.diag(eVal01), density = True, bins=50) # Histogram the eigenvalues 175 | 176 | #plt.plot(pdf0.keys(), pdf0, color='r', label="Marcenko-Pastur pdf") 177 | #plt.plot(pdf1.keys(), pdf1, color='g', label="Empirical:KDE") 178 | #plt.plot(x_range, pdf2, color='b', label="Eigenvalues of random-matrix with signal") 179 | #plt.legend(loc="upper right") 180 | #plt.show() 181 | 182 | # code snippet 2.4 - fitting the marcenko-pastur pdf - find variance 183 | eMax0, var0 = findMaxEval(np.diag(eVal01), q, bWidth=.01) 184 | nFacts0 = eVal01.shape[0]-np.diag(eVal01)[::-1].searchsorted(eMax0) 185 | 186 | #code snippet 2.3 - with random matrix with signal 187 | ###################### 188 | # Figure 2.1 Plot empirical:KDE and Marcenko-Pastur, and histogram 189 | pdf0 = mpPDF(var0, q=x.shape[0]/float(x.shape[1]), pts=N) 190 | fig = plt.figure() 191 | ax = fig.add_subplot(111) 192 | ax.hist(np.diag(eVal01), density = True, bins=50) # Histogram the eigenvalues 193 | 194 | plt.plot(pdf0.keys(), pdf0, color='r', label="Marcenko-Pastur pdf") 195 | #plt.plot(pdf1.keys(), pdf1, color='g', label="Empirical:KDE") 196 | #plt.plot(x_range, pdf2, color='b', label="Eigenvalues of random-matrix with signal") 197 | plt.legend(loc="upper right") 198 | plt.show() 199 | ###################### 200 | 201 | # code snippet 2.5 - denoising by constant residual eigenvalue 202 | corr1 = denoisedCorr(eVal01, eVec01, nFacts0) 203 | eVal1, eVec1 = getPCA(corr1) 204 | 205 | denoised_eigenvalue = np.diag(eVal1) 206 | eigenvalue_prior = np.diag(eVal01) 207 | plt.plot(range(0, len(denoised_eigenvalue)), np.log(denoised_eigenvalue), color='r', label="Denoised eigen-function") 208 | plt.plot(range(0, len(eigenvalue_prior)), np.log(eigenvalue_prior), color='g', label="Original eigen-function") 209 | plt.xlabel("Eigenvalue number") 210 | plt.ylabel("Eigenvalue (log-scale)") 211 | plt.legend(loc="upper right") 212 | plt.show() 213 | 214 | corr_detoned_denoised = detoned_corr(corr1, eVal1, eVec1) 215 | 216 | eVal1_detoned, eVec1_detoned = getPCA(corr_detoned_denoised) 217 | detoned_denoised_eigenvalue = np.diag(eVal1_detoned) 218 | denoised_eigenvalue = np.diag(eVal1) 219 | eigenvalue_prior = np.diag(eVal01) 220 | 221 | plt.plot(range(0, len(detoned_denoised_eigenvalue)), np.log(detoned_denoised_eigenvalue), color='b', label="Detoned, denoised eigen-function") 222 | plt.plot(range(0, len(denoised_eigenvalue)), np.log(denoised_eigenvalue), color='r', label="Denoised eigen-function") 223 | plt.plot(range(0, len(eigenvalue_prior)), np.log(eigenvalue_prior), color='g', label="Original eigen-function") 224 | plt.xlabel("Eigenvalue number") 225 | plt.ylabel("Eigenvalue (log-scale)") 226 | plt.legend(loc="upper right") 227 | plt.show() 
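# Editor's note - a minimal sketch, not part of the original file: exercise 2.7 (findOptimalBWidth in
# ch2_fitKDE_find_best_bandwidth.py above) can supply a cross-validated bandwidth in place of the
# hard-coded bWidth values used in this script. The names `eigs`, `best`, `pdf_cv`, `eMax_cv` and
# `var_cv` below are illustrative assumptions; every function called is defined in this repository.
#   from Machine_Learning_for_Asset_Managers import ch2_fitKDE_find_best_bandwidth as bw
#   eigs = np.diag(eVal01)                                            # eigenvalues of the noisy correlation matrix built above
#   best = bw.findOptimalBWidth(eigs)                                 # GridSearchCV result, e.g. {'bandwidth': 0.05}
#   pdf_cv = fitKDE(eigs, bWidth=best['bandwidth'])                   # empirical pdf with the cross-validated bandwidth
#   eMax_cv, var_cv = findMaxEval(eigs, q, bWidth=best['bandwidth'])  # Marcenko-Pastur fit using the same bandwidth
# (Kept commented out because leave-one-out cross-validation over ~1000 eigenvalues is slow.)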
-------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch2_monte_carlo_experiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | from scipy.linalg import block_diag 5 | from sklearn.covariance import LedoitWolf 6 | 7 | from Machine_Learning_for_Asset_Managers import ch2_marcenko_pastur_pdf as mp 8 | 9 | #import cvxpy as cp 10 | 11 | # Code snippet 2.7 12 | #Generate a block-diagonal covariance matrix and a vector of means 13 | def formBlockMatrix(nBlocks, bSize, bCorr): 14 | block = np.ones( (bSize, bSize))*bCorr 15 | block[range(bSize), range(bSize)] = 1 #diagonal is 1 16 | corr = block_diag(*([block]*nBlocks)) 17 | return corr 18 | 19 | def formTrueMatrix(nBlocks, bSize, bCorr): 20 | corr0 = formBlockMatrix(nBlocks, bSize, bCorr) 21 | corr0 = pd.DataFrame(corr0) 22 | cols = corr0.columns.tolist() 23 | np.random.shuffle(cols) 24 | corr0 = corr0[cols].loc[cols].copy(deep=True) 25 | std0 = np.random.uniform(.05, .2, corr0.shape[0]) 26 | cov0 = corr2cov(corr0, std0) 27 | mu0 = np.random.normal(std0, std0, cov0.shape[0]).reshape(-1,1) 28 | return mu0, cov0 29 | 30 | def corr2cov(corr, std): 31 | cov = corr * np.outer(std, std) 32 | return cov 33 | 34 | # Code snippet 2.8 35 | # generating the empirical covariance matrix 36 | def simCovMu(mu0, cov0, nObs, shrink=False): 37 | x = np.random.multivariate_normal(mu0.flatten(), cov0, size = nObs) 38 | #print(x.shape) 39 | mu1 = x.mean(axis = 0).reshape(-1,1) #calc mean of columns of rand matrix 40 | #print(mu1.shape) 41 | if shrink: cov1 = LedoitWolf().fit(x).covariance_ 42 | else: cov1 = np.cov(x, rowvar=0) 43 | return mu1, cov1 44 | 45 | # code snippet 2.9 46 | # Denoising of the empirical covariance matrix 47 | # by constant residual eigenvalue method 48 | def deNoiseCov(cov0, q, bWidth): 49 | corr0 = mp.cov2corr(cov0) 50 | eVal0, eVec0 = mp.getPCA(corr0) 51 | eMax0, var0 = mp.findMaxEval(np.diag(eVal0), q, bWidth) 52 | nFacts0 = eVal0.shape[0]-np.diag(eVal0)[::-1].searchsorted(eMax0) 53 | corr1 = mp.denoisedCorr(eVal0, eVec0, nFacts0) #denoising by constant residual eigenvalue method 54 | cov1 = corr2cov(corr1, np.diag(cov0)**.5) 55 | return cov1 56 | 57 | # code snippet 2.10 58 | # Derive minimum-variance-portfolio 59 | # Returns a column vector of percentage allocations 60 | # should be subject to the Lagrangian constraints: 61 | # 1. lambda_1*(sum_i w_i*E[r_i] - d) = 0 62 | # 2.
lambda_2*(sum_i w_i - 1) = 0 63 | # where d is the expected rate of return 64 | # w* = C^-1*mu / (1.T*C^-1*mu) - is the minimum-variance portfolio 65 | #short sales are allowed 66 | def optPort(cov, mu = None): 67 | inv = np.linalg.inv(cov) #The precision matrix: contains information about the partial correlation between variables, 68 | # the covariance between pairs i and j, conditioned on all other variables (https://www.mn.uio.no/math/english/research/projects/focustat/publications_2/shatthik_barua_master2017.pdf) 69 | ones = np.ones(shape = (inv.shape[0], 1)) # column vector 1's 70 | if mu is None: 71 | mu = ones 72 | w = np.dot(inv, mu) 73 | w /= np.dot(ones.T, w) # def: w = w / sum(w) ~ w is column vector 74 | 75 | return w 76 | 77 | #optPort with long only, courtesy of Brady Preston 78 | #requires: import cvxpy as cp 79 | '''def optPort(cov,mu=None): 80 | n = cov.shape[0] 81 | if mu is None:mu = np.abs(np.random.randn(n, 1)) 82 | w = cp.Variable(n) 83 | risk = cp.quad_form(w, cov) 84 | ret = mu.T @ w 85 | constraints = [cp.sum(w) == 1, w >= 0] 86 | prob = cp.Problem(cp.Minimize(risk),constraints) 87 | prob.solve(verbose=True) 88 | return np.array(w.value.flat).round(4)''' 89 | 90 | #According to the question 'Tangent portfolio weights without short sales?' 91 | #there is no analytical solution to the GMV problem when short sales are not allowed 92 | #So: set the negative weights of the GMV solution to 0, and rescale w to sum to 1 93 | def optPortLongOnly(cov, mu = None): 94 | inv = np.linalg.inv(cov) 95 | ones = np.ones(shape = (inv.shape[0], 1)) # column vector 1's 96 | if mu is None: 97 | mu = ones 98 | w = np.dot(inv, mu) 99 | w /= np.dot(ones.T, w) # def: w = w / sum(w) ~ w is column vector 100 | w = w.flatten() 101 | threshold = w < 0 102 | wpluss = w.copy() 103 | wpluss[threshold] = 0 104 | wpluss = wpluss/np.sum(wpluss) 105 | 106 | return wpluss 107 | 108 | if __name__ == '__main__': 109 | nBlocks, bSize, bCorr = 2, 2, .5 110 | np.random.seed(0) 111 | mu0, cov0 = formTrueMatrix(nBlocks, bSize, bCorr) 112 | 113 | # code snippet 2.10 114 | nObs, nTrials, bWidth, shrink, minVarPortf = 5, 5, .01, False, True 115 | w1 = pd.DataFrame(columns = range(cov0.shape[0]), index = range(nTrials), dtype=float) 116 | 117 | w1_d = w1.copy(deep=True) 118 | np.random.seed(0) 119 | for i in range(nTrials): 120 | mu1, cov1 = simCovMu(mu0, cov0, nObs, shrink = shrink) 121 | if minVarPortf: mu1 = None 122 | cov1_d = deNoiseCov(cov1, nObs*1./cov1.shape[1], bWidth) 123 | w1.loc[i] = optPort(cov1, mu1).flatten() # add column vector w as row in w1 124 | w1_d.loc[i] = optPort(cov1_d, mu1).flatten() # np.sum(w1_d, axis=1) is a vector of 1's.
sum(np.sum(w1_d, axis=0)) = nTrials 125 | # so minimum-variance-portfolio is 1./nTrials*(np.sum(w1_d, axis=0)) - but distribution not stationary 126 | 127 | min_var_port = 1./nTrials*(np.sum(w1_d, axis=0)) 128 | #code snippet 2.11 129 | w0 = optPort(cov0, None if minVarPortf else mu0) # w0 true percentage asset allocation 130 | w0 = np.repeat(w0.T, w1.shape[0], axis=0) 131 | rmsd = np.mean((w1-w0).values.flatten()**2)**.5 #RMSE not denoised 132 | rmsd_d = np.mean((w1_d-w0).values.flatten()**2)**.5 #RMSE denoised 133 | print("RMSE not denoised:"+str( rmsd)) 134 | print("RMSE denoised:"+str( rmsd_d)) 135 | -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch3_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import scipy.stats as ss 4 | from sklearn.metrics import mutual_info_score 5 | 6 | # Marginal-, joint-distribution, conditional entropies, and mutual information 7 | # https://pypi.org/project/pyitlib/ 8 | 9 | #codesnippet 3.3 10 | #Variation of information on discretized continuous random variables 11 | def numBins(nObs, corr=None): 12 | #optimal number of bins for discretization 13 | if corr is None: #univariate case 14 | z = (8+324*nObs+12*(36*nObs+729*nObs**2)**.5)**(1/3.) 15 | b = round(z/6.+2./(3*z)+1./3) 16 | else: #bivariate case 17 | b = round(2**-.5*(1+(1+24*nObs/(1.-corr**2))**.5)**.5) 18 | 19 | return int(b) 20 | 21 | #codesnippet 3.2 22 | def varInfo(x,y, bins, norm=False): 23 | #variation of information 24 | bXY = numBins(x.shape[0], corr= np.corrcoef(x,y)[0,1]) 25 | bins = bXY 26 | cXY = np.histogram2d(x, y, bins)[0] 27 | hX = ss.entropy(np.histogram(x, bins)[0]) #marginal 28 | hY = ss.entropy(np.histogram(y, bins)[0]) #marginal 29 | iXY = mutual_info_score(None, None, contingency=cXY) 30 | vXY = hX+hY-2*iXY #variation of information 31 | if norm: 32 | hXY = hX + hY - iXY #joint 33 | vXY = vXY/hXY #normalized variation of information - Kraskov (2008) 34 | 35 | return vXY 36 | 37 | #codesnippet 3.4 Correlation and normalized mutual information of two independent gaussian random variables 38 | def mutualInfo(x,y, norm=False): 39 | #mutual information 40 | bXY = numBins(x.shape[0], corr = np.corrcoef(x,y)[0,1]) 41 | cXY = np.histogram2d(x,y, bXY)[0] 42 | iXY = mutual_info_score(None, None, contingency=cXY) 43 | if norm: 44 | hX = ss.entropy(np.histogram(x, bXY)[0]) #marginal 45 | hY = ss.entropy(np.histogram(y, bXY)[0]) #marginal 46 | iXY /= min(hX, hY) #normalized mutual information 47 | 48 | return iXY 49 | 50 | if __name__ == '__main__': 51 | x = np.random.normal(0, 1, 1000) 52 | y = np.random.normal(0, 1, 1000) 53 | bins=10 # discretize sample space 54 | 55 | #codesnippet 3.1 56 | cXY = np.histogram2d(x, y, bins)[0] 57 | hX = ss.entropy(np.histogram(x, bins)[0]) #marginal 58 | hY = ss.entropy(np.histogram(y, bins)[0]) #marginal 59 | iXY = mutual_info_score(None, None, contingency=cXY) 60 | iXYn = iXY/min(hX, hY) #normalized mutual information 61 | hXY = hX+hY - iXY #joint 62 | hX_Y = hXY-hY #conditional 63 | hY_X = hXY-hX #conditional 64 | 65 | #codesnippet 3.4 66 | size, seed = 5000, 0 67 | np.random.seed(seed) 68 | x = np.random.normal(size=size) 69 | e = np.random.normal(size=size) 70 | y = 0*x+e 71 | nmi = mutualInfo(x,y,True) 72 | corr = np.corrcoef(x,y)[0,1] 73 | 74 | --------------------------------------------------------------------------------
/Machine_Learning_for_Asset_Managers/ch4_optimal_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import KMeans 4 | from sklearn.metrics import silhouette_samples, silhouette_score 5 | from sklearn.utils import check_random_state 6 | from scipy.linalg import block_diag 7 | import matplotlib.pylab as plt 8 | import matplotlib 9 | 10 | from Machine_Learning_for_Asset_Managers import ch2_marcenko_pastur_pdf as mp 11 | 12 | ''' 13 | Optimal Number of Clusters (ONC Algorithm) 14 | Detection of False Investment Strategies using Unsupervised Learning Methods 15 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3167017 16 | ''' 17 | 18 | '''codesnippet 4.1 19 | base clustering: Evaluate the correlation matrix as a distance matrix, 20 | then find clusters; in the inner loop, we try different k=2..N 21 | on which to cluster with kmeans for one given initialization, 22 | and evaluate q = E(silhouette)/std(silhouette) for all clusters. 23 | The outer loop repeats the inner loop with initializations of 24 | _different centroid seeds_ 25 | 26 | kmeans.labels_ is the assignment of members to the cluster 27 | [0 1 1 0 0] 28 | [1 0 0 1 1] is equivalent 29 | ''' 30 | def clusterKMeansBase(corr0, maxNumClusters=10, n_init=10, debug=False): 31 | corr0[corr0 > 1] = 1 32 | dist_matrix = ((1-corr0.fillna(0))/2.)**.5 33 | silh_coef_optimal = pd.Series(dtype='float64') #observations matrix 34 | kmeans, stat = None, None 35 | maxNumClusters = min(maxNumClusters, int(np.floor(dist_matrix.shape[0]/2))) 36 | print("maxNumClusters"+str(maxNumClusters)) 37 | for init in range(0, n_init): 38 | #The [outer] loop repeats the first loop multiple times, thereby obtaining different initializations.
Ref: de Prado and Lewis (2018) 39 | #DETECTION OF FALSE INVESTMENT STRATEGIES USING UNSUPERVISED LEARNING METHODS 40 | for num_clusters in range(2, maxNumClusters+1): 41 | #(maxNumClusters + 2 - num_clusters) # go in reverse order to view more sub-optimal solutions 42 | kmeans_ = KMeans(n_clusters=num_clusters, n_init=10) #, random_state=3425) #n_jobs=None #n_jobs=None - use all CPUs 43 | kmeans_ = kmeans_.fit(dist_matrix) 44 | silh_coef = silhouette_samples(dist_matrix, kmeans_.labels_) 45 | stat = (silh_coef.mean()/silh_coef.std(), silh_coef_optimal.mean()/silh_coef_optimal.std()) 46 | 47 | # If this metric is better than the previous one, set this as the optimal number of clusters 48 | if np.isnan(stat[1]) or stat[0] > stat[1]: 49 | silh_coef_optimal = silh_coef 50 | kmeans = kmeans_ 51 | if debug==True: 52 | print(kmeans) 53 | print(stat) 54 | silhouette_avg = silhouette_score(dist_matrix, kmeans_.labels_) 55 | print("For n_clusters ="+ str(num_clusters)+ "The average silhouette_score is :"+ str(silhouette_avg)) 56 | print("********") 57 | 58 | newIdx = np.argsort(kmeans.labels_) 59 | #print(newIdx) 60 | 61 | corr1 = corr0.iloc[newIdx] #reorder rows 62 | corr1 = corr1.iloc[:, newIdx] #reorder columns 63 | 64 | clstrs = {i:corr0.columns[np.where(kmeans.labels_==i)[0]].tolist() for i in np.unique(kmeans.labels_)} #cluster members 65 | silh_coef_optimal = pd.Series(silh_coef_optimal, index=dist_matrix.index) 66 | 67 | return corr1, clstrs, silh_coef_optimal 68 | 69 | #codesnippet 4.2 70 | #Top level of clustering 71 | ''' Improve the number of clusters using silh scores 72 | 73 | :param corr_mat: (pd.DataFrame) Correlation matrix 74 | :param clusters: (dict) Clusters elements 75 | :param top_clusters: (dict) Improved clusters elements 76 | :return: (tuple) [ordered correlation matrix, clusters, silh scores] 77 | ''' 78 | def makeNewOutputs(corr0, clstrs, clstrs2): 79 | clstrsNew, newIdx = {}, [] 80 | for i in clstrs.keys(): 81 | clstrsNew[len(clstrsNew.keys())] = list(clstrs[i]) 82 | 83 | for i in clstrs2.keys(): 84 | clstrsNew[len(clstrsNew.keys())] = list(clstrs2[i]) 85 | 86 | newIdx = [j for i in clstrsNew for j in clstrsNew[i]] 87 | corrNew = corr0.loc[newIdx, newIdx] 88 | 89 | dist = ((1 - corr0.fillna(0)) / 2.)**.5 90 | kmeans_labels = np.zeros(len(dist.columns)) 91 | for i in clstrsNew.keys(): 92 | idxs = [dist.index.get_loc(k) for k in clstrsNew[i]] 93 | kmeans_labels[idxs] = i 94 | 95 | silhNew = pd.Series(silhouette_samples(dist, kmeans_labels), index=dist.index) 96 | 97 | return corrNew, clstrsNew, silhNew 98 | 99 | ''' Recursively cluster 100 | Typical output: e.g. if there are 4 clusters: 101 | >>> _,_,_=clusterKMeansTop(corr0) 102 | redo cluster:[0, 1, 2, 5] 103 | redo cluster:[0, 1, 2] 104 | redo cluster:[1] 105 | redoCluster <=1:[1] 106 | newTstatMean > tStatMean 107 | newTstatMean > tStatMean 108 | >>> 109 | 110 | So it returns the first time on the base case >>>if len(redoClusters) <= 1 111 | Subsequent returns happen after the tail recursion 112 | ''' 113 | def clusterKMeansTop(corr0: pd.DataFrame, maxNumClusters=None, n_init=10): 114 | if maxNumClusters is None: 115 | maxNumClusters = corr0.shape[1]-1 116 | 117 | corr1, clstrs, silh = clusterKMeansBase(corr0, maxNumClusters=min(maxNumClusters, corr0.shape[1]-1), n_init=10)#n_init) 118 | print("clstrs length:"+str(len(clstrs.keys()))) 119 | print("best cluster:"+str(len(clstrs.keys()))) 120 | #for i in clstrs.keys(): 121 | # print("std:"+str(np.std(silh[clstrs[i]]))) 122 | 123 | clusterTstats = {i:np.mean(silh[clstrs[i]])/np.std(silh[clstrs[i]])
for i in clstrs.keys()} 124 | tStatMean = sum(clusterTstats.values())/len(clusterTstats) 125 | redoClusters = [i for i in clusterTstats.keys() if clusterTstats[i] < tStatMean] 126 | #print("redo cluster:"+str(redoClusters)) 127 | if len(redoClusters) <= 2: 128 | print("If 2 or less clusters have a quality rating less than the average then stop.") 129 | print("redoCluster <=1:"+str(redoClusters)+" clstrs len:"+str(len(clstrs.keys()))) 130 | return corr1, clstrs, silh 131 | else: 132 | keysRedo = [j for i in redoClusters for j in clstrs[i]] 133 | corrTmp = corr0.loc[keysRedo, keysRedo] 134 | _, clstrs2, _ = clusterKMeansTop(corrTmp, maxNumClusters=min(maxNumClusters, corrTmp.shape[1]-1), n_init=n_init) 135 | print("clstrs2.len, stat:"+str(len(clstrs2.keys()))) 136 | #Make new outputs, if necessary 137 | dict_redo_clstrs = {i:clstrs[i] for i in clstrs.keys() if i not in redoClusters} 138 | corrNew, clstrsNew, silhNew = makeNewOutputs(corr0, dict_redo_clstrs, clstrs2) 139 | newTstatMean = np.mean([np.mean(silhNew[clstrsNew[i]])/np.std(silhNew[clstrsNew[i]]) for i in clstrsNew.keys()]) 140 | if newTstatMean <= tStatMean: 141 | print("newTstatMean <= tStatMean"+str(newTstatMean)+ " (len:newClst)"+str(len(clstrsNew.keys()))+" <= "+str(tStatMean)+ " (len:Clst)"+str(len(clstrs.keys()))) 142 | return corr1, clstrs, silh 143 | else: 144 | print("newTstatMean > tStatMean"+str(newTstatMean)+ " (len:newClst)"+str(len(clstrsNew.keys())) 145 | +" > "+str(tStatMean)+ " (len:Clst)"+str(len(clstrs.keys()))) 146 | return corrNew, clstrsNew, silhNew 147 | #return corr1, clstrs, silh, stat 148 | 149 | # codesnippet 4.3 - utility for monte-carlo simulation 150 | # Random block correlation matrix creation 151 | # Simulates a time-series of at least 100 elements. 152 | # So each column is highly correlated for small sigma and less correlated for large sigma (standard deviation) 153 | # 154 | # a N(0,1) rv and a N(0,sigma^2) rv are added, which results in variance = 1+sigma^2 155 | def getCovSub(nObs, nCols, sigma, random_state=None): 156 | #sub correl matrix 157 | rng = check_random_state(random_state) 158 | if nCols == 1: 159 | return np.ones((1,1)) 160 | ar0 = rng.normal(size=(nObs, 1)) #array of normal rv 161 | ar0 = np.repeat(ar0, nCols, axis=1) #matrix of columns repeating rv. Simulate time-series of at least 100 elements. 162 | ar0 += rng.normal(loc=0, scale=sigma, size=ar0.shape) #add two rv: X~N(0,1), Y~N(0,sigma^2), Z=X+Y~N(0, 1+sigma^2) 163 | ar0 = np.cov(ar0, rowvar=False) #ar0.shape = nCols x nCols 164 | return ar0 165 | 166 | #generate a block random correlation matrix 167 | # 168 | # The last block in the matrix is going to be as large as possible 169 | # Controlling the size of the last block matrix can be done by increasing minBlockSize 170 | # 171 | # parts is the size of the blocks.
If nCols, nBlocks, minBlockSize = 6,3,1 172 | # then parts = [1,1,4] resulting in 1x1, 1x1, 4x4 block covariance matrices 173 | # If a block is larger than 1x1, its diagonal is about 1+sigma^2 (=2 for sigma=1), since the 174 | # variance from getCovSub() is that of Z=X+Y => Var(Z)=Var(X)+Var(Y) 175 | def getRndBlockCov(nCols, nBlocks, minBlockSize=1, sigma=1., random_state=None): 176 | 177 | print("getRndBlockCov:"+str(minBlockSize)) 178 | rng = check_random_state(random_state) 179 | parts = rng.choice(range(1, nCols-(minBlockSize-1)*nBlocks), nBlocks-1, replace=False) 180 | parts.sort() 181 | parts = np.append(parts, nCols-(minBlockSize-1)*nBlocks) #add nCols to list of parts, unless minBlockSize>1 182 | parts = np.append(parts[0], np.diff(parts))-1+minBlockSize 183 | print("block sizes:"+str(parts)) 184 | cov=None 185 | for nCols_ in parts: 186 | cov_ = getCovSub(int(max(nCols_*(nCols_+1)/2., 100)), nCols_, sigma, random_state=rng) 187 | if cov is None: 188 | cov = cov_.copy() 189 | else: 190 | cov = block_diag(cov, cov_) #stack the square blocks along the diagonal of a larger matrix 191 | 192 | return cov 193 | 194 | # add two random covariance matrices and return the correlation matrix as a dataframe. 195 | # 196 | # The first covariance matrix consists of nBlocks 197 | # and the second matrix consists of 1 block - which adds noise. 198 | # Note: noise is also added in each block matrix. Why is noise added 2 times? 199 | def randomBlockCorr(nCols, nBlocks, random_state=None, minBlockSize=1): 200 | #Form block corr 201 | rng = check_random_state(random_state) 202 | 203 | print("randomBlockCorr:"+str(minBlockSize)) 204 | cov0 = getRndBlockCov(nCols, nBlocks, minBlockSize=minBlockSize, sigma=.5, random_state=rng) 205 | cov1 = getRndBlockCov(nCols, 1, minBlockSize=minBlockSize, sigma=1., random_state=rng) #add noise 206 | cov0 += cov1 207 | corr0 = mp.cov2corr(cov0) 208 | corr0 = pd.DataFrame(corr0) 209 | return corr0 210 | 211 | if __name__ == '__main__': 212 | nCols, nBlocks = 6, 3 213 | nObs = 8 214 | sigma = 1.
215 | corr0 = randomBlockCorr(nCols, nBlocks) 216 | testGetCovSub = getCovSub(nObs, nCols, sigma, random_state=None) #6x6 matrix 217 | 218 | # recreate fig 4.1 colormap of random block correlation matrix 219 | nCols, nBlocks, minBlockSize = 30, 6, 2 220 | print("minBlockSize"+str(minBlockSize)) 221 | corr0 = randomBlockCorr(nCols, nBlocks, minBlockSize=minBlockSize) #pandas df 222 | 223 | corr1, clstrs, silh = clusterKMeansTop(corr0) #corr0 is ground truth, corr1 is ONC 224 | 225 | #Draw ground truth 226 | matplotlib.pyplot.matshow(corr0) #invert y-axis to get origin at lower left corner 227 | matplotlib.pyplot.gca().xaxis.tick_bottom() 228 | matplotlib.pyplot.gca().invert_yaxis() 229 | matplotlib.pyplot.colorbar() 230 | matplotlib.pyplot.show() 231 | 232 | #draw prediction based on ONC 233 | corrNew, clstrsNew, silhNew = clusterKMeansTop(corr0) 234 | matplotlib.pyplot.matshow(corrNew) 235 | matplotlib.pyplot.gca().xaxis.tick_bottom() 236 | matplotlib.pyplot.gca().invert_yaxis() 237 | matplotlib.pyplot.colorbar() 238 | matplotlib.pyplot.show() 239 | 240 | -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch5_financial_labels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import statsmodels.api as sm1 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pylab as plt 6 | from sklearn import linear_model, datasets 7 | 8 | #Trend scanning method 9 | 10 | #code snippet 5.1 11 | # Fit linear regression on close 12 | # Return the t-statistic for a given parameter estimate. 13 | def tValLinR(close): 14 | #tValue from a linear trend 15 | x = np.ones((close.shape[0],2)) 16 | x[:,1] = np.arange(close.shape[0]) 17 | ols = sm1.OLS(close, x).fit() 18 | return ols.tvalues[1] 19 | 20 | #code snippet 5.2 21 | ''' 22 | - molecule - index of observations we wish to label. 23 | - close - which is the time series of x_t 24 | - span - is the set of values of L (look-forward period) that the algorithm will try. 25 | The L that maximizes |tHat_B_1| is chosen - which is the look-forward period 26 | with the most significant trend. 27 | ''' 28 | def getBinsFromTrend(molecule, close, span): 29 | ''' 30 | Derive labels from the sign of the t-value of the trend line 31 | output includes: 32 | - t1: End time for the identified trend 33 | - tVal: t-value associated with the estimated trend coefficient 34 | - bin: Sign of the trend 35 | The t-statistic for each tick has a different look-back window. 36 | 37 | - idx start time in look-forward window 38 | - dt1 stop time in look-forward window 39 | - df1 is the look-forward window 40 | - iloc ?
41 | ''' 42 | out = pd.DataFrame(index=molecule, columns=['t1', 'tVal', 'bin', 'windowSize']) 43 | hrzns = range(*span) 44 | windowSize = span[1] - span[0] 45 | maxWindow = span[1]-1 46 | minWindow = span[0] 47 | for idx in close.index: 48 | idx += maxWindow 49 | if idx >= len(close): 50 | break 51 | df_tval = pd.Series(dtype='float64') 52 | iloc0 = close.index.get_loc(idx) 53 | #if iloc0+max(hrzns) > close.shape[0]: 54 | # continue 55 | for hrzn in hrzns: 56 | dt1 = close.index[iloc0-hrzn+1] 57 | df1 = close.loc[dt1:idx] 58 | df_tval.loc[dt1] = tValLinR(df1.values) #calculates t-statistics on period 59 | dt1 = df_tval.replace([-np.inf, np.inf, np.nan], 0).abs().idxmax() #get largest t-statistics calculated over span period 60 | 61 | print(df_tval.index[-1]) 62 | print(dt1) 63 | print(abs(df_tval.values).argmax() + minWindow) 64 | out.loc[idx, ['t1', 'tVal', 'bin', 'windowSize']] = df_tval.index[-1], df_tval[dt1], np.sign(df_tval[dt1]), abs(df_tval.values).argmax() + minWindow #prevent leakage 65 | out['t1'] = pd.to_datetime(out['t1']) 66 | out['bin'] = pd.to_numeric(out['bin'], downcast='signed') 67 | 68 | #deal with massive t-Value outliers - they don't provide more confidence and they ruin the scatter plot 69 | tValueVariance = out['tVal'].values.var() 70 | tMax = 20 71 | if tValueVariance < tMax: 72 | tMax = tValueVariance 73 | 74 | out.loc[out['tVal'] > tMax, 'tVal'] = tMax #cutoff tValues > 20 75 | out.loc[out['tVal'] < (-1)*tMax, 'tVal'] = (-1)*tMax #cutoff tValues < -20 76 | return out.dropna(subset=['bin']) 77 | 78 | if __name__ == '__main__': 79 | #snippet 5.3 80 | idx_range_from = 3 81 | idx_range_to = 10 82 | df0 = pd.Series(np.random.normal(0, .1, 100)).cumsum() 83 | df0 += np.sin(np.linspace(0, 10, df0.shape[0])) 84 | df1 = getBinsFromTrend(df0.index, df0, [idx_range_from,idx_range_to,1]) #[3,10,1] = range(3,10) 85 | tValues = df1['tVal'].values #tVal 86 | 87 | doNormalize = False 88 | #normalise t-values to -1, 1 89 | if doNormalize: 90 | np.min(tValues) 91 | minusArgs = [i for i in range(0, len(tValues)) if tValues[i] < 0] 92 | tValues[minusArgs] = tValues[minusArgs] / (np.min(tValues)*(-1.0)) 93 | 94 | plus_one = [i for i in range(0, len(tValues)) if tValues[i] > 0] 95 | tValues[plus_one] = tValues[plus_one] / np.max(tValues) 96 | 97 | #+(idx_range_to-idx_range_from+1) 98 | plt.scatter(df1.index, df0.loc[df1.index].values, c=tValues, cmap='viridis') #df1['tVal'].values, cmap='viridis') 99 | plt.plot(df0.index, df0.values, color='gray') 100 | plt.colorbar() 101 | plt.savefig('fig5.2.png') #save before show(), otherwise the saved figure is blank 102 | plt.show() 103 | plt.clf() 104 | plt.close() 105 | plt.scatter(df1.index, df0.loc[df1.index].values, c=df1['bin'].values, cmap='viridis') 106 | 107 | #Test methods 108 | ols_tvalue = tValLinR( np.array([3.0, 3.5, 4.0]) ) 109 | -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch6_feature_importance_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.datasets import make_classification 3 | from sklearn.tree import DecisionTreeClassifier 4 | from sklearn.ensemble import BaggingClassifier 5 | from sklearn.metrics import log_loss 6 | from sklearn.model_selection import KFold 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | import statsmodels.api as sm1 11 | import matplotlib.pylab as plt 12 | 13 | #from .ch2_marcenko_pastur_pdf import ch2_marcenko_pastur_pdf as mp 14 | from .ch4_optimal_clustering
import clusterKMeansBase 15 | 16 | #Code snippet 6.1 generating a set of informative, redundant, and noisy explanatory variables 17 | # returns matrix X of training samples, and vector y of class labels for the training samples 18 | def getTestData(n_features=100, n_informative=25, n_redundant=25, n_samples=10000, random_state=0, sigmaStd=.0): 19 | #generate a random dataset for a classification problem 20 | np.random.seed(random_state) 21 | X, y = make_classification(n_samples=n_samples, n_features=n_features-n_redundant, 22 | n_informative=n_informative, n_redundant=0, shuffle=False, random_state=random_state) 23 | cols = ['I_'+str(i) for i in range(0, n_informative)] 24 | cols += ['N_'+str(i) for i in range(0, n_features - n_informative - n_redundant)] 25 | X, y = pd.DataFrame(X, columns=cols), pd.Series(y) 26 | i = np.random.choice(range(0, n_informative), size=n_redundant) 27 | for k, j in enumerate(i): 28 | X['R_'+str(k)] = X['I_' + str(j)] + np.random.normal(size=X.shape[0])*sigmaStd 29 | return X, y 30 | 31 | #code snippet 6.2 implementation of an ensemble MDI method 32 | def featImpMDI(fit, featNames): 33 | #feat importance based on IS mean impurity reduction 34 | df0 = {i:tree.feature_importances_ for i, tree in enumerate(fit.estimators_)} 35 | df0 = pd.DataFrame.from_dict(df0, orient='index') 36 | df0.columns = featNames 37 | df0 = df0.replace(0, np.nan) #because max_features=1 38 | imp = pd.concat({'mean':df0.mean(), 'std':df0.std()*df0.shape[0]**-.5}, axis=1) #CLT 39 | imp /= imp['mean'].sum() 40 | return imp 41 | 42 | #code snippet 6.3 implementation of MDA - out-of-sample feature importance with cross-validation 43 | def featImpMDA(clf, X, y, n_splits=10): 44 | #feat importance based on OOS score reduction 45 | cvGen = KFold(n_splits=n_splits) 46 | scr0, scr1 = pd.Series(dtype='float64'), pd.DataFrame(columns=X.columns) 47 | for i, (train, test) in enumerate(cvGen.split(X=X)): 48 | x0, y0 = X.iloc[train, :], y.iloc[train] 49 | x1, y1 = X.iloc[test,:], y.iloc[test] 50 | fit = clf.fit(X=x0, y=y0) # the fit occurs here 51 | prob= fit.predict_proba(x1) #prediction before shuffles 52 | scr0.loc[i]=-log_loss(y1, prob, labels=clf.classes_) 53 | for j in X.columns: 54 | X1_ = x1.copy(deep=True) 55 | np.random.shuffle(X1_[j].values) #shuffle one column 56 | prob = fit.predict_proba(X1_) #prediction after shuffle 57 | scr1.loc[i,j] = -log_loss(y1, prob, labels=clf.classes_) 58 | imp=(-1*scr1).add(scr0, axis=0) 59 | imp = imp/(-1*scr1) 60 | imp=pd.concat({'mean':imp.mean(), 'std':imp.std()*imp.shape[0]**-.5}, axis=1) #CLT 61 | return imp 62 | 63 | #code snippet 6.4 - clustered MDI 64 | def groupMeanStd(df0, clstrs): 65 | out = pd.DataFrame(columns=['mean', 'std']) 66 | for i, j in clstrs.items(): 67 | df1 = df0[j].sum(axis=1) 68 | out.loc['C_'+str(i), 'mean'] = df1.mean() 69 | out.loc['C_'+str(i), 'std'] = df1.std() * df1.shape[0]**-.5 70 | return out 71 | 72 | def featImpMDI_Clustered(fit, featNames, clstrs): 73 | df0 = {i:tree.feature_importances_ for i, tree in enumerate(fit.estimators_)} 74 | df0 = pd.DataFrame.from_dict(df0, orient='index') 75 | df0.columns = featNames 76 | df0 = df0.replace(0, np.nan) #because max_features=1 77 | imp = groupMeanStd(df0, clstrs) 78 | imp /= imp['mean'].sum() 79 | return imp 80 | 81 | #code snippet 6.5 - clustered MDA 82 | def featImpMDA_Clustered(clf, X, y, clstrs, n_splits=10): 83 | cvGen = KFold(n_splits=n_splits) 84 | scr0, scr1 = pd.Series(dtype='float64'), pd.DataFrame(columns=clstrs.keys()) 85 | for i, (train, test) in enumerate(cvGen.split(X=X)): 86 | X0, y0 =
X.iloc[train,:], y.iloc[train] 87 | X1, y1 = X.iloc[test, :], y.iloc[test] 88 | fit = clf.fit(X=X0, y=y0) 89 | prob=fit.predict_proba(X1) 90 | scr0.loc[i] = -log_loss(y1, prob, labels=clf.classes_) 91 | for j in scr1.columns: 92 | X1_=X1.copy(deep=True) 93 | for k in clstrs[j]: 94 | np.random.shuffle(X1_[k].values) # shuffle clusters 95 | prob=fit.predict_proba(X1_) 96 | scr1.loc[i,j]=-log_loss(y1, prob, labels=clf.classes_) 97 | imp=(-1*scr1).add(scr0,axis=0) 98 | imp = imp/(-1*scr1) 99 | imp = pd.concat({'mean':imp.mean(), 'std':imp.std()*imp.shape[0]**-.5}, axis=1) 100 | imp.index=['C_'+str(i) for i in imp.index] 101 | return imp 102 | 103 | if __name__ == '__main__': 104 | # 40 features, 5 informative features, 30 redundant, 5 noisy features 105 | X, y = getTestData(40, 5, 30, 10000, sigmaStd=.1) 106 | ols = sm1.Logit(y, X).fit() 107 | ols.summary() 108 | plot_data = ols.pvalues.sort_values(ascending=False) 109 | plot_data.plot(kind='barh', figsize=(20,10), title="Figure 6.1 p-Values computed on a set of explanatory variables") 110 | plt.show() 111 | 112 | #code snippet 6.2 113 | X, y = getTestData(40, 5, 30, 10000, sigmaStd=.1) 114 | clf = DecisionTreeClassifier(criterion='entropy', 115 | max_features=1, 116 | class_weight='balanced', 117 | min_weight_fraction_leaf=0) 118 | 119 | clf = BaggingClassifier(estimator=clf, 120 | n_estimators=1000, 121 | max_features=1., 122 | max_samples=1., 123 | oob_score=False) 124 | fit = clf.fit(X,y) 125 | imp = featImpMDI(fit, featNames=X.columns) 126 | 127 | #print the graph Example 6.2 Example of MDI results 128 | imp.sort_values('mean', inplace=True) 129 | plt.figure(figsize=(10, imp.shape[0] / 5)) 130 | imp['mean'].plot(kind='barh', color='b', alpha=0.25, xerr=imp['std'], error_kw={'ecolor': 'r'}) 131 | plt.title('Figure 6.2 Example of MDI results') 132 | plt.show() 133 | 134 | #code snippet 6.3 135 | X, y = getTestData(40, 5, 30, 10000, sigmaStd=.1) 136 | clf = DecisionTreeClassifier(criterion='entropy', 137 | max_features=1, 138 | class_weight='balanced', 139 | min_weight_fraction_leaf=0) 140 | 141 | clf = BaggingClassifier(estimator=clf, 142 | n_estimators=1000, 143 | max_features=1., 144 | max_samples=1., 145 | oob_score=False) 146 | fit = clf.fit(X,y) 147 | imp = featImpMDA(clf, X, y, 10) 148 | 149 | imp.sort_values('mean', inplace=True) 150 | plt.figure(figsize=(10, imp.shape[0] / 5)) 151 | imp['mean'].plot(kind='barh', color='b', alpha=0.25, xerr=imp['std'], error_kw={'ecolor': 'r'}) 152 | plt.title('Figure 6.3 Example of MDA results') 153 | plt.show() 154 | 155 | #code snippet 6.6 - features clustering step 156 | X, y = getTestData(40, 5, 30, 10000, sigmaStd=.1) 157 | corr0, clstrs, silh = clusterKMeansBase(X.corr(), maxNumClusters=10, n_init=10) 158 | fig, ax = plt.subplots(figsize=(13,10)) 159 | sns.heatmap(corr0, cmap='viridis') 160 | plt.show() 161 | 162 | #code snippet 6.7 - calling the functions for clustered MDI 163 | X, y = getTestData(40, 5, 30, 10000, sigmaStd=.1) 164 | clf = DecisionTreeClassifier(criterion='entropy', 165 | max_features=1, 166 | class_weight='balanced', 167 | min_weight_fraction_leaf=0) 168 | 169 | clf = BaggingClassifier(estimator=clf, 170 | n_estimators=1000, 171 | max_features=1., 172 | max_samples=1., 173 | oob_score=False) 174 | fit = clf.fit(X,y) 175 | imp = featImpMDI_Clustered(fit, X.columns, clstrs) 176 | 177 | imp.sort_values('mean', inplace=True) 178 | plt.figure(figsize=(10, 5)) 179 | imp['mean'].plot(kind='barh', color='b', alpha=0.25, xerr=imp['std'], error_kw={'ecolor': 'r'}) 180 | 
plt.title('Figure 6.5 Clustered MDI') 181 | plt.show() 182 | 183 | #code snippet 6.8 - calling the functions for clustered MDA 184 | clf = DecisionTreeClassifier(criterion='entropy', 185 | max_features=1, 186 | class_weight='balanced', 187 | min_weight_fraction_leaf=0) 188 | 189 | clf = BaggingClassifier(estimator=clf, 190 | n_estimators=1000, 191 | max_features=1., 192 | max_samples=1., 193 | oob_score=False) 194 | fit = clf.fit(X,y) 195 | imp = featImpMDA_Clustered(clf, X, y, clstrs, 10) 196 | 197 | imp.sort_values('mean', inplace=True) 198 | plt.figure(figsize=(10, 5)) 199 | imp['mean'].plot(kind='barh', color='b', alpha=0.25, xerr=imp['std'], error_kw={'ecolor': 'r'}) 200 | plt.title('Figure 6.6 Clustered MDA') 201 | plt.show() -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch7_portfolio_construction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | from scipy.linalg import block_diag 5 | import matplotlib.pylab as plt 6 | import matplotlib.pyplot as mp1 7 | import seaborn as sns 8 | 9 | from Machine_Learning_for_Asset_Managers import ch2_monte_carlo_experiment as mc 10 | from Machine_Learning_for_Asset_Managers import ch2_marcenko_pastur_pdf as mp 11 | from Machine_Learning_for_Asset_Managers import ch4_optimal_clustering as oc 12 | 13 | 14 | def minVarPort(cov): 15 | return mc.optPort(cov, mu = None) 16 | 17 | # code snippet 7.6 - function implementing the NCO algorithm 18 | # Long only portfolio uses allocate_cvo() 19 | # Method assumes input - detoned covariance matrix 20 | def optPort_nco(cov, mu=None, maxNumClusters=None): 21 | cov = pd.DataFrame(cov) 22 | if mu is not None: 23 | mu = pd.Series(mu[:,0]) 24 | 25 | corr1 = mp.cov2corr(cov) 26 | 27 | # Optimal partition of clusters (step 1) 28 | corr1, clstrs, _ = oc.clusterKMeansBase(corr1, maxNumClusters, n_init=10) 29 | #wIntra = pd.DataFrame(0, index=cov.index, columns=clstrs.keys()) 30 | w_intra_clusters = pd.DataFrame(0, index=cov.index, columns=clstrs.keys()) 31 | for i in clstrs: 32 | cov_cluster = cov.loc[clstrs[i], clstrs[i]].values 33 | if mu is None: 34 | mu_cluster = None 35 | else: 36 | mu_cluster = mu.loc[clstrs[i]].values.reshape(-1,1) 37 | 38 | #Long/Short 39 | #w_intra_clusters.loc[clstrs[i],i] = mc.optPort(cov_cluster, mu_cluster).flatten() 40 | 41 | # Long only: Estimating the Convex Optimization Solution in a cluster (step 2) 42 | w_intra_clusters.loc[clstrs[i], i] = allocate_cvo(cov_cluster, mu_cluster).flatten() 43 | 44 | cov_inter_cluster = w_intra_clusters.T.dot(np.dot(cov, w_intra_clusters)) #reduce covariance matrix 45 | mu_inter_cluster = (None if mu is None else w_intra_clusters.T.dot(mu)) 46 | 47 | #Long/Short 48 | #w_inter_clusters = pd.Series(mc.optPort(cov_inter_cluster, mu_inter_cluster).flatten(), index=cov_inter_cluster.index) 49 | # Long only: Optimal allocations across the reduced covariance matrix (step 3) 50 | w_inter_clusters = pd.Series(allocate_cvo(cov_inter_cluster, mu_inter_cluster).flatten(), index=cov_inter_cluster.index) 51 | 52 | # Final allocations - dot-product of the intra-cluster and inter-cluster allocations (step 4) 53 | nco = w_intra_clusters.mul(w_inter_clusters, axis=1).sum(axis=1).values.reshape(-1,1) 54 | return nco 55 | 56 | def allocate_cvo(cov, mu_vec=None): 57 | """ 58 | Estimates the Convex Optimization Solution (CVO). 59 | Uses the covariance matrix and the mu - optimal solution. 
60 | If mu is the vector of expected values from variables, the result will be 61 | a vector of weights with maximum Sharpe ratio. 62 | If mu is a vector of ones, the result will be a vector of weights with 63 | minimum variance. 64 | :param cov: (np.array) Covariance matrix of the variables. 65 | :param mu_vec: (np.array) Expected value of draws from the variables for maximum Sharpe ratio. 66 | None if outputting the minimum variance portfolio. 67 | :return: (np.array) Weights for optimal allocation. 68 | """ 69 | 70 | # Calculating the inverse covariance matrix 71 | inv_cov = np.linalg.inv(cov) 72 | 73 | # Generating a vector of size of the inverted covariance matrix 74 | ones = np.ones(shape=(inv_cov.shape[0], 1)) 75 | 76 | if mu_vec is None: # To output the minimum variance portfolio 77 | mu_vec = ones 78 | 79 | # Calculating the analytical solution using CVO - weights 80 | w_cvo = np.dot(inv_cov, mu_vec) 81 | w_cvo /= np.dot(mu_vec.T, w_cvo) 82 | 83 | return w_cvo 84 | 85 | if __name__ == '__main__': 86 | # code snippet 7.1 - Composition of block-diagonal correlation matric 87 | corr0 = mc.formBlockMatrix(2, 2, .5) 88 | eVal, eVec = np.linalg.eigh(corr0) 89 | matrix_condition_number = max(eVal)/min(eVal) 90 | print(matrix_condition_number) 91 | 92 | fig, ax = plt.subplots(figsize=(13,10)) 93 | sns.heatmap(corr0, cmap='viridis') 94 | plt.show() 95 | 96 | # code snippet 7.2 - block-diagonal correlation matrix with a dominant block 97 | corr0 = block_diag(mc.formBlockMatrix(1,2, .5)) 98 | corr1 = mc.formBlockMatrix(1,2, .0) 99 | corr0 = block_diag(corr0, corr1) 100 | eVal, eVec = np.linalg.eigh(corr0) 101 | matrix_condition_number = max(eVal)/min(eVal) 102 | print(matrix_condition_number) 103 | 104 | fig, ax = plt.subplots(figsize=(13,10)) 105 | sns.heatmap(corr1, cmap='viridis') 106 | plt.show() 107 | 108 | # code snippet 7.3 - NCO method. Step 1. Correlation matrix clustering 109 | nBlocks, bSize, bCorr = 2, 2, .5 110 | q = 10.0 111 | np.random.seed(0) 112 | mu0, cov0 = mc.formTrueMatrix(nBlocks, bSize, bCorr) 113 | cols = cov0.columns 114 | cov1 = mc.deNoiseCov(cov0, q, bWidth=.01) #denoise cov 115 | cov1 = pd.DataFrame(cov1, index=cols, columns=cols) 116 | corr1 = mp.cov2corr(cov1) 117 | corr1, clstrs, silh = oc.clusterKMeansBase(pd.DataFrame(corr0)) 118 | 119 | # code snippet 7.4 - intracluster optimal allocations 120 | # step 2. compute intracluster allocations using the denoised cov matrix 121 | wIntra = pd.DataFrame(0, index=cov0.index, columns=clstrs.keys()) 122 | for i in clstrs: 123 | wIntra.loc[clstrs[i], i] = minVarPort(corr1.loc[clstrs[i], clstrs[i]]).flatten() 124 | 125 | cov2 = wIntra.T.dot(np.dot(cov1, wIntra)) #reduced covariance matrix 126 | 127 | # code snippet 7.5 - intercluster optimal allocations 128 | # step 3. compute optimal intercluster allocations, usint the reduced covariance matrix 129 | # which is close to a diagonal matrix, so optimization problem is close to ideal case \ro =0 130 | wInter = pd.Series(minVarPort(cov2).flatten(), index=cov2.index) 131 | wAll0 = wIntra.mul(wInter, axis=1).sum(axis=1).sort_index() 132 | 133 | # step 4. 
Final allocations - dot-product of the intra-cluster and inter-cluster allocations 134 | #w_nco = w_intra_clusters.mul(w_inter_clusters, axis=1).sum(axis=1).values.reshape(-1, 1) 135 | nco = wIntra.mul(wInter, axis=1).sum(axis=1).values.reshape(-1,1) 136 | 137 | # code snippet 7.7 - data-generating process 138 | nBlocks, bSize, bCorr = 10, 50, .5 139 | np.random.seed(0) 140 | mu0, cov0 = mc.formTrueMatrix(nBlocks, bSize, bCorr) 141 | 142 | # code snippet 7.8 - drawing an empirical vector of means and covariance matrix 143 | nObs, nSims, shrink, minVarPortf = 1000, 1000, False, True 144 | np.random.seed(0) 145 | w1 = pd.DataFrame(0, index=range(0, nSims), columns=range(0, nBlocks*bSize)) 146 | w1_d = pd.DataFrame(0, index=range(0, nSims), columns=range(0, nBlocks*bSize)) 147 | for i in range(0, nSims): 148 | mu1, cov1 = mc.simCovMu(mu0, cov0, nObs, shrink=shrink) 149 | if minVarPortf: 150 | mu1 = None 151 | w1.loc[i] = mc.optPort(cov1, mu1).flatten() #markowitc 152 | w1_d.loc[i] = optPort_nco(cov1, mu1, int(cov1.shape[0]/2)).flatten() #nco 153 | 154 | # code snippet 7.9 - Estimation of allocation errors 155 | w0 = mc.optPort(cov0, None if minVarPortf else mu0) 156 | w0 = np.repeat(w0.T, w1.shape[0], axis=0) #true allocation 157 | rmsd = np.mean((w1-w0).values.flatten()**2)**.5 #RMSE 158 | rmsd_d = np.mean((w1_d-w0).values.flatten()**2)**.5 #RMSE 159 | ''' 160 | >>> rmsd 161 | 0.020737753489610305 #markowitc 162 | >>> rmsd_d 163 | 0.015918559234396952 #nco 164 | ''' 165 | 166 | 167 | -------------------------------------------------------------------------------- /Machine_Learning_for_Asset_Managers/ch8_testing_set_overfitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cupy #Requires Cuda environment (and numpy). Also set CUPY_CACHE_DIR=/gpfs/gpfs0/deep/cupy, pip install cupy-cuda112 3 | import pandas as pd 4 | from scipy.stats import norm, percentileofscore 5 | import scipy.stats as ss 6 | import matplotlib.pylab as plt 7 | import matplotlib as mpl 8 | import itertools 9 | 10 | # code in chapter 8 is from the paper: 11 | #THE DEFLATED SHARPE RATIO: CORRECTING FOR SELECTION BIAS, BACKTEST OVERFITTING AND NON-NORMALITY by David H. Bailey and Marcos López de Prado 12 | 13 | # code snippet 8.1 - experimental validation of the false strategy theorem 14 | # Calculates the theoretical E[SR_(n)] = expected Sharpe Ratio of the n'th order statistics (max) 15 | def getExpectedMaxSR(nTrials, meanSR, stdSR): 16 | #Expected max SR, controlling for SBuMT 17 | emc = 0.477215664901532860606512090082402431042159336 #Euler-Mascheronis Constant 18 | sr0 = (1-emc)*norm.ppf(1-1./nTrials)+emc*norm.ppf(1-(nTrials*np.e)**-1) 19 | sr0 = meanSR + stdSR*sr0 20 | return sr0 21 | 22 | def getDistMaxSR(nSims, nTrials, stdSR, meanSR): 23 | #Monte carlo of max{SR} on nTrials, from nSims simulations 24 | rng = np.random.RandomState() 25 | out = pd.DataFrame() 26 | for nTrials_ in nTrials: 27 | # 1) simulated sharpe ratios 28 | sr = pd.DataFrame(rng.randn(nSims, nTrials_)) #Return a sample (or samples) from the “standard normal” distribution. 
29 | sr = sr.sub(sr.mean(axis=1), axis=0) #center 30 | sr = sr.div(sr.std(axis=1), axis=0) #scale 31 | sr = meanSR+sr*stdSR 32 | #2) store output 33 | out_ = sr.max(axis=1).to_frame('max{SR}') 34 | out_['nTrials'] = nTrials_ 35 | out = out.append(out_, ignore_index=True) 36 | return out 37 | 38 | # code snippet 8.2 - mean and standard deviation of the prediction errors 39 | def getMeanStdError(nSims0, nSims1, nTrials, stdSR=1, meanSR=0): 40 | #compute standard deviation of errors per nTrials 41 | #nTrials: [number of SR used to derive max{SR}] 42 | #nSims0: number of max{SR} u{sed to estimate E[max{SR}] 43 | #nSims1: number of errors on which std is computed 44 | sr0=pd.Series({i:getExpectedMaxSR(i, meanSR, stdSR) for i in nTrials}) 45 | sr0 = sr0.to_frame('E[max{SR}]') 46 | sr0.index.name='nTrials' 47 | err=pd.DataFrame() 48 | for i in range(0, int(nSims1)): 49 | #sr1 = getDistDSR(nSims=1000, nTrials=nTrials, meanSR=0, stdSR=1) 50 | sr1 = getDistMaxSR(nSims=1000, nTrials=nTrials, meanSR=0, stdSR=1) 51 | sr1=sr1.groupby('nTrials').mean() 52 | err_=sr0.join(sr1).reset_index() 53 | err_['err'] = err_['max{SR}']/err_['E[max{SR}]']-1. 54 | err=err.append(err_) 55 | out = {'meanErr':err.groupby('nTrials')['err'].mean()} 56 | out['stdErr'] = err.groupby('nTrials')['err'].std() 57 | out = pd.DataFrame.from_dict(out, orient='columns') 58 | return out 59 | 60 | # code snippet 8.3 - Type I (False positive), with numerical example (Type II False negative) 61 | def getZStat(sr, t, sr_=0, skew=0, kurt=3): 62 | z = (sr-sr_)*(t-1)**.5 63 | z /= (1-skew*sr+(kurt-1)/4.*sr**2)**.5 64 | return z 65 | 66 | def type1Err(z, k=1): 67 | #false positive rate 68 | alpha = ss.norm.cdf(-z) 69 | alpha_k = 1-(1-alpha)**k #multi-testing correction 70 | return alpha_k 71 | 72 | # code snippet 8.4 - Type II error (false negative) - with numerical example 73 | def getTheta(sr, t, sr_=0., skew=0., kurt=3): 74 | theta = sr_*(t-1)**.5 75 | theta /= (1-skew*sr+(kurt-1)/.4*sr**2)**.5 76 | return theta 77 | 78 | def type2Err(alpha_k, k, theta): 79 | #false negative rate 80 | z = ss.norm.ppf((1-alpha_k)**(1./k)) #Sidak's correction 81 | beta = ss.norm.cdf(z-theta) 82 | return beta 83 | 84 | if __name__ == '__main__': 85 | # code snippet 8.1 86 | nTrials = list(set(np.logspace(1, 6, 100).astype(int))) #only 100 iterations, in book - 1000 87 | nTrials.sort() 88 | sr0 = pd.Series({i:getExpectedMaxSR(i, meanSR=0, stdSR=1) for i in nTrials}, name="E[max{SR}] (prior)") #prior 89 | sr1 = getDistMaxSR(nSims=100, nTrials = nTrials, meanSR=0, stdSR=1) #observed 90 | # Note: running it takes a lot of memory 91 | # PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 92 | # --- --- 20 0 38.1g 10.1g 227180 R 100.3 0.2 220:19.07 python 93 | 94 | ######### PLOT fig 8.1 #################### 95 | nnSR0 = list(itertools.chain.from_iterable(itertools.repeat(x, 100) for x in sr0.values)) 96 | deviationFromExpectation = abs(sr1['max{SR}'] - nnSR0) 97 | 98 | ax = sr1.plot.scatter(x='nTrials', y='max{SR}', label='Max{SR} (observed)', c=deviationFromExpectation, cmap=mpl.cm.viridis.reversed()) #c: Array of values to use for marker colors. 
99 | ax.set_xscale('log') 100 | ax.plot(nTrials, sr0, linestyle='--', linewidth=1, label='E[max{SR}} (prioer)', color='black') 101 | plt.legend() 102 | ax.figure.savefig('/gpfs/gpfs0/deep/maxSR_across_uniform_strategies_8_1.png') 103 | ######### end ####################### 104 | 105 | # code snippet 8.2 106 | nTrials = list(set(np.logspace(1, 6, 1000).astype(int))) 107 | nTrials.sort() 108 | stats = getMeanStdError(nSims0=1000, nSims1=100, nTrials=nTrials, stdSR=1) 109 | 110 | ######### plot fig 8.2 ############## 111 | ax = stats.plot() 112 | ax.set_xscale('log') 113 | ax.figure.savefig('/gpfs/gpfs0/deep/fig82.png') 114 | 115 | # code snippet 8.3 116 | #Numerical example 117 | t, skew, kurt, k, freq=1250, -3, 10, 10, 250 118 | sr = 1.25/freq**.5 119 | sr_ = 1./freq**.5 120 | z = getZStat(sr, t, 0, skew, kurt) 121 | alpha_k = type1Err(z, k=k) 122 | print(alpha_k) 123 | #>>> print(alpha_k) 124 | #0.060760769078662125 125 | 126 | # code snippet 8.4 127 | #numerical example 128 | t, skew, kurt, k, freq = 1250, -3, 10, 10, 250 129 | sr = 1.25/freq**.5 130 | sr_ = 1./freq**.5 131 | z = getZStat(sr, t, 0, skew, kurt) 132 | alpha_k = type1Err(z, k=k) 133 | theta = getTheta(sr, t, sr_, skew, kurt) 134 | beta = type2Err(alpha_k, k, theta) 135 | beta_k = beta**k 136 | print(beta_k) 137 | #>>> beta_k 138 | #0.039348420332089205 139 | 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Install Library 2 | 3 | ..with `pip install -U git+https://github.com/emoen/Machine-Learning-for-Asset-Managers` 4 | 5 |
6 | >>> from Machine_Learning_for_Asset_Managers import ch2_fitKDE_find_best_bandwidth as c 7 | >>> import numpy as np 8 | >>> c.findOptimalBWidth(np.asarray([21,3])) 9 | {'bandwidth': 10.0} 10 |11 | 12 | # Machine-Learning-for-Asset-Managers 13 | 14 | Implementation of code snippets and exercises from [Machine Learning for Asset Managers (Elements in Quantitative Finance)](https://www.amazon.com/Machine-Learning-Managers-Elements-Quantitative/dp/1108792898) 15 | written by Prof. Marcos López de Prado. 16 | 17 | The project is for my own learning. If you want to use the consepts from the book - you should head over to Hudson & Thames. They have implemented these consepts and many more in [mlfinlab](https://github.com/hudson-and-thames/mlfinlab). 18 | 19 | For practical application see the repository: [Machine-Learning-for-Asset-Managers-Oslo-Bors](https://github.com/emoen/Machine-Learning-for-Asset-Managers-Oslo-Bors). 20 | 21 | Note: In chapter 4 - there is a bug in the implementation of "Optimal Number of Clusters" algorithm (ONC) in the book 22 | (the code from the paper - DETECTION OF FALSE INVESTMENT STRATEGIES USING UNSUPERVISED LEARNING METHODS, de Prado and Lewis (2018) - 23 | is different but is also incorrect ) 24 | https://quant.stackexchange.com/questions/60486/bug-found-in-optimal-number-of-clusters-algorithm-from-de-prado-and-lewis-201 25 | 26 | The divide and conquer method of subspaces used by ONC can be problematic because if you embed a subspace into a space with a large eigen-value. 27 | The larger space can distort the clusters found in the subspace. ONC does precisely that - it embeds subspaces into the space consisting of the largest 28 | eigenvalues found in the correlation matrix. An outline describing the problem more rigorously can be found here: 29 | https://math.stackexchange.com/questions/4013808/metric-on-clustering-of-correlation-matrix-using-silhouette-score/4050616#4050616 30 | 31 | Other clustering algorithms should be investigated like hierarchical clustering. 32 | 33 | ## Chapter 2 Denoising and Detoning 34 | 35 | Marcenko-Pasture theoretical probability density function, and empirical density function: 36 | |  | 37 | |:--:| 38 | | *Figure 2.1:Marcenko-Pasture theoretical probability density function, and empirical density function:* | 39 | 40 | 41 | Denoising a random matrix with signal using the constant residual eigenvalue method. This is done by fixing random eigenvalues. See code snippet 2.5 42 | |  | 43 | |:--:| 44 | | *Figure 2.2: A comparison of eigenvalues before and after applying the residual eigenvalue method:* | 45 | 46 | Detoned covariance matrix can be used to calculate minimum variance portfolio. The efficient frontier is the upper portion of the minimum variance frontier starting at the minimum variance portfolio. A denoised covariance matrix is less unstable to change. 47 | 48 | Note: Excersize 2.7: "Extend function fitKDE in code snippet 2.2, so that it estimates through 49 | cross-validation the optimal value of bWidth (bandwidth)". 50 | 51 | The script ch2_fitKDE_find_bandwidth.py implements this procedure and produces the (green) KDE in figure 2.3: 52 | |  | 53 | |:--:| 54 | | *Figure 2.3: Calculated bandwidth(green line) together with histogram, and pdf. The green line is smoother. Bandwidth found: 0.03511191734215131* | 55 | 56 | From code snippet 2.3 - with random matrix with signal: the histogram is how the eigenvalues of a random matrix with signal is distributed. 
Then the variance of the theoretical probability density function is calculated using $fitKDE$ as the empirical probability density function, so finding a good value for the bandwidth in fitKDE is needed to find the likeliest variance of the theoretical mp-pdf. 57 | |  | 58 | |:--:| 59 | | *Figure 2.4: histogram and pdf of eigenvalues with signal* | 60 | 61 | 62 | 63 | ## Chapter 3 Distance Metrics 64 | 65 | * definition of a metric: 66 | 1. identity of indiscernibles: d(x,y) = 0 <=> x = y 67 | 2. symmetry: d(x,y) = d(y,x) 68 | 3. triangle inequality: d(x,z) <= d(x,y) + d(y,z) 69 | - 1, 2, 3 => non-negativity: d(x,y) >= 0 70 | * Pearson correlation 71 | * distance correlation 72 | * angular distance 73 | * Information-theoretic codependence/entropy dependence 74 | - entropy: H[X] = -Σ_{x ∈ S_X} p[x] log(p[x]) 75 | - Kullback-Leibler divergence: D_KL[p||q] = -Σ_{x ∈ S_X} p[x] log(q[x]/p[x]) = Σ_{x ∈ S_X} p[x] log(p[x]/q[x]) 76 | - Cross-entropy: H_C[p||q] = H[X] + D_KL[p||q] 77 | - Mutual information: the decrease in uncertainty in X from knowing Y: I[X,Y] = H[X] - H[X|Y] = H[X] + H[Y] - H[X,Y] = E_X[D_KL[p[y|x] || p[y]]] 78 | - Variation of information: VI[X,Y] = H[X|Y] + H[Y|X] = H[X,Y] - I[X,Y]. It is the uncertainty we expect in one variable given knowledge of the other: VI[X,Y] = 0 <=> X = Y 79 | - Kullback-Leibler divergence is not a metric, while variation of information is. 80 | 81 | 82 | ``` 83 | >>> ss.entropy([1./2,1./2], base=2) 84 | 1.0 85 | >>> ss.entropy([1,0], base=2) 86 | 0.0 87 | >>> ss.entropy([1./3,2./3], base=2) 88 | 0.9182958340544894 89 | ``` 90 | 1. 1 bit of information in a coin toss 91 | 2. 0 bits of information in a deterministic outcome 92 | 3. less than 1 bit of information in an unfair coin toss 93 | 94 | 95 | * Angular distance: d_rho = sqrt(1/2 * (1 - rho(X,Y))) 96 | * Absolute angular distance: d_|rho| = sqrt(1 - |rho(X,Y)|) 97 | * Squared angular distance: d_rho^2 = sqrt(1 - rho(X,Y)^2) 98 | 99 |   100 | Standard angular distance is better suited for long-only portfolio applications; squared and absolute angular distances suit long-short portfolios. 101 | 
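As a minimal sketch (not one of the book's snippets; the two return series below are made-up numbers), these distances can be computed directly from a Pearson correlation:

```
import numpy as np

# hypothetical return series, for illustration only
x = np.array([0.010, -0.020, 0.015, 0.030, -0.010])
y = np.array([0.012, -0.018, 0.010, 0.025, -0.008])
rho = np.corrcoef(x, y)[0, 1]          # Pearson correlation

d_ang = np.sqrt(0.5 * (1 - rho))       # angular distance, in [0, 1]
d_abs = np.sqrt(1 - abs(rho))          # absolute angular distance
d_sq  = np.sqrt(1 - rho ** 2)          # squared angular distance
```

Under the standard metric, perfectly correlated series are at distance 0 and perfectly anti-correlated series are at distance 1; the absolute and squared variants also treat strongly negatively correlated series as close, which is why they suit long-short portfolios.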
102 | ## Chapter 4 Optimal Clustering 103 | 104 | Use unsupervised learning to maximize intragroup similarities and minimize intergroup similarities. Consider a matrix X of shape N x F: N objects and F features. The features are used to compute a proximity measure (correlation, mutual information) between the N objects, collected in an N x N matrix. 105 | 106 | Clustering algorithms fall into two broad types, partitional and hierarchical, and can be grouped into the following categories: 107 | 1. Connectivity: hierarchical clustering 108 | 2. Centroids: like k-means 109 | 3. Distribution: Gaussians 110 | 4. Density: search for connected dense regions, like DBSCAN, OPTICS 111 | 5. Subspace: modeled on two dimensions, features and observations. [Example](https://quantdare.com/biclustering-time-series/) 112 | 113 | 114 | Generation of random block correlation matrices is used to simulate instruments with correlation. The utility for doing this is in code snippet 4.3, and it uses the optimal number of clusters (ONC) algorithm defined in snippets 4.1 and 4.2, which does not need a predefined number of clusters (unlike k-means) but uses an 'elbow method' to stop adding clusters. The optimal number of clusters is achieved when there is high intra-cluster correlation and low inter-cluster correlation. The [silhouette score](https://en.wikipedia.org/wiki/Silhouette_(clustering)) is used to minimize within-group distance and maximize between-group distance. 115 | |  | 116 | |:--:| 117 | | *Random block correlation matrix. Light colors indicate a high correlation, and dark colors indicate a low correlation. In this example, the number of blocks K=6, minBlockSize=2, and the number of instruments N=30* | 118 | |  | 119 | | *Applying the ONC algorithm to the random block correlation matrix. ONC finds all the clusters.* | 120 | 121 | ## Chapter 5 Financial Labels 122 | 123 | * Fixed-Horizon method 124 | * Time-bar method 125 | * Volume-bar method 126 | 127 | Triple-Barrier Method involves holding a position until 128 | 1. an unrealized profit target is achieved, 129 | 2. an unrealized loss limit is reached, or 130 | 3. the position is held beyond a maximum number of bars. 131 | 132 | Trend-scanning method: the idea is to identify trends and let them run for as long and as far as they may persist, without setting any barriers. 133 | 134 | |  | 135 | |:--:| 136 | | *Example of trend-scanning labels on a sine wave with Gaussian noise* | 137 | 138 | |  | 139 | |:--:| 140 | | *Trend-scanning with t-values, which show confidence in the trend: 1 is high confidence going up and -1 is high confidence going down.* | 141 | 142 | An alternative to the look-forward algorithm presented in the book is to look backward from the latest data point over the window sizes. E.g., if the latest data point is at index 20 and the window size is between 3 and 10 days, the look-backward algorithm scans the windows from index 17-20 all the way back to index 11-20, hence only considering the most recent information. 143 | 144 | 145 | |  | 146 | |:--:| 147 | | *Trend-scanning with t-values using the look-backward algorithm* | 148 | 149 | ## Chapter 6 Feature Importance Analysis 150 | 151 | "p-value does not measure the probability that neither the null nor the alternative hypothesis is true, or the significance of a result." 152 | |  | 153 | |:--:| 154 | | *p-Values computed on a set of informative, redundant, and noisy explanatory variables. The informative explanatory variables do not necessarily have the lowest p-values.* | 155 | 156 | "Backtesting is not a research tool. Feature importance is." (López de Prado) The Mean Decrease Impurity (MDI) algorithm deals with 3 out of 4 problems with p-values: 157 | 1. MDI does not impose any tree structure or algebraic specification, and does not rely on any stochastic or distributional characteristics of the residuals (e.g. y = b0 + b1*x_i + ε). 158 | 2. Whereas betas are estimated from a single sample, MDI relies on bootstrapping, so the variance can be reduced by increasing the number of trees in the random forest ensemble. 159 | 3. In MDI the goal is not to estimate a coefficient of a given algebraic equation (b_hat_0, b_hat_1) or the probability of a null hypothesis. 160 | 4. MDI does not correct for being computed in-sample, as there is no cross-validation (this is the one problem it does not address). 161 | 162 | |  | 163 | |:--:| 164 | | *MDI algorithm example* | 165 | 166 | Figure 6.4 shows that ONC correctly recognizes that there are six relevant clusters (one cluster for each informative feature, plus one cluster of noise features), and it assigns the redundant features to the cluster that contains the informative feature from which the redundant features were derived. Given the low correlation across clusters, there is no need to replace the features with their residuals. 167 | |  | 168 | |:--:| 169 | 170 | Next, apply the clustered MDI method to the clustered data: 171 | |  | 172 | |:--:| 173 | | *Figure 6.5 Clustered MDI* | 174 | 175 | Clustered MDI works better than non-clustered MDI. Finally, apply the clustered MDA method to this data: 176 | |  | 177 | |:--:| 178 | | *Figure 6.6 Clustered MDA* | 179 | 180 | Conclusion: C_5, which is associated with the noisy features, is not important, and all other clusters have similar importance. 181 | 182 | ## Chapter 7 Portfolio Construction 183 | 184 | Convex portfolio optimization can calculate the minimum variance portfolio and the maximum Sharpe ratio portfolio. 185 | 186 | Definition condition number: the absolute value of the ratio between the maximum and minimum eigenvalues, |λ_max / λ_min|. The condition number measures the instability caused by the covariance structure. 187 | Definition trace: tr(A) = sum(diag(A)) - it is the sum of the diagonal elements. 188 | 189 | Highly correlated time series imply a high condition number of the correlation matrix. 190 | 191 | ### Markowitz's curse 192 | The correlation matrix C is stable only when the correlation $\rho = 0$ - when there is no correlation. 193 | 194 | Hierarchical risk parity (HRP) outperforms Markowitz in out-of-sample Monte Carlo experiments, but is sub-optimal in-sample. 195 | 
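For a quick sanity check of Markowitz's curse (this is not one of the book's snippets): for a single 2x2 block with intrablock correlation rho, the eigenvalues of the correlation matrix are 1+rho and 1-rho, so the condition number is (1+rho)/(1-rho), which explodes as rho approaches 1:

```
import numpy as np

rho = 0.95
block = np.array([[1.0, rho],
                  [rho, 1.0]])
eVal, _ = np.linalg.eigh(block)
print(max(eVal) / min(eVal))  # (1+rho)/(1-rho): 39 for rho=0.95, vs. 3.0 for rho=0.5 (snippet 7.1)
```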
196 | Code snippet 7.1 illustrates the signal-induced instability of the correlation matrix. 197 | ``` 198 | >>> corr0 = mc.formBlockMatrix(2, 2, .5) 199 | >>> corr0 200 | array([[1. , 0.5, 0. , 0. ], 201 |        [0.5, 1. , 0. , 0. ], 202 |        [0. , 0. , 1. , 0.5], 203 |        [0. , 0. , 0.5, 1. ]]) 204 | >>> eVal, eVec = np.linalg.eigh(corr0) 205 | >>> print(max(eVal)/min(eVal)) 206 | 3.0 207 | ``` 208 | 209 | |  | 210 | |:--:| 211 | | *Figure 7.1 Heatmap of a block-diagonal correlation matrix* | 212 | 213 | Code snippet 7.2 creates the same block-diagonal matrix but with one dominant block. However, the condition number is the same. 214 | ``` 215 | >>> corr0 = block_diag(mc.formBlockMatrix(1,2, .5)) 216 | >>> corr1 = mc.formBlockMatrix(1,2, .0) 217 | >>> corr0 = block_diag(corr0, corr1) 218 | >>> corr0 219 | array([[1. , 0.5, 0. , 0. ], 220 |        [0.5, 1. , 0. , 0. ], 221 |        [0. , 0. , 1. , 0. ], 222 |        [0. , 0. , 0. , 1. ]]) 223 | >>> eVal, eVec = np.linalg.eigh(corr0) 224 | >>> matrix_condition_number = max(eVal)/min(eVal) 225 | >>> print(matrix_condition_number) 226 | 3.0 227 | ``` 228 | This demonstrates that bringing down the intrablock correlation in only one of the two blocks does not reduce the condition number. The instability in Markowitz's solution can be traced back to the dominant blocks. 229 | |  | 230 | |:--:| 231 | | *Figure 7.2 Heatmap of a dominant block-diagonal correlation matrix* | 232 | 233 | ### The Nested Clustered Optimization Algorithm (NCO) 234 | 235 | NCO provides a strategy for addressing the effect of Markowitz's curse on an existing mean-variance allocation method: 236 | 1. Step: cluster the correlation matrix. 237 | 2. Step: compute optimal intracluster allocations, using the denoised covariance matrix. 238 | 3. Step: compute optimal intercluster allocations, using the reduced covariance matrix, which is close to a diagonal matrix, so the optimization problem is close to the ideal 239 | Markowitz case of $\rho = 0$. 240 | 241 | ## Chapter 8 Testing set overfitting 242 | 243 | Backtesting is a historical simulation of how an investment strategy would have performed in the past. Backtesting suffers from selection bias under multiple testing, as researchers run millions of tests on historical data and present only the best ones (overfitted). This chapter studies how to measure the effect of selection bias. 
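The central quantity in this chapter is the expected maximum Sharpe ratio across many unskilled trials, implemented as getExpectedMaxSR in ch8_testing_set_overfitting.py above. A minimal standalone sketch of that formula (note that the Euler-Mascheroni constant is approximately 0.5772):

```
import numpy as np
from scipy.stats import norm

def expected_max_sharpe(n_trials, mean_sr=0.0, std_sr=1.0):
    # E[max{SR}] across n_trials unskilled trials (False Strategy Theorem)
    emc = 0.5772156649015329  # Euler-Mascheroni constant
    sr0 = (1 - emc) * norm.ppf(1 - 1.0 / n_trials) + emc * norm.ppf(1 - 1.0 / (n_trials * np.e))
    return mean_sr + std_sr * sr0

print(expected_max_sharpe(1000))  # roughly 3.25 for 1,000 trials with true SR = 0
```

This is consistent with the Monte Carlo estimate E[max{SR}] of about 3.26 reported in the Experimental results section below: with enough trials, an apparently excellent strategy emerges even when every individual trial has a true Sharpe ratio of zero.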
244 | 245 | ### Precision and recall 246 | ### Precision and recall under multiple testing 247 | ### The sharpe ratio 248 | 249 | Sharpe Ratio = μ/σ 250 | 251 | ### The 'False Strategy' theorem 252 | 253 | A researcher may run many historical simulations and report only the best one (max sharp ratio). 254 | The distribution of max sharpe ratio is not the same as the expected sharpe ratio. Hence selection bias under multiple replications (SBuMT). 255 | 256 | ### Experimental results 257 | 258 | A monte carlo experiment shows that the distribution of the max sharp ratio increases (E[max(sharp_ratio)] = 3.26) even 259 | when the expected sharp ratio is 0 (E[sharp_ratio]). So an investment strategy will seem promising even when there are no good strategy. 260 | 261 | When more than one trial takes place, the expected value of the maximum Sharpe Ratio is greater than the expected value 262 | of the Sharpe Ratio, from a random trial (when true Sharpe Ratio=0 and variance > 0). 263 | |  | 264 | |:--:| 265 | | *Figure 8.1 Comparison of experimental and theoretical results from False Strategy Theorem* | 266 | 267 | ### The Deflated Sharpe Ratio 268 | The main conclusion from the False Strategy Theorem is that, unless $maxk{SR^k}>>E[maxk{SR^k}], 269 | the discovered strategy is likely to be false positive. 270 | 271 | ### Type II errors under multiple testing 272 | ### The interaction between type I and type II errors 273 | 274 | 275 | ## Appendix A: Testing on Synthetic data 276 | 277 | Either from resampling or monte carlo 278 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .Machine_Learning_for_Asset_Managers import * -------------------------------------------------------------------------------- /img/fig_2_3_mp_with_signal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_2_3_mp_with_signal.png -------------------------------------------------------------------------------- /img/fig_3_1_abs_squared_angular_distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_3_1_abs_squared_angular_distance.png -------------------------------------------------------------------------------- /img/fig_3_1_angular_distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_3_1_angular_distance.png -------------------------------------------------------------------------------- /img/fig_4_1_random_block_correlation_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_4_1_random_block_correlation_matrix.png -------------------------------------------------------------------------------- /img/fig_4_1_random_block_correlation_matrix_mini.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_4_1_random_block_correlation_matrix_mini.png -------------------------------------------------------------------------------- /img/fig_4_1_random_block_correlation_matrix_onc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_4_1_random_block_correlation_matrix_onc.png -------------------------------------------------------------------------------- /img/fig_4_1_random_block_correlation_matrix_onc_mini.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_4_1_random_block_correlation_matrix_onc_mini.png -------------------------------------------------------------------------------- /img/fig_5_1_trend_scanning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_5_1_trend_scanning.png -------------------------------------------------------------------------------- /img/fig_5_2_trend_scanning_t_values.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_5_2_trend_scanning_t_values.png -------------------------------------------------------------------------------- /img/fig_5_2_trend_scanning_t_values2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_5_2_trend_scanning_t_values2.png -------------------------------------------------------------------------------- /img/fig_5_3_distribution_t_values.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_5_3_distribution_t_values.png -------------------------------------------------------------------------------- /img/fig_5_3_distribution_t_values_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_5_3_distribution_t_values_2.png -------------------------------------------------------------------------------- /img/fig_6_1_p_values_explanatory_vars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_6_1_p_values_explanatory_vars.png -------------------------------------------------------------------------------- /img/fig_6_2_mdi_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_6_2_mdi_example.png -------------------------------------------------------------------------------- /img/fig_6_3_mda_example.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_6_3_mda_example.png -------------------------------------------------------------------------------- /img/fig_6_4_feature_clustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_6_4_feature_clustering.png -------------------------------------------------------------------------------- /img/fig_6_5_clustered_MDI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_6_5_clustered_MDI.png -------------------------------------------------------------------------------- /img/fig_6_6_clustered_MDA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_6_6_clustered_MDA.png -------------------------------------------------------------------------------- /img/fig_7_1_block_diagonal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_7_1_block_diagonal.png -------------------------------------------------------------------------------- /img/fig_7_2_block_diagonal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/fig_7_2_block_diagonal.png -------------------------------------------------------------------------------- /img/figure_2_3_eigenvalue_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/figure_2_3_eigenvalue_method.png -------------------------------------------------------------------------------- /img/figure_2_3_eigenvalue_method_with_denoise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/figure_2_3_eigenvalue_method_with_denoise.png -------------------------------------------------------------------------------- /img/figure_2_3_eigenvalue_method_with_denoise_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/figure_2_3_eigenvalue_method_with_denoise_zoomed.png -------------------------------------------------------------------------------- /img/gaussian_mp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/gaussian_mp.png -------------------------------------------------------------------------------- /img/gaussian_mp_excersize_2_7.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/gaussian_mp_excersize_2_7.png -------------------------------------------------------------------------------- /img/maxSR_across_uniform_strategies_8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emoen/Machine-Learning-for-Asset-Managers/5292b3c4cf61e6f41c71dc1da66681ecc366cc41/img/maxSR_across_uniform_strategies_8_1.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | contourpy==1.3.1 2 | cycler==0.12.1 3 | fonttools==4.55.7 4 | joblib==1.4.2 5 | kiwisolver==1.4.8 6 | matplotlib==3.10.0 7 | numpy==2.2.2 8 | packaging==24.2 9 | pandas==2.2.3 10 | patsy==1.0.1 11 | pillow==11.1.0 12 | pyparsing==3.2.1 13 | python-dateutil==2.9.0.post0 14 | pytz==2024.2 15 | scikit-learn==1.6.1 16 | scipy==1.15.1 17 | seaborn==0.13.2 18 | six==1.17.0 19 | statsmodels==0.14.4 20 | threadpoolctl==3.5.0 21 | tzdata==2025.1 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='Machine_Learning_for_Asset_Managers', 5 | packages=find_packages(), 6 | version='1.0.0', 7 | description='Implementation of code snippets found in Machine-Learning-for-Asset-Managers by Marcos M. Lòpez de Prado', 8 | author='Endre Moen', 9 | license='MIT', 10 | ) --------------------------------------------------------------------------------