├── Chapter 02 - Financial Data Structures.ipynb ├── Chapter 03 - Meta-Labeling.ipynb ├── Chapter 04 - Sample Weights.ipynb ├── Chapter 05 - Fractionally Differentiated Features.ipynb ├── Chapter 06 - Ensemble Methods.ipynb ├── Chapter 07 - Cross-Validation in Finance.ipynb ├── Chapter 08 - Feature Importance.ipynb ├── Chapter 09 - Hyper-Parameter Tuning with Cross-Validation.ipynb ├── Chapter 10 - Bet Sizing.ipynb ├── Chapter 11 - The Dangers of Backtesting.ipynb ├── Chapter 12 - Backtesting through Cross-Validation.ipynb ├── Chapter 13 - Backtesting on Synthetic Data.ipynb ├── Chapter 14 - Backtest Statistics.ipynb ├── Chapter 15 - Understanding Strategy Risk.ipynb ├── Chapter 16 - Machine Learning Asset Allocation.ipynb ├── README.md ├── active_signals.py ├── cla_mlf.py ├── cv.py ├── feature_imp.py ├── feature_importances_mp.py ├── filters.py ├── hrp_mlf.py ├── images ├── mda_feat_imp.png ├── mda_feat_imp_8.1.png ├── mda_feat_imp_8.1c.png ├── mda_feat_imp_8.2a.png ├── mda_feat_imp_8.3b.png ├── mda_feat_imp_8.4c.png ├── mda_feat_imp_8.4c2.png ├── mda_feat_imp_8.4c_10chunks.png ├── mda_feat_imp_8.4c_1chunk.png ├── mda_feat_imp_8.5_1.png ├── mda_feat_imp_8.5_2.png ├── mda_feat_imp_8.5_3.png ├── mda_feat_imp_8.5_4.png ├── mda_feat_imp_8.5_5.png ├── mdi_feat_imp.png ├── mdi_feat_imp_8.1.png ├── mdi_feat_imp_8.1c.png ├── mdi_feat_imp_8.2a.png ├── mdi_feat_imp_8.3b.png ├── mdi_feat_imp_8.4c.png ├── mdi_feat_imp_8.4c2.png ├── mdi_feat_imp_8.4c_10chunks.png ├── mdi_feat_imp_8.4c_1chunk.png ├── mdi_feat_imp_8.5_1.png ├── mdi_feat_imp_8.5_2.png ├── mdi_feat_imp_8.5_3.png ├── mdi_feat_imp_8.5_4.png ├── mdi_feat_imp_8.5_5.png ├── sfi_feat_imp.png ├── sfi_feat_imp_8.1.png ├── sfi_feat_imp_8.1c.png ├── sfi_feat_imp_8.2a.png ├── sfi_feat_imp_8.3b.png ├── sfi_feat_imp_8.4c.png ├── sfi_feat_imp_8.4c2.png ├── sfi_feat_imp_8.4c_10chunks.png ├── sfi_feat_imp_8.4c_1chunk.png ├── sfi_feat_imp_8.5_1.png ├── sfi_feat_imp_8.5_2.png ├── sfi_feat_imp_8.5_3.png ├── sfi_feat_imp_8.5_4.png └── sfi_feat_imp_8.5_5.png ├── img ├── MDA_feat_imp_8.1c.png ├── MDA_feat_imp_8.1c2.png ├── MDA_feat_imp_8.2a.png ├── MDA_feat_imp_8.3b.png ├── MDA_feat_imp_8.4c_10chunks.png ├── MDA_feat_imp_8.4c_1chunk.png ├── MDI_feat_imp_8.1c.png ├── MDI_feat_imp_8.1c2.png ├── MDI_feat_imp_8.2a.png ├── MDI_feat_imp_8.3b.png ├── MDI_feat_imp_8.4c_10chunks.png ├── MDI_feat_imp_8.4c_1chunk.png ├── MDI_feat_imp_8.5_1.png ├── MDI_feat_imp_8.5_1_2.png ├── MDI_feat_imp_8.5_2.png ├── MDI_feat_imp_8.5_3.png ├── MDI_feat_imp_8.5_4.png ├── MDI_feat_imp_8.5_5.png ├── SFI_feat_imp_8.1c.png ├── SFI_feat_imp_8.1c2.png ├── SFI_feat_imp_8.2a.png ├── SFI_feat_imp_8.3b.png ├── SFI_feat_imp_8.4c_10chunks.png └── SFI_feat_imp_8.4c_1chunk.png ├── labeling.py ├── load_data.py ├── load_data_orig.py ├── mean_variance_mlf.py ├── multiprocess.py ├── sampling.py ├── stats.py ├── synthetic_data.py ├── testFunc ├── 81c_mda.png ├── 81c_mdi.png ├── 81c_sfi.png ├── 82b_mda.png ├── 82b_mdi.png ├── 82b_sfi.png ├── 83b_mda.png ├── 83b_mdi.png ├── 83b_sfi.png ├── 84d_mdi_10chunks.png ├── 84d_mdi_1chunk.png ├── 85_1.png ├── 85_2.png ├── 85_3.png ├── 85_4.png ├── 85_5.png ├── stats.csv └── trnsX.csv └── util.py /Chapter 06 - Ensemble Methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 6.1\n", 8 | "\n", 9 | "Why is bagging based on random sampling with replacement? 
Would bagging still reduce a forecast's variance if sampling were without replacement?" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "**A: So every training set winds up with a disparate set of data, yes this is a technique called \"pasting\".**" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# 6.2a\n", 24 | "\n", 25 | "Suppose that your training set is based on highly overlap labels (i.e. with low uniqueness, as defined by Chapter 4)\n", 26 | "\n", 27 | "Does this make bagging prone to overfitting, or just ineffective? Why?" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "**A: Bagging helps with overfitting, but the selected training sets will likely still be very similar, thus not helping improve accuracy much.**" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "# 6.2b\n", 42 | "\n", 43 | "Suppose that your training set is based on highly overlap labels (i.e. with low uniqueness, as defined by Chapter 4)\n", 44 | "\n", 45 | "Is out-of-bag accuracy generally reliable in financial applications? Why?" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "**A: High levels of serial correlation in financial datasets mean that the further apart your training from your validation data the better -- which is why K-Fold CV is generally preferable.**" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# 6.3a\n", 60 | "\n", 61 | "Build an ensemble of estimators, where the base estimator is a decision tree.\n", 62 | "\n", 63 | "How is this ensemble different from a RF?" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 27, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "OOB score: 0.92\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "from sklearn.datasets import make_classification\n", 81 | "from sklearn.tree import DecisionTreeClassifier\n", 82 | "from sklearn.ensemble import BaggingClassifier\n", 83 | "\n", 84 | "\n", 85 | "X, y = make_classification(random_state=1)\n", 86 | "bc = BaggingClassifier(DecisionTreeClassifier(), oob_score=True, random_state=1)\n", 87 | "bc.fit(X, y)\n", 88 | "print(\"OOB score:\", bc.oob_score_)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "**A: They are indeed very similar, however the RF uses a random subset of features to generate splits.**" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# 6.3b\n", 103 | "\n", 104 | "Build an ensemble of estimators, where the base estimator is a decision tree.\n", 105 | "\n", 106 | "Using sklearn, produce a bagging classifier that behaves like an RF. What parameters did you have to set up, and how?" 
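A side note on 6.3b: the answer cell that follows sets `splitter='random'` on the base tree, which is closer in spirit to Extremely Randomized Trees. A more literal RF clone restricts the features considered at every split through the base tree's `max_features` (the bagger's own `max_features` subsamples features once per estimator, not per split). The sketch below reuses the same `make_classification` setup as above; parameter values are illustrative only.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(random_state=1)

# RF = bootstrapped rows per tree + a random subset of features at every split.
# The per-split feature sampling belongs to the tree (max_features='sqrt');
# BaggingClassifier supplies the bootstrap over the rows.
rf_like = BaggingClassifier(
    DecisionTreeClassifier(max_features='sqrt'),
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
rf_like.fit(X, y)

rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
rf.fit(X, y)

print('bagging-as-RF OOB:', rf_like.oob_score_)
print('RandomForest  OOB:', rf.oob_score_)
```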
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 34, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "OOB score: 0.95\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "X, y = make_classification(random_state=1)\n", 124 | "bc = BaggingClassifier(DecisionTreeClassifier(splitter='random'), oob_score=True, random_state=1)\n", 125 | "bc.fit(X, y)\n", 126 | "print(\"OOB score:\", bc.oob_score_)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "# 6.4a\n", 134 | "\n", 135 | "Consider the relation between an RF, the number of trees it is composed of, and the number of features utilized:\n", 136 | "\n", 137 | "Could you envision a relation between the minimum number of trees needed in an RF and the number of features utilized?" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "**A: If the number of trees is much smaller than the number of features, there might be features that are ignored via the random sub-sampling.**" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# 6.4b\n", 152 | "\n", 153 | "Consider the relation between an RF, the number of trees it is composed of, and the number of features utilized:\n", 154 | "\n", 155 | "Could the number of trees be too small for the number of features used?" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "**A: See above.**" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# 6.4c\n", 170 | "\n", 171 | "Consider the relation between an RF, the number of trees it is composed of, and the number of features utilized:\n", 172 | "\n", 173 | "Could the number of trees be too high for the number of observations available?\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "**A: Generally much too high numbers of decision trees will only materially increase processing time rather than accuracy.**" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "# 6.5\n", 188 | "\n", 189 | "Consider the relation between an RF, the number of trees it is composed of, and the number of features utilized:\n", 190 | "\n", 191 | "How is out-of-bag accuracy different from statified k-fold (with shuffling) cross-validation accuracy?" 
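For 6.5, a quick self-contained way to put the two estimates side by side is sketched below (synthetic IID data, so the numbers land close together; the chapter's point is that with overlapping, serially correlated financial labels both estimates become optimistic, which is why the purged CV of Chapter 7 is preferred). The snippet is illustrative and not part of the repo.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

X, y = make_classification(n_samples=2000, random_state=0)

rf = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=0)
rf.fit(X, y)

cv_acc = cross_val_score(
    rf, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
).mean()

print('OOB accuracy:                  {:.3f}'.format(rf.oob_score_))
print('stratified shuffled 5-fold CV: {:.3f}'.format(cv_acc))
```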
192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "**A:Same answer as above: High levels of serial correlation in financial datasets mean that the further apart your training from your validation data the better -- which is why K-Fold CV is generally preferable.**" 199 | ] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.5.6" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 2 223 | } 224 | -------------------------------------------------------------------------------- /Chapter 07 - Cross-Validation in Finance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 73, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "import matplotlib.pyplot as mpl\n", 13 | "\n", 14 | "from collections import defaultdict\n", 15 | "from functools import reduce\n", 16 | "from path import Path\n", 17 | "from pprint import pprint\n", 18 | "import seaborn as sns\n", 19 | "\n", 20 | "%matplotlib inline\n", 21 | "mpl.style.use('ggplot')\n", 22 | "mpl.rcParams['figure.figsize'] = 16,6" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# 7.1\n", 30 | "\n", 31 | "Why is shuffling a dataset before conducting a k-fold CV generall a bad idea in finance? What is the purpose of shuffling? Why does shuffling defeat the purpose of k-fold CV in financial datasets?" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "**A: Markets are adaptive systems and thus sample order is meaningful. Generally we would shuffle data because we don't want the model to make predictions based on the order of the data.**" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from sampling import dollar_bars\n", 48 | "from filters import cusum\n", 49 | "from multiprocess import mpPandasObj\n", 50 | "from load_data import load_contracts\n", 51 | "from labeling import getEvents, getVerticalBarriers, getBins\n", 52 | "from util import getDailyVol" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "es_contracts = load_contracts('@ES')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# 7.2a\n", 69 | "\n", 70 | "Take a pair of matrices (X, y) representing observed features and labels. These could be one of the datasets derived from the exercises in Chapter 3.\n", 71 | "\n", 72 | "Derive the performance from a 10-fold CV of an RF classifier on (X, y), without shuffling." 
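Before the bar-data answer below, here is a self-contained toy illustration of what 7.2a-7.2d are getting at (a synthetic random walk, not the book's dataset; exact numbers will vary). By construction nothing in the features can predict the label, so any score materially above 0.5 under shuffled k-fold is leakage caused by serial correlation between neighbouring observations.

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

rng = np.random.RandomState(0)
n, h = 3000, 20
price = pd.Series(rng.normal(size=n).cumsum())   # pure random walk: nothing is predictable

# Features: today's level and a few recent lags; label: sign of the next h-step move.
X = pd.concat({'lag_{}'.format(k): price.shift(k) for k in range(5)}, axis=1)
fwd = price.shift(-h) - price
valid = X.notna().all(axis=1) & fwd.notna()
X, y = X[valid], (fwd[valid] > 0).astype(int)

clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
no_shuffle = cross_val_score(clf, X, y, cv=KFold(10, shuffle=False)).mean()
shuffled = cross_val_score(clf, X, y, cv=KFold(10, shuffle=True, random_state=0)).mean()

# Unshuffled folds stay near the honest 0.5; shuffled folds typically come out above it,
# because each test observation has near-identical temporal neighbours in the training folds.
print('10-fold CV, no shuffle: {:.2f}   with shuffle: {:.2f}'.format(no_shuffle, shuffled))
```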
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stderr", 84 | "output_type": "stream", 85 | "text": [ 86 | "C:\\Users\\doda\\Dropbox\\algotrading\\AFML\\labeling.py:7: FutureWarning: \n", 87 | "Passing list-likes to .loc or [] with any missing label will raise\n", 88 | "KeyError in the future, you can use .reindex() as an alternative.\n", 89 | "\n", 90 | "See the documentation here:\n", 91 | "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n", 92 | " trgt = trgt.loc[tEvents]\n" 93 | ] 94 | }, 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 99 | " max_depth=5, max_features='auto', max_leaf_nodes=None,\n", 100 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 101 | " min_samples_leaf=1, min_samples_split=2,\n", 102 | " min_weight_fraction_leaf=0.0, n_estimators=512, n_jobs=None,\n", 103 | " oob_score=True, random_state=42, verbose=0, warm_start=False)" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "dbars = dollar_bars(es_contracts, 100000000)\n", 113 | "\n", 114 | "df = dbars.copy()['2018-8-1':'2019-5-1']\n", 115 | "close = df['Close']\n", 116 | "daily_vol = getDailyVol(close)\n", 117 | "tEvents = cusum(close, getDailyVol(close).mean())\n", 118 | "t1 = getVerticalBarriers(close, tEvents, numDays=1)\n", 119 | "events = getEvents(close, tEvents=tEvents, ptSl=[1,1], t1=t1, trgt=daily_vol, minRet=0.01)\n", 120 | "\n", 121 | "close = df['Close']\n", 122 | "fast_ma = close.rolling(50).mean()\n", 123 | "slow_ma = close.rolling(200).mean()\n", 124 | "\n", 125 | "long_signals = (fast_ma >= slow_ma)\n", 126 | "short_signals = (fast_ma < slow_ma)\n", 127 | "\n", 128 | "df.loc[long_signals, 'side'] = 1\n", 129 | "df.loc[short_signals, 'side'] = -1\n", 130 | "events['side'] = df['side']\n", 131 | "\n", 132 | "bins = getBins(events, df['Close'])\n", 133 | "bins['bin'].value_counts()\n", 134 | "\n", 135 | "df['log_ret'] = np.log(close).diff()\n", 136 | "df['vol5'] = df['log_ret'].rolling(5).std()\n", 137 | "df['vol10'] = df['log_ret'].rolling(10).std()\n", 138 | "\n", 139 | "df['serialcorr20-1'] = df['log_ret'].rolling(20).apply(lambda x: pd.Series(x).autocorr(lag=1))\n", 140 | "\n", 141 | "\n", 142 | "df = df.shift()\n", 143 | "\n", 144 | "from sklearn.ensemble import RandomForestClassifier\n", 145 | "\n", 146 | "train_features = df.loc[events.index][['side', 'vol5', 'serialcorr20-1']]\n", 147 | "train_labels = bins['bin']\n", 148 | "\n", 149 | "rf = RandomForestClassifier(n_estimators=512, random_state=42, max_depth=5, oob_score=True)\n", 150 | "rf.fit(train_features, train_labels)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "from sklearn.model_selection import cross_validate, KFold\n", 160 | "\n", 161 | "cvd = cross_validate(rf, train_features, train_labels, cv=KFold(10))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "0.6983229813664596" 173 | ] 174 | }, 175 | "execution_count": 7, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "cvd['test_score'].mean()" 182 | ] 183 | }, 184 | { 
185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# 7.2b\n", 189 | "\n", 190 | "Take a pair of matrices (X, y) representing observed features and labels. These could be one of the datasets derived from the exercises in Chapter 3.\n", 191 | "\n", 192 | "Derive the performance from a 10-fold CV of an RF classifier on (X, y), with shuffling." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "cvd = cross_validate(rf, train_features, train_labels, cv=KFold(10, shuffle=True))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 9, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "0.7011180124223603" 213 | ] 214 | }, 215 | "execution_count": 9, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "cvd['test_score'].mean()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "# 7.2c\n", 229 | "\n", 230 | "Take a pair of matrices (X, y) representing observed features and labels. These could be one of the datasets derived from the exercises in Chapter 3.\n", 231 | "\n", 232 | "Why are both results so different?" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "**A: Shuffling introduces information from the future into our training set.**" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# 7.2d\n", 247 | "\n", 248 | "Take a pair of matrices (X, y) representing observed features and labels. These could be one of the datasets derived from the exercises in Chapter 3.\n", 249 | "\n", 250 | "How does shuffling leak information?" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "**A: The model is able to train on data from the future.**" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "# 7.3a\n", 265 | "\n", 266 | "Take the same pair of matrices (X, y) you used in exercise 2.\n", 267 | "\n", 268 | "Derive the performance from a 10-fold purged CV of an RF on (X, y), with 1% embargo." 
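The next cell delegates to `cvScore` from the repo's local `cv.py`. For readers without that module, the sketch below shows the two ingredients it adds on top of plain k-fold: purging training observations whose labels overlap the test window, and an embargo immediately after the test block. It is deliberately simple (and slow), assumes `t1` is sorted chronologically, and is only meant to show the logic; the exact behaviour of `cv.py` may differ.

```python
import numpy as np
import pandas as pd

def purged_kfold_indices(t1, n_splits=10, pct_embargo=0.01):
    """Yield (train, test) positional indices.

    t1 : pd.Series mapping each observation's start time (the index) to the time
         its label is resolved (the value), as built in the Chapter 3 exercises.
    """
    positions = np.arange(t1.shape[0])
    embargo = int(t1.shape[0] * pct_embargo)
    for test in np.array_split(positions, n_splits):
        test_start = t1.index[test[0]]
        test_end = t1.iloc[test].max()
        train = []
        for i in positions:
            if test[0] <= i <= test[-1]:
                continue                                    # the test block itself
            overlaps = (t1.index[i] <= test_end) and (t1.iloc[i] >= test_start)
            if overlaps:
                continue                                    # purge: label spans the test window
            if test[-1] < i <= test[-1] + embargo:
                continue                                    # embargo right after the test block
            train.append(i)
        yield np.array(train), test

# Usage sketch with the objects built earlier in this notebook (names assumed):
# scores = [rf.fit(train_features.iloc[tr], train_labels.iloc[tr])
#             .score(train_features.iloc[te], train_labels.iloc[te])
#           for tr, te in purged_kfold_indices(t1, n_splits=10, pct_embargo=0.01)]
```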
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 70, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "The mean CV performance is 0.70.\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "from sklearn.model_selection._split import _BaseKFold\n", 286 | "from sklearn.metrics import log_loss, accuracy_score\n", 287 | "from cv import cvScore\n", 288 | "\n", 289 | "cvd = cvScore(rf, train_features, train_labels, pctEmbargo=0.01,\n", 290 | " sample_weight=None, cv=10,\n", 291 | " t1=pd.Series(train_features.index, index=train_features.index),\n", 292 | " scoring='accuracy'\n", 293 | " )\n", 294 | "print(\"The mean CV performance is {:.2f}.\".format(np.mean(cvd)))\n" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 72, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAFpCAYAAABklI6gAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAIABJREFUeJzs3XeYVNXh//H3ubvswrKAwIIIigqCiBVBQVRERcASW2wkGo1RoyYxJvb0X9qXqCTGxBJ7osYSY4kdsWFXBCuKvSAgXTrC3vP7YwyJsdB298zsvl/Psw87zGX2s851Zz57zj0nxBgjkiRJkiQlkKUOIEmSJElquiylkiRJkqRkLKWSJEmSpGQspZIkSZKkZCylkiRJkqRkLKWSJEmSpGQspZIkSZKkZCylkiRJkqRkLKWSJEmSpGQspZIkSZKkZCylkiRJkqRkylN+8SlTpqT88qonNTU1zJw5M3UMNRKeT6pLnk+qS55PqkueT6pLxXI+de7ceZWOc6RUkiRJkpSMpVSSJEmSlIylVJIkSZKUjKVUkiRJkpSMpVSSJEmSlIylVJIkSZKUjKVUkiRJkpSMpVSSJEmSlIylVJIkSZKUjKVUkiRJkpSMpVSSJEmSlIylVJIkSZKUjKVUkiRJkpRMeeoAkiQ1FfnYe1JHUB1aVF1NvmBB6hgrlQ0anjqCJH0pR0olSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyVhKJUmSJEnJWEolSZIkSclYSiVJkiRJyZSv7ICZM2dywQUXMHfuXEIIDBkyhL322utTx7z88sucffbZdOzYEYD+/ftz0EEH1U9iSZIkSVKjsdJSWlZWxhFHHEG3bt1YvHgxZ555JltttRXrr7/+p47bbLPNOPPMM+stqCRJkiSp8Vnp9N22bdvSrVs3AFq0aEGXLl2YPXt2vQeTJEmSJDV+Kx0p/W/Tp0/n7bffZpNNNvnMfa+99hqnnXYabdu25YgjjmCDDTb4zDFjxoxhzJgxAIwcOZKampo1jK1iVl5e7nOrOuP5pLqU+nxaVF2d7Gur7pVlZVSXwHNa5c/QkpD655Mal1I7n0KMMa7KgUuWLOHnP/85Bx54IP379//UfYsWLSLLMpo3b8748eO56qqrOP/881f6mFOmTFmz1CpqNTU1zJw5M3UMNRKeT6pLqc+nfOw9yb626l51dTULFixIHWOlskHDU0fQKkj980mNS7GcT507d16l41Zp9d3ly5czatQodt55588UUoCqqiqaN28OwLbbbkttbS3z5s1bjbiSJEmSpKZopaU0xsjFF19Mly5d2GeffT73mLlz5/LvAdc33niDPM9p1apV3SaVJEmSJDU6K72mdNKkSYwdO5auXbty2mmnATBixIgVw8FDhw7lySefZPTo0ZSVlVFRUcHJJ59MCKF+k0uSJEmSSt5KS2mvXr248cYbv/SY4cOHM3y41ytIkiRJklbPKl1TKkmSJElSfbCUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpKxlEqSJEmSkrGUSpIkSZKSsZRKkiRJkpIpTx1AkiSp1MUY4eOlsGghLF4EixfC8mVQUfnJR3OorFxxO2SOC0jSv1lKJUmSVkPMc5g1g6WTXiS+9zbM/6hQRPPaVXuAEIht20PHzrDuetCxM6F5i/oNLUlFzFIqSZL0JWKMMHc2TJsM0z6AD6fAso/5GKBte+jYCVq0LHxUVf3n82bNCqOn//5YuqTw5+JFMPNDeH0ivPpC4Wu0aQsd14MuG0GXro6kSmpSLKWSJEmfIy5bBu+8Dq+9BLNnFv6yVWvYaBPotD4tu/VgUR6//EFaVH3x49fWwuwZhZI7fQq880ahqLasJvbYHHr0dgRVUpNgKZUkSfovce7sQhF96zVY9nFhNHT7naHLRoTqViuOy6pawoIFa/x1QlkZdOhU+GDbwrTgyW/DpJfguafghWeIG20CPbeEmo6EEOrgu5Ok4mMplSRJTV6MESa/AxOfg+lTIctgw02g5+bQoVODFMKQZdC1O3Tt/l/FeFKhHLfvSNx2B0KnLvWeQ5IamqVUkiQ1aXH6VBj/BM
yYBtWtYdsdoHuvpFNnwzrtYPtBxD4DCqX05Qlw323EDTeBvgMJLauTZZOkumYplSRJTVL8aA5MeBLef7tw7eeAwYUyWkSLDIVmFbDpFsTumxaK6csTYPI7xC37Qu9tClOAJanEWUolSVKTEhcvguefhjdegbJy2Hp72GxrQrNmqaN9oVDeDLbentitFzz7WOGa0zdfJW63E6HLhqnjSdJasZRKkqQmIcYIr71cmKpbW1u4XnTLfoQvWSG32IRWrWHwnsQp78Ezj8IDdxI37gH9Bxd1qZakL2MplSRJjV6cPw+eeBA+/AA6rQ/9BxFar5M61hoLnbsS9zkUXh4PL4yD2TOJg4YVrkWVpBJjKZUkSY1WYXT0pcLoaAiF60Y32axRbK8Syspgq+2IHTvDI6PhrpuIA3YhdNs0dTRJWi2WUkmS1CjF+R99Mjo6BdbbAHYYTGjZauX/sMSETl2Iex8Cj94Hj91P/HAqbL8Tocy3eZJKw0p/Ws2cOZMLLriAuXPnEkJgyJAh7LXXXp86JsbIlVdeyYQJE6isrOTEE0+kW7du9RZakiTpi8QYC4sYPfNoYb/RHXYtrKrbCEZHv0ioakkcsm9hAaeXxsOs6YXpvK3bpI4mSSu10lJaVlbGEUccQbdu3Vi8eDFnnnkmW221Feuvv/6KYyZMmMC0adM4//zzef3117nsssv47W9/W6/BJUmS/ldctgyeHgtvTSpcOzpwtyazp2fIMugzgNihEzx2P9x9E3HXvVb+DyUpsZVuxNW2bdsVo54tWrSgS5cuzJ49+1PHjBs3jkGDBhFCoGfPnixcuJA5c+bUT2JJkqTPET+aA3f/s1BIt9oOdt+nyRTS/xbW3wj2Ohgqm8OY24kvjksdSZK+1GrtDj19+nTefvttNtlkk0/9/ezZs6mpqVlxu3379p8prpIkSfUlvvM63HUTLFkEu3+FsPV2hZHDJiq0ag3DDoA265Bf8Bvypx5OHUmSvtAqXwG/ZMkSRo0axVFHHUVV1af384oxfub4z7tuY8yYMYwZMwaAkSNHfqrIqvEoLy/3uVWd8XxSXUp9Pi2qbnqjdvUt1i5n6eMPseyl8WTrdqbF0P3IqhtmMaOyrIzqYn5Oq6uJB3ydpU89wrLLRlFFTtXeB6dOpS+Q+ueTGpdSO59WqZQuX76cUaNGsfPOO9O/f//P3N++fXtmzpy54vasWbNo27btZ44bMmQIQ4YMWXH7v/+NGo+amhqfW9UZzyfVpdTnU75gQbKv3RjFxYvgobth5oew2dbk2w5gEQEa6L9zdXU1C0rgOQ0n/gguOZf5l/2BBdOmEvYd0agXfSpVqX8+qXEplvOpc+fOq3TcSue1xBi5+OKL6dKlC/vss8/nHtOvXz/Gjh1LjJHXXnuNqqqqzy2lkiRJdSHOmVmYrjtnFgwaRui3IyErSx2rKIVmFWTHn0HYcQjxjuuJf/8LMa9NHUuSVljpSOmkSZMYO3YsXbt25bTTTgNgxIgRK5r30KFD6dOnD+PHj+ekk06ioqKCE088sX5TS5KkJitOfgceuQ+aVcCwAwjtO6SOVPRCWRkc+T2obk2892bIczj8BEdMJRWFlZbSXr16ceONN37pMSEEjjnmmDoLJUmS9L9ijPDqC/Ds49C2Bnbdk1BVxNd0FpkQAuGgo8izQLz7n9CyJeHAI1PHkqRVX+hIkiQplZjXwtOPwOsToWs3GLg7oVmz1LFKUjjgG7BoIfHuf5K3qCbb86upI0lq4iylkiSpqMWPl8LD98K0ybB5H+gzwGmnayGEAF/7NixeRLz5r+RVLcl2GZ46lqQmzFIqSZKKVly0EB64Az6aAwN3I3TvlTpSoxCyMvjmycTFi4jXXkTeoops+0GpY0lqoprurtKSJKmoxY/mwD03w/x5sOveFtI6FsrLyY4/A3r0Jl7xB+KL41JHktREWUolSVLRiTM/hHtvgeXLYeh+hM4bpI7UKIWKSrLv/hS6bER+0Ujiay+ljiSpCbKUSpKkohI/eA9G31bY8mX4AYT2HVNHatRCiyqyk38B7TuSX/Bb4vQpqSNJamIspZIkqWjEtybBg3dB63Vg+IGE1uukjtQkhFZtyE76GWSB/E+/LlzLK0kNxFIqSZKKQpz4HDx2P6y7Hgzdn9CiKnWkJiV06ER2/FkwYyr5pecQa2tTR5LURFhKJUlSUjFG4vPPwLOPQ9fusNs+hIqK1LGapLDpFoSvHQ8vjSfedFXqOJKaCLeEkSRJycQYYfzjMPF56N4LBgwmZP7OPKVs0DDyKe8Rx9xG3nkDsp2Hpo4kqZGzlEpqcvKx96SOoEQWVVeTL1iQOoY+EWOEp8fCay/DplvCdjsRQkgdS0A4+GjitMnEay8mrtuZ0HOL1JEkNWL+KlKSJDW4mOfw+P2FQrp5HwtpkQllZWTHnQYd1i1sFTNjWupIkhoxS6kkSWpQsbYWHhkNb70G2/QnbLuDhbQIharqwh6meU7+518TlyxOHUlSI2UplSRJDSYuXwYP3Q3vvQX9diRs2Td1JH2JsG5nsm+fDlMnE6+9qDDlWpLqmKVUkiQ1iLjsY3jgTpjyXmFBo822Th1JqyD03obwlcOITz5EfPS+1HEkNUKWUkmSVO/i0iVw379g+jTYaQ9Cj96pI2k1hL0Pht7bEK+7hDj57dRxJDUyllJJklSv4uJFcN9tMGcm7DKMsHGP1JG0mkJWRvatH0LLavKLzyYuWZQ6kqRGxFIqSZLqTVy4AEbfCvM+gt32JmywcepIWkOh9Tpkx54K06cS/3aB15dKqjOWUkmSVC/i/I/g3ltg8SIY8hXCehukjqS1FHpuQdj/68RnHiE+7J7PkuqGpVSSJNW5+NFsuPdWWL4M9tiX0HG91JFUR8Lwr8IWfYk3XEp8983UcSQ1ApZSSZJUp+LsGYVCSoQ99iO075g6kupQyDKyo38ArdYh/8vviIsWpo4kqcRZSiVJUp2J06fC6NugvByGHkBo2z51JNWD0Ko12XGnwqzpxGsvSh1HUomzlEqSpDoRp74PY26HFlUw7ABC6zapI6kehU16E74ygvj0WPKnHk4dR1IJs5RKkqS1Ft9/Gx64E1q1hqH7E1q2Sh1JDSDseRB070W89mLirBmp40gqUZZSSZK0VuLbr8PD90DbmkIhbVGVOpIaSCgrK1xfmufkV55HzPPUkSSVIEupJElaY/H1ifDofdBxvcIqu5XNU0dSAwsd1yMcdgxMepF4322p40gqQZZSSZK0RuLE5+HJh6BzV9htH0KzitSRlEjYcQj0GUC89erCVG5JWg2WUkmStFpijMQXnoFnH4Ou3WHwnoTy8tSxlFAIgeyI70LLVuSXjSIu+zh1JEklxFIqSZJWWYwRxj8Bzz8D3XvBznsQyspSx1IRCK1akx11Ekx5j3jz1anjSCohllJJkrRKYozw1MMw8TnYdEvYYVdC5lsJ/UfYoi9h172IY24jTnwudRxJJcJXEkmStFIxz+GxMfD6RNhiW9huJ0IIqWOpCIWvfhM6rU9+5R+JixamjiOpBFhKJUnSl4q1y2Hsv
fD267BNf0KfARZSfaFQWVnYJuajOcSbrkwdR1IJsJRKkqQvFD9eCvffAe+/XRgd3bJv6kgqAWHjHoRhBxAfGU18eULqOJKKnKVUkiR9rrhoIYy+FWZMg52GEHptlTqSSkjYd0RhGu/f/kxcvCh1HElFzFIqSZI+I340B+65GebPg133JmzcM3UklZjQrKKwGu+cmcSbrkodR1IRs5RKkqRPiTOmwb23QO1yGLo/ofMGqSOpRIXuvQh77Eccew/xledTx5FUpCylkiRphTj5HbjvX1BRAcMPJLTvkDqSSlzY7+vQsTP5X/9EXLI4dRxJRchSKkmSAIivT4SH7oY2bWHYgYRWbVJHUiMQKioL03hnzyDe/NfUcSQVIUupJElNXIyROP4JePIhWG99GLofoUVV6lhqREKP3oTd9iE+eBdx0kup40gqMpZSSZKasLhsGTx8D7w8AXpuXljUqFlF6lhqhMIBR0CHTuR/PZ+4dEnqOJKKiKVUkqQmasWWL5PfgX47wvaDCJlvDVQ/QmVzsiNPghnTiP+6LnUcSUXEVx5JkpqgOHsG3H0TzJsLg/ckbLY1IYTUsdTIhU23IAwaRrzvNuK7b6aOI6lIWEolSWpi4vtvF7Z8IcDwAwjrb5Q6kpqQ8NUjoXUb8r/9iVhbmzqOpCJgKZUkqYmIMRKff/qTFXbbwZ5fJbStSR1LTUyoqiYbcRy89xbx/n+ljiOpCFhKJUlqAuLSJfDgnfDCOOi2aWGF3aqWqWOpqdp2IGy9PfG2vxNnTEudRlJillJJkhq5OHsm3HUTTJ0M2w+CgbsRypuljqUmLIRA9rVvQ8jIr72IGGPqSJISKl/ZARdeeCHjx4+nTZs2jBo16jP3v/zyy5x99tl07NgRgP79+3PQQQfVfVJJkrTa4puvwlMPQ2VzGLo/oUOn1JEkAEK7DoQDjyBedwnxqYcJAwanjiQpkZWW0sGDBzN8+HAuuOCCLzxms80248wzz6zTYJIkac3F2loY9xi89hKs2xl2HkpoUZU6lvQpYfCexCcfIt5wGXHzbQmtWqeOJCmBlU7f7d27N9XV1Q2RRZIk1YH40Wy4+5+FQtp7Gxiyr4VURSlkZWTf+C4sXkj8xxWp40hKpE6uKX3ttdc47bTT+O1vf8v7779fFw8pSZJWU4yROOkluPMfsGhBYf/RvgMJmUtIqHiF9TciDDuQ+MQDxInPpY4jKYGVTt9dmY033pgLL7yQ5s2bM378eM455xzOP//8zz12zJgxjBkzBoCRI0dSU+My9I1ReXm5z63qTH2cT4uc/dFklWVljXb2T75oIUseuofad9+ibIONab7bnmRVjfN7LRalcj5VlcBrcjzyRGZNeBKuv4T2511NqKhMHanB+f5JdanUzqe1LqVVVf+ZDrTtttty+eWXM2/ePFq3/uw1AUOGDGHIkCErbs+cOXNtv7yKUE1Njc+t6kx9nE/5ggV1+ngqHdXV1SxohM9/nPwOPPEgfPwx9NuJ2l5bsigHGuH3WkxK5XxaVCKvyfGwY8n/8DNmXHMJ2b4jUsdpcL5/Ul0qlvOpc+fOq3TcWs/nmTt37oplvN944w3yPKdVq1Zr+7CSJGkl4sdLiU8+BA/eBS2qYK+DCJttRQghdTRptYXe2xC225l4903E6VNSx5HUgFY6UnreeecxceJE5s+fz/HHH88hhxzC8uXLARg6dChPPvkko0ePpqysjIqKCk4++WRfDCVJqkcxRnjvTXjmUViyGDbbGvr0J5St9QQoKalwyLeILz1Lfu1fyE7+he8ppSZipa9eJ5988pfeP3z4cIYPH15ngSRJ0heLC+bB02Phg/egXQfYdS9C+46pY0l1IqzTjrDf4cTrLyGOe4yw3U6pI0lqAP5KVZKkEhDzHF55Hp5/BgLQd0fotaUr66rRCbvuSXz8/sLepVts63ZGUhPgK5kkSUUuTp0Md/0Dxj8B660P+44g9N7aQqpGKWRlZIefAPPmEG+7NnUcSQ3AkVJJkopUnD0TJjwBU96HqmrYZRhs0M3r7NTohY17EgYNIz5wJ3Hg7oSu3VJHklSP/BWrJElFJi6YR3x0DNx5I8ycDn0Hwv5fI3TtbiFVkxEO+AZUtyK/9qLC9HVJjZYjpZIkFYm4ZDG8NB4mvQghwOZ9YIttCRWVqaNJDS60rCYcfDTxij8QHx1NGOTCmlJjZSmVJCmxOG9uYRGjN1+FPIdum8LW2xNaVqeOJiUVBgwmPjaG+M+/EbcZQGi9TupIkuqB03clSUokTp9KfOhuuO3v8MYrsHFP+MphhIG7WUglIIRA9vXjYekS4k1XpY4jqZ44UipJUgOKtbUw+R145TmY8SFUVMKWfWHTLd36QvocYb0NCEP3J959E3GnIYSeW6SOJKmOWUolSapnMcbCgkVvTYJ3XoePl0J1a9huZ+jei9CsWeqIUlELex9KfHos+TUXkf3sPEK5/89IjYmlVJKkehIXzIe3J8Fbr8G8uVBWDhtsDN03hU7ru8+otIpCZSXZiG+T//lXxPv+Rdjzq6kjSapDllJJkupIzHOY+SF88B5MeQ9mzyjcsW7nwkq6XbsTKirShpRKVNh6O9hmAPGO64nb70xo3zF1JEl1xFIqSdIaijHCooUwbXKhiE59vzA1NwTo0Am26Q8b9yBUt04dVWoUssOOJf/ZieTXX0rZd36cOo6kOmIplSRpFcWlS2DWDJj1YeEa0VnTYfGiwp0tqgpTc7t0LUzNrWyeNqzUCIX2HQj7jiDedBXx+acJW2+fOpKkOmAplSTpv8S8FhYsgPkfwfy5hT/n/fvzef85sPU60Gl9qOkIHdeDtjWEENIFl5qIsPu+xMcfIL/uErJeW/kLIKkRsJRKklaIeV4Y+VuyGJYuLvz574+lS2DZMqhdDsuXQ+2ywp/Ll0OeQ4xA/OSB+M/nWQZZWeGjrAzK/ud2VlY4puy/bn/p5+Wf/vuyMggZfF4fjBFqaz/Ju5xlzcqJCxcWbn+8BJYs+a/vb/Entxd98r18orwZtGoDbTtA980KJbR9R0JFZf0+GZI+VygvJ/v6CeTnnEW88wbCgUemjiRpLVlKJamJiXleWAl23iejgAvmFT7mz4OF8wsF83+FDJo3h2YVhVJYXg5lzaCyReHzrKxwHeWK4wMrWmJeW3jMvLZQEGs/ub3sY1hS+8n9/77vv47La+v8e1/yv3/RrKLwfVW2gJatoH1HaF4FrdsUimirNtC8hSOgUpEJPTcnDNydOPpW4oBdCZ27po4kaS1YSiWpEYu1y2HOLJg9E+bMLFwPOXd2YbTz3yoqoVVraFcDXbsV9s9s3uI/H5XNoaKywYtZjLFQXmuX/6ek1tZ+9vPa5Z8e2fxfK0p0GVWtWrNo6ceF2xUVhDJfBqVSFQ46ivj80+TXXkx26m/85ZFUwnw1lqRGJC5fDjOmwbQP4MMPCovxxE9GPptVFIpnz80Lf7ZpC9Wti/Z6rBDCf6bn1pGy6mrCggV19niS0gmt2hAOPIJ49YXEpx4mDBicOpKkNWQplaQSFmMsFM8p78K0KTBzWmF0
MYTCVNTeWxf+bFdTKKCOJEhqRMJOQ4mPjiH+4wriVv0IVdWpI0laA5ZSSSoxMc/hwynw/lvw3tuweGGhhLatgV5bQacu0GE9QkVF6qiSVK9ClhUWPfrNKcTb/k4YcVzqSJLWgKVUkkpAzHOY+j68+ya8/zZ8vLRwrWSXrp/sjblh0U7DlaT6FDbsThi8J/HBu4gDdyds2D11JEmryVIqSUUszpsLb7wCb00qbNXSrAI22Ag26AadNyCUN0sdUZKSC/t/nfjsY+TXXkR25tmELEsdSdJqsJRKUpGJy5bBe28Wyuj0qYWpuV02hE02g85dCXW48I8kNQahqppw8DeJl/+B+Oh9hEHDUkeStBospZJUJOKCefDKC/DmK7BsWWGPzD4DoNumhKqWqeNJUlEL/QcTH7mP+M+/EvsMILRqkzqSpFVkKZWkxOKs6fDyc4XRUQJs2L2wbUvH9VwtV5JWUQiB7OvHk//y+8R//pVw1EmpI0laRZZSSUogxggfvAsTnyuspNusAjbbGnptRWjplgaStCZC566EIfsR772ZuNMehE02Sx1J0iqwlEpSA4oxFlbPff5pmDsbqqqh70DYpLdbuEhSHQj7HEp8emxh0aOf/MHr8KUSYCmVpAYQYyxs6fLcUzBrBrReB3YcAht1J2S+YZKkuhKatyA77Bjyi0YSH7yTMGTf1JEkrYSlVJLqWfxwSqGMTp8KLVvBwN1g455uWSBJ9aXPDrBFX+Jt1xL77UhYp33qRJK+hO+IJKmexDmziPffDqNvhfkfwfaDYL+vEbr3spBKUj0KIZCNOA6WLyfeeEXqOJJWwpFSSapjcemSwjWjr71cWMBo2x1g0y0I5c1SR5OkJiN0XI+w50HE268j7jyUsNnWqSNJ+gL+ql6S6kjMc+Kkl+DWawuFtOfmsP/XCZv3sZBKUgJhz69Ch07kf7+YuGxZ6jiSvoClVJLqQJz2Adx5Izw9FtrWwN6HELYfRKhsnjqaJDVZoVkF2Yhvw7QPiPfdmjqOpC/g9F1JWgtx8SIY9xi883phEaNBw6BrN0IIqaNJkoCwZV/YdiDxzhuI2w8i1KybOpKk/+FIqSStgRgj8c1X4V/XwXtvwlb9YN8RhA27W0glqchkh34LQkZ+/aWpo0j6HI6UStJqivPnwVMPwdTJ0KETDBhMWKdd6liSpC8Q2nUgfOUw4k1XEZ9/mrD19qkjSfovllJJWkUxz+HVF+C5pyELhS1eem7uyKgklYCw+77Ex+4nv+4Ssl5bEyorU0eS9Amn70rSKogfzYF7boZnH4dOXeArhxE23cJCKkklIpSXk339BJg1nXjnDanjSPovjpRK0peIMcKkl2D841DeDHbeAzbcxDIqSSUobLoFYYfdiKNvIfYfTOilxucTAAAgAElEQVTSNXUkSThSKklfKC5aAPffAc88Uhgd3edQwkY9LKSSVMLCwUdD8yryay4sXJYhKTlLqSR9jvjO63D7DTB9KvTfBXbdm1DVMnUsSdJaCq1aEw46Ct6YSHz8/tRxJOH0XUn6lPjxUnhqbGHf0Zp1YcfdCa3XSR1LklSHwsDdiY/dX1iNd+vtCa3apI4kNWmOlErSJ+LMD+HOf8C7b8DW28OwAyykktQIhSwjO/xEWLKI+I8rU8eRmjxLqaQmL8ZInPg83HsLxLxQRrfqR8j8ESlJjVXo0pUw9ADiEw8QJ72YOo7UpPmOS1KTFpcugYfuhmcfg84bwt6HEDp0Sh1LktQAwt6HQs265NdcRFy2LHUcqcmylEpqsuL0qXDHjTDlPei3IwweTqhsnjqWJKmBhMpKsq8fD9MmE++9OXUcqcmylEpqcmKMxJcnwOhbIctg+IGEzbZ2qxdJaoLCFn0JfXck3nkjcfqU1HGkJmmlq+9eeOGFjB8/njZt2jBq1KjP3B9j5Morr2TChAlUVlZy4okn0q1bt3oJK0lrKy5eBA/fA++/DV27ww6DCRWVqWNJkhIKhx1DnDiB/JqLyH7wS39JKTWwlY6UDh48mB/96EdfeP+ECROYNm0a559/PscddxyXXXZZnQaUpLoSp75P/ttTYPI70HcgDBpqIZUkEdZpTzjgG/DK88QnHkwdR2pyVlpKe/fuTXV19RfeP27cOAYNGkQIgZ49e7Jw4ULmzJlTpyElaW3FZx8n/82psHABDNmX0HsbfxMuSVoh7DIcuvci3ng5cd7c1HGkJmWtrymdPXs2NTU1K263b9+e2bNnr+3DSlKdiLW15DddSX7xSOjSlewnfyB06pI6liSpyIQsI/vGd2HJYuINl6eOIzUpK72mdGVijJ/5uy8afRgzZgxjxowBYOTIkZ8qs2o8ysvLfW5VZ9bmfMrnzeWjc/8fH7/4LC2GH0Cro79PaFbBondereOUKhVlWdmXzv6RVkepnE9VviavupoaFhz0DRbecAWthu5LZd8dGuxL+/5JdanUzqe1LqXt27dn5syZK27PmjWLtm3bfu6xQ4YMYciQIStu//e/U+NRU1Pjc6s6s6bnU5z8Dvmffw0fzSEc9X0+3nF3Zn00D4B8wYK6jqkSUV1dzQKff9WRUjmfFvmavFriLnvD2PuYe9HvyH7xJ0LzFg3ydX3/pLpULOdT586dV+m4tZ6+269fP8aOHUuMkddee42qqqovLKWS1BDihCfJR54Oy5eTnf5/ZDvunjqSJKlEhGbNyI74DsyaTrzt76njSE3CSkdKzzvvPCZOnMj8+fM5/vjjOeSQQ1i+fDkAQ4cOpU+fPowfP56TTjqJiooKTjzxxHoPLUmfJ8ZY2Gfutmthox5k3/kRYZ32qWNJkkpM6NGbsMtw4v23E7cfRNi4R+pIUqO20lJ68sknf+n9IQSOOeaYOgskSWsiLl1KvOqPxHGPEgYMJhzxHbd7kSStsXDgkcTnnyb/25/JfjyKUL7WV71J+gJrPX1XklKLs2eQn30G8dnHCAcdRTj6BxZSSdJaCVUtyUZ8Gya/TbzvttRxpEbNUiqppMW3Xyf/7akwYxrZ935KNuxA9x+VJNWJsO0O0GcA8fbriNM+SB1HarQspZJKVnz2MfJzzoLyZmRnnE3Ysl/qSJKkRib72vHQrBn5X/9EzPPUcaRGyVIqqeTEGMnvvJH84t9B125kPzqX0KVr6liSpEYorNOOcOgx8MZE4oN3po4jNUqWUkklJS5bRrzyj8RbryFsP4jslF8TWq+TOpYkqRELO+wGW/Yj3vw34vSpqeNIjY6lVFLJiPPnkf/hp8QnHiB8ZQThmFMIzSpSx5IkNXIhBLLDT4SyMvK//dlpvFIds5RKKgnxwynkI0+Dt18nHHMK2b4jXNBIktRgQrsawsFHw6QXiWPvSR1HalQspZKKXnzjlUIhXbSA7JRfk/XfJXUkSVITFHbaA3r3Id70V+Ks6anjSI2GpVRSUVvyxIPko34CVdVkZ55D2GSz1JEkSU1UCIHsG98FKEzjjTFxIqlxsJRKKkoxRvL7buOjc35SWGH3zLMJ63ZOHUuS1MSF9h0IBx0FE58jPnpf6jhSo2AplVR0Yl5LvP5S4o2XU9l
/l8IKu63apI4lSRIAYdAw2HRL4j+uIM6ekTqOVPIspZKKSly6lPyikcQH7iAM2Y82p/6KUFGZOpYkSSuELCM78nuQ5+R//ZOr8UpryVIqqWjEeXPJR/0Ynn+acNixZId+i1BWljqWJEmfETp0KqzGO/E54sN3p44jlTRLqaSiEKdNJh95Okx+h+z4M8l2/0rqSJIkfakwaBhs0Zd405XEaZNTx5FKlqVUUnLx9YnkI8+AxYsK149uu0PqSJIkrVQIoTCNt1kl+RXnEWtrU0eSSpKlVFJScdyj5L//KbRsRXbWOYTuvVJHkiRplYV12pEdfgK8/Rrx7n+kjiOVJEuppCRijOT33kL+l7Nhw+6FLV86rpc6liRJqy3024mw/S7EO24gvvtG6jhSybGUSmpwMa8lXvcX4k1XQt+BZD/8FaFV69SxJElaY+Fr34ZW65Bf/gfix0tTx5FKiqVUUoNaseXLg3cRhu5PdtzpbvkiSSp5oWU12TdPgqnvE2+5OnUcqaRYSiU1mDj/o/9s+TLiOLKDjyZk/hiSJDUOoXcfwq57E8f8i/jK86njSCXDd4OSGkT8cAr5/532ny1fdtsndSRJkupc+OpRsG4X8iv/SFw4P3UcqSRYSiXVu/jmq+QjT4PFC93yRZLUqIXKSrJjT4F5c8n/+idijKkjSUXPUiqpXsXxT5CP+gm0aEl2plu+SJIav7DhJoQDvwETniQ+fHfqOFLRs5RKqjf5/XeQXzwS1t+osOXLup1TR5IkqUGEIfvCFtsSb7icOPmd1HGkomYplVTnYp6T/+MK4vWXwNbbk53yG0LrdVLHkiSpwYQsI/vm96GqJfml5xKXuk2M9EUspZLqVFz2MfHSc4mjbyXsuhfZCWcSKt3yRZLU9ITWbcm+9QOY8h7xH5enjiMVLUuppDoTF84n//3PiOMeJRx0FGHEtwlZWepYkiQlE3r3IQw7kPjwPcTxj6eOIxUlS6mkOhFnfkg+8gx45zXCsaeSDTuQEELqWJIkJRf2/zps1KOwGu+sGanjSEXHUipprcV33yjsQTpvDtkPfkm2/aDUkSRJKhqhvBnZsadCnpNfNopYW5s6klRULKWS1kp8cRz5OT+C8mZkZ/yO0HOL1JEkSSo6oeN6hMNPhDcmEm+9JnUcqaiUpw4gqXTlY+8lXntRYcuX7/2MsE671JEkSSpaWf9dyF9/mXjPP4ndNiX0GZA6klQUHCmVtNpijOS3XEO8+gLovQ3Zab+1kEqStArCocfChpuQX3kecfqU1HGkomAplbRa4vJlxCvOI951I2GnPci+8xNC86rUsSRJKgmhWTOyE86ErIz8opHuXyphKZW0GuKiheTn/5L45IOE/b5G+MZ3CeVeBSBJ0uoI7TuSHfND+OBd4rUXEmNMHUlKylIqaZXEWdPJzz4TXnuJ8M3vk+1zmFu+SJK0hsIWfQn7HEZ84kHiI/emjiMl5RCHpJWKb75KfsFvYPlyspN+RujdJ3UkSZJKXtjnUOLbk4jXXcKyLbeFth1TR5KScKRU0pfKnx5Lfu6PoXkLsrPOtpBKklRHQpaRfeuH0Lotc8/5MXHBvNSRpCQspZI+V4yR/F/XES89FzbqQXbWuYT1NkgdS5KkRiVUtyY7/kzyObPJLzmHWFubOpLU4Cylkj4jLvuYeNko4u3XEXbYleyHvyK0ap06liRJjVLYuAetTzgdXnmeeOPlqeNIDc5rSiV9Spw3h/zC/4M3XyUccARhz4Nc0EiSpHrWYre9mT/pZeLoW8k7dyXbZXjqSFKDsZRKWiF+8C75n34F8+eSHX8Goe+OqSNJktRkhK8eSZzyPvG6vxA7dSFsumXqSFKDcPquJADii8+Sjzwdli8jO/X/LKSSJDWwkJWRHXsqdOxMftFI4oxpqSNJDcJSKon8/jsKI6QdOpH96FzCxj1SR5IkqUkKVS3JvvtjAPI//Yq4eFHiRFL9s5RKTVisrSX/+8XE6y+BrfqRnT6S0K5D6liSJDVpoWNnsuPPgOlTyC89l5i7Iq8aN0up1ETFRQvJ//RL4oN3EYYeQHbiWYTmLVLHkiRJQOi1FeGw4+DFccR//i11HKleudCR1ATFaR+QX/AbmDGV8I3vku08NHUkSZL0P7LBe5JPeZc4+hby9h3IdtsndSSpXqxSKX3uuee48soryfOc3Xffnf333/9T9z/00ENcffXVtGvXDoDhw4ez++67131aSWstvjiO/NJzoayc7Ae/dGU/SZKKWDj0WOLsmcTrLyW2aUfoOzB1JKnOrbSU5nnO5Zdfzk9+8hPat2/PWWedRb9+/Vh//fU/ddzAgQP51re+VW9BJa2dGCPx7puIt14DG2xMduKPCO07po4lSZK+RCgrIzv2NPI//JT8slFkrdoQem6eOpZUp1Z6Tekbb7xBp06dWHfddSkvL2fgwIE888wzDZFNUh2JS5cQ/3I28ZarCdvtTHb67yykkiSViFBZSfa9n0LNuuQX/Jr4wXupI0l1aqWldPbs2bRv337F7fbt2zN79uzPHPfUU09x6qmnMmrUKGbOnFm3KSWtsThjGvnI04njnyAcdBThmFMIlZWpY0mSpNUQWrYiO/kXUFFJ/sdfEGfPSB1JqjMrnb4bY/zM34UQPnW7b9++7LjjjjRr1ozRo0dzwQUX8POf//wz/27MmDGMGTMGgJEjR1JTU7OmuVXEysvLfW6LxNIJT/LR739BiJE2Pz2Xyj4DUkdabfVxPi2qrq7Tx1PpKMvKqPb5Vx0plfOpytfkkrBKr3c1NSz7+XnM+fEJhD//mna/vYisunXDBFRJKbX34ystpe3bt2fWrFkrbs+aNYu2bdt+6phWrVqt+HzIkCFce+21n/tYQ4YMYciQIStuO6LaONXU1PjcJhbznHjHDcQ7rocuG5KdcCbzO3Zmfgk+L/VxPuULFtTp46l0VFdXs8DnX3WkVM6nRSX4s78pWuXXu+p1CCecRe0ff8GMX/6Q7OT/R6hwBpQ+rVjej3fu3HmVjlvp9N3u3bszdepUpk+fzvLly3n88cfp16/fp46ZM2fOis/HjRv3mUWQJDWcuGBeYf/R268jDBhMduY5hI6r9gNBkiQVv9BrK8LRP4Q3XiG/+HfEZctSR5LWykpHSsvKyjj66KP5zW9+Q57n7LrrrmywwQbccMMNdO/enX79+nH33Xczbtw4ysoK01hOPPHEhsgu6X/Ed14nv/h38NFswuEnEgYN+8x0e0mSVPqy7XYiX7yQePUF5JecTfbtMwjlq7Tbo1R0Qvy8i0YbyJQpU1J9adWjYpku0JTEGImP3Eu87hJo3Zbs+DMJG/dIHatO1Mv03bH31OnjqXSUynRLlYZSOZ+yQcNTR9AqWNPXu/zBO4l//wuh746EY08llJXVQzqVmmJ5P76q03f9dYpU4uKSRcRrLyY++RBs3ofsmFMILnogSVKTkO26N/ny5cQbL4eycvjWyYTMYqrSYimVSlh87y3yv5wNM6YR9v0aYe+DfSGSJKmJyfbYj3z5MuLNf4Pycjjye4RspUvHSEXDUiqVoBgj8c
E7if+4Aqpbk53ya8KmW6SOJUmSEsn2PIh82TLi7ddBeTM4/ATXlVDJsJRKJSYuXED+1/NhwpOwZT+yb55MaOV0XUmSmrrwlcNg+TLi3TdBlsGI4xwxVUmwlEolJL75Kvml58LcWYSDjyYM2dcXG0mSBFAYGT3gCMhriffeAksWw1EnufiRip6lVCoBcfly4p03Eu+8Edp3IDvjd4SNe6aOJUmSikwIAb56FLRoSbz1GuKSxWTHnUpoVpE6mvSFLKVSkYvTPiC//PfwzuuEAbsSRhxHqGqZOpYkSSpSIQTC3oeQt6giXncJ+fm/JPvOjwnNW6SOJn0uS6lUpGKMxIfvKSxmVN6M7NunE/rtlDqWJEkqEdlu+5A3ryL+9Xzy3/+U7Ps/J7RslTqW9BmWUqkIxXlzyK/6E7w4DnpvQ3bU9wlt26eOJUmSSkw2cDdiiyryS84mP+dHZD/4JaFN29SxpE9xhRSpyMRnHyP/xUnw6guEw44l+/4vLKSSJGmNhT4DyL73M5j5IfnvziB+OCV1JOlTLKVSkYjz5lB70Ujyi38H7TqQ/fj3ZLt/xdV1JUnSWgu9tyH7wS9h8SLy/zuNOOml1JGkFXy3KyUWYyR/6mHyn38XXniacOA3yM46h9Cla+pokiSpEQnde5GddQ60akP+h5+RP3Z/6kgS4DWlUlJx7izyay6C55+GbpuSHXUSYb0NUseSJEmNVOi4HtlZZ5Nf/DviVX8k/3AyYf8jnJmlpCylUgIxz4mP319YWXfZMsLB3yQM2ZeQubm1JEmqX6GqmuyknxOv+wvx7n8SP5xKdvQPCJWVqaOpibKUSg0sfvBuYXT0jYmwSW+yI79H6NQldSxJktSEhPJyOPxEWLcL8aYryc+ZTvbdHxPWcXFFNTxLqdRA4tIlxNuvJ465DVpUEY78HmHg7k6XkSRJSYQQCEP3J67bmfzSc8l/9QOyY08l9NoqdTQ1Mb4blhpAfO4p8p99h3jvzYQBu5L98iKynfawkEqSpOTC1tuTnXUuVFWT//5n5HfeSMzz1LHUhDhSKtWjOH0q+Y2XFxYy6tyV7PSRhB69U8eSJEn6lNClK9mPRxGvvoB46zXEN18lO/pkQnXr1NHUBFhKpXoQFy8i3nkj8f5/QVk54cAjCXvsV7h+Q5IkqQiF5i3gmFOgR2/iDZcVpvN++3RCt01TR1Mj5ztkqQ7FvJb42P3EW66G+R8Vrhk94HAXDZAkSSUhhEAYvBdxox7kF/+O/OyzCAcdRdhtHy87Ur2xlEp1JE56ifyGS+H9t2GTzchO+hlhox6pY0mSJK22sFEPsp+eR37lecQbLiO+8AzZkScR2ndIHU2NkKVUWktx8jvkt15TuG60XQfCcacR+u1ECCF1NEmSpDUWWlaTfefHxEfuJd54Bfn/+x7h0GMJA3fzfY7qlKVUWkNxxjTiv/5OfOphaF5F2P/wwnWjFW48LUmSGocQAmHQcOJm25Bf9UfiVX8kTniC7IjvENq0TR1PjYSlVFpNce7swiJGj4yGLCMMO5Aw/EBCy1apo0mSJNWL0KET2Sm/Id5/O/Hmv5H/4rtkXz+B0G+n1NHUCFhKpVUU580l3ncb8YE7YPkyws5DCfsc6iJGkiSpSQhZRthjP+IW25JfcR75X86GJx8iO/QYQodOqeOphFlKpZWIc2YRR99CHHsPLFtG2G5nwn5fI3TsnDqaJElSgwvrbUB25tnEMbcRb7+e/OffJex1cGH2WLNmqeOpBFlKpS8QZ35IvPdm4qP3QZ4T+g8m7HUQodP6qaNJkiQlFcrKCMMOJG43iHjj5cTbriU+8QDZiG8Tttg2dTyVGEup9D/iB+8S77uV+ORDQCDsOKRwzajTUiRJkj4ltKshHH8G8eUJ5NddQv7HX8C2O5Ad8i1C+46p46lEWEolIOY5vDyBfMxtMPE5qKggDN6LMPQAQrua1PEkSZKKWti8D9nPzy9c8nTXjeQvjCPsuhdhz4MJrVqnjqciZylVkxY/Xkp88kHimNth6vvQpl1ha5ddhhOq/QEqSZK0qkKzZoS9DyHusCvxX9cRx9xOfGQ0YdgBhCH7EZq3SB1RRcpSqiYpTp9KfGQ08dHRsGA+dO1G+NYPCP12IpR7gb4kSdKaCu06EI46iTjsAPJbryHe9nfiA3cS9j6UMGiYiyHpMyylajLi8mXw3FPkY++FV56HLIOtticbsi/03JwQQuqIkiRJjUZYbwPKTjiL+NYk8pv/Rrz+EuJ9txKG7k/YcQ9CZWXqiCoSllI1enH6VOKjo4mPjoH5H0G7DoUtXXbcg9DWPUYlSZLqU+i2Kdkpv4ZXniP/13XE6y4h3n4dYdd9CLvt7SVTspSqcYoL5hHHPUZ88kF489VPRkW3Ixs0DDbvQ8jKUkeUJElqMkII0LsPZb37EF+fSH7PP4m3X0e892bCzkMJe+znar1NmKVUjUZc9jG8MI78yQfhxWehdjl07ko48EjCgMGOikqSJBWB0KM3ZT16F7bhu/dm4kN3ER+8E/oMIBs0HHptRciy1DHVgCylKmnx46UwcQJx/BPE556GxQuhTdvCVJABu8IGG3utqCRJUhEKXTYkHP0D4v6HE++/nfjY/eTPPg4d1yssiDRwd0KrNqljqgFYSlVy4pJFxBefhWcfJ770LCxdAlXVhG36E/rvAptt5fRcSZKkEhHadSAcfHShnD77GPHhe4k3XUW89RpCnx0IOw+FTbfw/V0jZilVSYgzphFferZQRl95HpYvg1ZtCP0HE/ruAD23JJR7OkuSJJWq0KyiMNNtwK7ED94jPnIv8YkHiM88UthLfrudCNvtDBv3dCZcI+O7eBWluOxjeO3l/xTRDz8o3NFxPcIu/7+9e42Nqtz3OP5dndKW0vv0Rsul0F0wgCIwJQpbAkLk7POCsEnQHW8viDtRxIBEULwbQ2iMEhPAlBhscMcXhJx4PMeccziB6i6hGzdYy0XlUlqg0JEyHaBlSrHtPPvF6oVKywxaZjH190kms2bW6uQ/8J/Lb55nrfVvWNNnwR/u0S9mIiIiIkOQlT8G6y9/xfz5aczhA5h/Vtj7nu7+L8jKxSp+CGvmHPv4IQqoUU+hVO4KpqMDTp/EHD+COX4ETv0IP/8MscPgnnux5v071pQZWDl5TpcqIiIiIhFixcdjFf8Riv+Iab2K+W6/HVD/9z8w/7PTDqj3FWPdV2yfdz52mNMly6+gUCqOMO0/w5kazMkfMccPQ82P9r6hAPljsR5aiDV5mj0tVydWFhEREfndsxKTsGYvgNkLMM2XMFX77VHUil2YPf8NCcNh0jQ7pE6ehpWW4XTJEiaFUrnjjDHgv4g5dQxqj2Nqj8PZWvuULQAjR2PNehhr4n32L1w6ypqIiIiI3IKVko41908w90+Y69fh2GE7oB4+gKmqxADk5mNNuNeedTdhClZqutNlywAUSmVQGWPovNCAOXQQc6YWc/YU1NfClUv2BnFxUFBknyC5cCKMn4iVojcIEREREfl1rPh4mFqMNbXYHgypr8McO4Q5dgTzz79Dxf/ZIXXka
Kyiyfb3z3FFkDtK50O9SyiUyq9mAi1w/izGWw8NZzHnz0B9Lb7WgL1BTIz94p80DQr+gFV4D+QX6Ci5IiIiInJHWJYFY8ZjjRkPj/wZ09kJZ2sxJ47YIfVARW9ITRhuD5aMK8IaNxFGjwN3tg6c5AClA7kl09EOvgvQ6MU0NsAFL+anc+Ct7x39BIgfDnmjsTwPkTTpPgIZOZA/BitO+4OKiIiIiDMslwvG2cGThUswwSBcOI+pOwF1JzF1JzD//592eAU7qOaPxcovgFFd1/ljsEYkO/k0hjyF0t85E+yEy35ouohpaoSui2m6CBe94GsEE+z9g+GJkJOPNXm6fQjuvNGQNwbSM3umPyRmZtLq8zn0jERERERE+md1z+QbORpmzQe6DsB5thZz/jScO405fwZzcG/viCrAiGT71IQ5+ZAzErLz7LNCuLNhRLJGV38jhdIhyBgDbdfgajNcbYFAM+bKZbjcBFf8mMt+O4he8dujnd2/DHVLToWMLKyCIpg5x37RZY+EnDxIStGLTkRERESGDGtYHBTeY+9q1sUYA5ea4PxpjPccXGjANDbYpy7c/5W9TffG8QmQkQXuLKyMLHs53Y2VmgGpaZCaYQdX7b86oLBCaXV1NWVlZQSDQebPn8/ixYv7rG9vb2fz5s3U1taSnJzMqlWryM7OviMF/x4YY6D9Z/sUKd2Xtms9y+ZawA6bV5sh0ILpDp9dt7na0ntk219KTIK0DEjLwModBenurhdQtv1LT0aWTsEiIiIiIr9rlmVBRiZkZGLd6+mzzly/bs8obGzA+C/2zjj0+zCna+zv5NwQWgFcLkhOg5Q0SEq2pwMnJcOIlK7rZKzhifb04fgEe9e47uWEBKwYV+SevANChtJgMMi2bdt4/fXXcbvdrFu3Do/Hw6hRo3q2KS8vZ8SIEWzatIl9+/bx2Wef8eKLL97Rwu+04M4yCHaCZdkXrN7lX15uXNfZYY883njd0btsblrX3hU2r8P1ruDZ1tZ3yuxAYmLsqQRJXc2ck4fVvTwixR7V7GpyUtMhNV37eIqIiIiI/AZWfDyMKoBRBfQ3f9Bcv947I7H5EubKJXv5yiVM82V7UMnXaIfXawEwdnw1/TxWj7i4vkE1PgEShhPzxHNYWbl34FlGVshQWlNTQ25uLjk5OQDMmjWLAwcO9AmlBw8eZOnSpQA88MADfPLJJxhjonqap/lHuT1aaegKiKZ3+cZrTE8jAXYwdbnAFXvDdWw/97kgNhZih0GaG6u7uXp+GUmAuK5fRrrv614/fLgdRIePiOp/YxERERGRocaKj4fskfYF+g2u3UywEwIBCDT3zoxsu4Zpu2YPWLXdOGvymr2ueyZla6BrgCz6hQylfr8ft9vdc9vtdnPy5MkBt3G5XCQmJtLS0kJKSsoglxs5ro1/u63tjTFggkN+aF1ERERERAaHFeOC5BT7cuP9DtXjlJCh1JibB5J/OToXzjYAu3fvZvfu3QCUlJSQl5cXdqESXfR/K4Np0PvpL8sG9/EkqqQ5XYAMKeonGUz6/iSDKZr6KeQhoNxuN01NTT23m5qaSE9PH3Cbzs5OWltbSUpKuumxFixYQElJCSUlJb+1brmLvfLKK06XIEOI+kkGk/pJBpP6SQaT+kkGU7T1U8hQWlhYiNfrpbGxkbgmsiQAAAYASURBVI6ODiorK/F4+h6BasaMGXz99dcA7N+/n8mTJ2tfRxEREREREQkp5PRdl8vFsmXLWL9+PcFgkHnz5jF69Gh27NhBYWEhHo+Hhx9+mM2bN/PCCy+QlJTEqlWrIlG7iIiIiIiIRLmwzlM6ffp0pk+f3ue+xx57rGc5Li6O1atXD25lErUWLFjgdAkyhKifZDCpn2QwqZ9kMKmfZDBFWz9Zpr+jFImIiIiIiIhEQMh9SkVERERERETulLCm74r8UnV1NWVlZQSDQebPn8/ixYv7rP/yyy/Zs2cPLpeLlJQUnnvuObKyshyqVqJBqJ7qtn//fjZu3MiGDRsoLCyMcJUSLcLpp8rKSnbu3IllWYwdO5aVK1c6UKlEg1D95PP52LJlC4FAgGAwyOOPP37Tbk8iAB999BFVVVWkpqbywQcf3LTeGENZWRnfffcd8fHxLF++nPHjxztQqUSDUP20d+9evvjiCwASEhJ45plnKCgoiHCVYTIit6mzs9OsWLHC/PTTT6a9vd289NJLpr6+vs82R44cMW1tbcYYY3bt2mU2btzoRKkSJcLpKWOMaW1tNW+++aZ59dVXTU1NjQOVSjQIp58aGhrMmjVrTEtLizHGmMuXLztRqkSBcPqptLTU7Nq1yxhjTH19vVm+fLkTpUoU+P77782pU6fM6tWr+13/7bffmvXr15tgMGiOHz9u1q1bF+EKJZqE6qdjx471fM5VVVXd1f2k6bty22pqasjNzSUnJ4fY2FhmzZrFgQMH+mwzZcoU4uPjASgqKsLv9ztRqkSJcHoKYMeOHSxatIhhw4Y5UKVEi3D6ac+ePSxcuLDnnNqpqalOlCpRIJx+siyL1tZWAFpbW286n7tIt0mTJvW87/Tn4MGDzJkzB8uymDBhAoFAgEuXLkWwQokmofpp4sSJPeuLiopoamqKVGm3TaFUbpvf78ftdvfcdrvdtwyd5eXl3H///ZEoTaJUOD1VV1eHz+djxowZkS5Pokw4/dTQ0IDX6+WNN97gtddeo7q6OtJlSpQIp5+WLl3K3r17efbZZ9mwYQPLli2LdJkyRPj9fjIzM3tuh/qOJRKu8vJypk2b5nQZA1Ioldtm+jlgs2VZ/W5bUVFBbW0tixYtutNlSRQL1VPBYJDt27fz9NNPR7IsiVLhvEcFg0G8Xi9vvfUWK1eupLS0lEAgEKkSJYqE00/79u1j7ty5lJaWsm7dOjZt2kQwGIxUiTKE3M53LJFwHT16lK+++oonnnjC6VIGpFAqt83tdvcZ/m9qaup3qtLhw4f5/PPPWbt2raZbyi2F6qm2tjbq6+t55513eP755zl58iTvvfcep06dcqJcucuF8x6VkZFBcXExsbGxZGdnk5eXh9frjXSpEgXC6afy8nIefPBBACZMmEB7ezstLS0RrVOGBrfbjc/n67k90HcskXCdOXOGrVu3smbNGpKTk50uZ0AKpXLbCgsL8Xq9NDY20tHRQWVlJR6Pp882dXV1fPzxx6xdu1b7aklIoXoqMTGRbdu2sWXLFrZs2UJRURFr167V0XelX+G8R82cOZOjR48C0NzcjNfrJScnx4ly5S4XTj9lZmb29NO5c+dob28nJSXFiXIlynk8HioqKjDGcOLECRITExVK5Vfz+Xy8//77rFixgry8PKfLuSXL9DdPQCSEqqoqtm/fTjAYZN68eSxZsoQdO3ZQWFiIx+Ph3Xff5ezZs6SlpQH2B/bLL7/scNVyNwvVUzd6++23eeqppxRKZUCh+skYw6effkp1dTUxMTEsWbKE2bNnO1223KVC9dO5c+fYunUrbW1tADz55JNMnTrV4arlbvThhx/yww8/0NLSQmpqKo8++igdHR0APPLIIxhj2LZt
G4cOHSIuLo7ly5frs04GFKqfSktL+eabb3r2U3a5XJSUlDhZ8oAUSkVERERERMQxmr4rIiIiIiIijlEoFREREREREccolIqIiIiIiIhjFEpFRERERETEMQqlIiIiIiIi4hiFUhEREREREXGMQqmIiIiIiIg4RqFUREREREREHPMvEmDS6+DcJdoAAAAASUVORK5CYII=\n", 305 | "text/plain": [ 306 | "
" 307 | ] 308 | }, 309 | "metadata": {}, 310 | "output_type": "display_data" 311 | } 312 | ], 313 | "source": [ 314 | "# Just for fun\n", 315 | "sns.distplot(cvd);" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "# 7.4\n", 323 | "\n", 324 | "In this chapter we have focused on one reason why k-fold CV fails in financial applications, namely the fact that some information from the testing set leaks into the training set. Can you think of a second reason for CV's failure?" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "**A: Overfitting due to multiple train/test bias (explored further in chapters 11-13)**" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "# 7.5\n", 339 | "\n", 340 | "Suppose you try one thousand configurations of the same investment strategy, and perform CV on each of them. Some results are guaranteed to look good, just by sheer luck. If you only publish those positive, and hide the rest, your audience will not be able to deduce that these results are false positives, a statistical fluke. This phenomenon is called \"selection bias.\"\n", 341 | "\n", 342 | "- a) Can you imagine one procedure to prevent this?\n", 343 | "\n", 344 | "- **A: Publish the amount of trials it took to get the results**\n", 345 | "\n", 346 | "- b) What if we split the dataset in three sets: training, validation, and testing? The validation set is used to evaluate the trained parameters, and the testing is run only on the one configuration chosen in the validation phase. In what case does this procedure still fail?\n", 347 | "\n", 348 | "- **A: If repeatedly test against the 3rd set, we can still bias our results.**\n", 349 | "\n", 350 | "- c) What is the key to avoiding selection bias?\n", 351 | "\n", 352 | "- **A: Taking the number of trials into account and not using backtests as a research tool.**" 353 | ] 354 | } 355 | ], 356 | "metadata": { 357 | "kernelspec": { 358 | "display_name": "Python 3", 359 | "language": "python", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.5.6" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 2 377 | } 378 | -------------------------------------------------------------------------------- /Chapter 08 - Feature Importance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "import matplotlib.pyplot as mpl\n", 13 | "\n", 14 | "%matplotlib inline\n", 15 | "mpl.rcParams['figure.figsize'] = (16, 6)\n", 16 | "\n", 17 | "from sklearn.tree import DecisionTreeClassifier\n", 18 | "from sklearn.ensemble import BaggingClassifier\n", 19 | "\n", 20 | "from sklearn.metrics import accuracy_score\n", 21 | "\n", 22 | "from mlfinlab.feature_importance import (\n", 23 | " feature_importance_mean_decrease_impurity,\n", 24 | " feature_importance_mean_decrease_accuracy,\n", 25 | " feature_importance_sfi,\n", 26 | " plot_feature_importance,\n", 27 | " get_orthogonal_features,\n", 28 | ")\n", 
29 | "\n", 30 | "from mlfinlab.cross_validation import PurgedKFold, ml_cross_val_score\n", 31 | "from mlfinlab.util.multiprocess import process_jobs" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "A few interesting notes from this chapter:\n", 39 | "\n", 40 | "**Marcos' first law of backtesting:**\n", 41 | "\n", 42 | "**Backtesting is not a research tool. Feature importance is.**\n", 43 | "\n", 44 | "\n", 45 | "Once we have found what features are important, we can learn more by conducting a number of experiments.\n", 46 | "\n", 47 | "- Are these features important all the time, or only in some specific environments?\n", 48 | "- What triggers a change in importance over time?\n", 49 | "- Can these regime switches be predicted?\n", 50 | "- Are those important features also relevant to other related financial instruments?\n", 51 | "- Ahe they relevant to other asset classes?\n", 52 | "- What are the most relevant features across all financial instruments?\n", 53 | "- What is the subset of features with the highest rank correlation across the entire investment universe?\n", 54 | "\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "scrolled": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "from sklearn.datasets import make_classification\n", 66 | "\n", 67 | "\n", 68 | "def get_test_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000):\n", 69 | " # generate a random dataset for a classification problem \n", 70 | " trnsX, cont = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative, n_redundant=n_redundant, random_state=0, shuffle=False)\n", 71 | " df0 = pd.DatetimeIndex(periods=n_samples, freq=pd.tseries.offsets.Minute(), end=pd.datetime.today())\n", 72 | " trnsX = pd.DataFrame(trnsX, index=df0)\n", 73 | " cont = pd.Series(cont, index=df0).to_frame('bin')\n", 74 | " df0 = ['I_%s' % i for i in range(n_informative)] + ['R_%s' % i for i in range(n_redundant)]\n", 75 | " df0 += ['N_%s' % i for i in range(n_features - len(df0))]\n", 76 | " trnsX.columns = df0\n", 77 | " cont['w'] = 1.0 / cont.shape[0]\n", 78 | " cont['t1'] = pd.Series(cont.index, index=cont.index)\n", 79 | " return trnsX, cont" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "def feature_importances(X, cont, method, allow_masking_effects=False, n_splits=10):\n", 89 | " max_features = None if allow_masking_effects else 1\n", 90 | " clf = DecisionTreeClassifier(\n", 91 | " criterion='entropy', max_features=max_features, class_weight='balanced', min_weight_fraction_leaf=0.0\n", 92 | " )\n", 93 | " clf = BaggingClassifier(\n", 94 | " base_estimator=clf, n_estimators=1000, max_features=1.0, max_samples=1.0, oob_score=True, n_jobs=-1\n", 95 | " )\n", 96 | " fit = clf.fit(X, cont['bin'])\n", 97 | " oob_score = fit.oob_score_\n", 98 | "\n", 99 | " cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=cont['t1'])\n", 100 | " oos_score = ml_cross_val_score(clf, X, cont['bin'], cv_gen=cv_gen, scoring=accuracy_score).mean()\n", 101 | "\n", 102 | " if method == 'MDI':\n", 103 | " imp = feature_importance_mean_decrease_impurity(fit, X.columns)\n", 104 | " elif method == 'MDA':\n", 105 | " imp = feature_importance_mean_decrease_accuracy(clf, X, cont['bin'], cv_gen, scoring=accuracy_score)\n", 106 | " elif method == 'SFI':\n", 107 | " imp = feature_importance_sfi(clf, X, cont['bin'], cv_gen, 
scoring=accuracy_score)\n", 108 | " \n", 109 | " return imp, oob_score, oos_score\n", 110 | "\n", 111 | "\n", 112 | "def test_data_func(X, cont, run='', allow_masking_effects=False, methods=['MDI', 'MDA', 'SFI']):\n", 113 | " for method in methods:\n", 114 | " feature_imp, oob_score, oos_score = feature_importances(X, cont, method, allow_masking_effects)\n", 115 | "\n", 116 | " plot_feature_importance(\n", 117 | " feature_imp, oob_score=oob_score, oos_score=oos_score,\n", 118 | " savefig=True, output_path='img/{}_feat_imp{}.png'.format(method, run)\n", 119 | " )\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# 8.1a\n", 127 | "\n", 128 | "Using the code presented in Section 8.6\n", 129 | "\n", 130 | "Generate a dataset $(X, y)$" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 9, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/html": [ 141 | "
\n", 142 | "\n", 155 | "\n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | "
(HTML table rendering stripped; the same DataFrame head appears in the text/plain output below)
\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " I_0 I_1 I_2 I_3 R_0 \\\n", 255 | "2020-02-16 05:29:14.539910 -3.941539 -1.955124 -1.247683 -0.665536 2.870924 \n", 256 | "2020-02-16 05:30:14.539910 -2.882175 -1.822702 -0.568862 0.103451 2.196651 \n", 257 | "2020-02-16 05:31:14.539910 -1.897824 -0.659752 -0.575968 1.432049 1.647345 \n", 258 | "2020-02-16 05:32:14.539910 -2.574587 1.990887 0.383741 3.980372 3.637930 \n", 259 | "2020-02-16 05:33:14.539910 -1.885823 -2.601728 -1.325420 -0.736274 0.621126 \n", 260 | "\n", 261 | " R_1 R_2 R_3 N_0 N_1 \\\n", 262 | "2020-02-16 05:29:14.539910 0.706670 -0.144982 -1.498281 -0.229430 0.177231 \n", 263 | "2020-02-16 05:30:14.539910 0.966482 -0.527894 -1.100332 0.130209 -0.831310 \n", 264 | "2020-02-16 05:31:14.539910 0.800773 -0.995133 -1.899108 -1.667659 -0.005389 \n", 265 | "2020-02-16 05:32:14.539910 0.773705 -1.803899 -6.490407 0.105738 1.093880 \n", 266 | "2020-02-16 05:33:14.539910 0.755418 -0.237697 1.156626 -1.178807 0.069023 \n", 267 | "\n", 268 | " N_2 N_3 \n", 269 | "2020-02-16 05:29:14.539910 0.648948 -0.818646 \n", 270 | "2020-02-16 05:30:14.539910 1.484291 0.320911 \n", 271 | "2020-02-16 05:31:14.539910 2.347850 0.202494 \n", 272 | "2020-02-16 05:32:14.539910 -0.037027 -1.414238 \n", 273 | "2020-02-16 05:33:14.539910 0.454516 -0.522534 " 274 | ] 275 | }, 276 | "execution_count": 9, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "X, cont = get_test_data(n_features=12, n_informative=4, n_redundant=4, n_samples=5000)\n", 283 | "X.head()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# 8.1b\n", 291 | "\n", 292 | "Using the code presented in Section 8.6\n", 293 | "\n", 294 | "Apply a PCA transformation on X, which we denote $\\dot X$." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 10, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/html": [ 305 | "
\n", 306 | "\n", 319 | "\n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | "
(HTML table rendering stripped; the same PCA DataFrame head appears in the text/plain output below)
\n", 385 | "
" 386 | ], 387 | "text/plain": [ 388 | " PCA_0 PCA_1 PCA_2 PCA_3 PCA_4 \\\n", 389 | "2020-02-16 05:29:14.539910 2.065379 0.146105 -1.182983 0.134590 0.530868 \n", 390 | "2020-02-16 05:30:14.539910 1.839769 0.822382 -0.949730 1.217152 -0.766848 \n", 391 | "2020-02-16 05:31:14.539910 1.955510 1.095325 0.280685 2.428007 0.794605 \n", 392 | "2020-02-16 05:32:14.539910 4.569843 1.217288 1.651004 -1.110274 1.290850 \n", 393 | "2020-02-16 05:33:14.539910 0.234743 0.652217 -1.114901 0.621042 0.784154 \n", 394 | "\n", 395 | " PCA_5 PCA_6 \n", 396 | "2020-02-16 05:29:14.539910 -0.363318 0.852387 \n", 397 | "2020-02-16 05:30:14.539910 -0.026661 1.015070 \n", 398 | "2020-02-16 05:31:14.539910 -0.732763 1.020887 \n", 399 | "2020-02-16 05:32:14.539910 -0.047355 0.666502 \n", 400 | "2020-02-16 05:33:14.539910 -0.946805 0.232960 " 401 | ] 402 | }, 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "Xdot = pd.DataFrame(get_orthogonal_features(X), index=X.index).add_prefix(\"PCA_\")\n", 410 | "Xdot.head()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "# 8.1c\n", 418 | "\n", 419 | "Using the code presented in Section 8.6\n", 420 | "\n", 421 | "Compute MDI, MDA, and SFI feature importance on $(\\dot X, y)$, where the base estimator is a RF." 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "test_data_func(Xdot, cont, '_8.1c')" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "![title](img/MDI_feat_imp_8.1c2.png)\n", 438 | "![title](img/MDA_feat_imp_8.1c2.png)\n", 439 | "![title](img/SFI_feat_imp_8.1c2.png)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# 8.1d\n", 447 | "\n", 448 | "Using the code presented in Section 8.6\n", 449 | "\n", 450 | "Do the three methods agree on what features are important? Why?" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "**A: PCA successfully helped us reduce our data from 12 features to 7 and across those 7 features, our 3 feature importance methods agreed that the first few principal components (PCA_{0, 1,2}) are the most important.**" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "# 8.2a\n", 465 | "\n", 466 | "From exercise 1, generate a new dataset $(\\ddot X, y)$, where $\\ddot X$ is a feature union of $X$ and $\\dot X$.\n", 467 | "\n", 468 | "Compute MDI, MDA, and SFI feature importance on $(\\ddot X, y)$, where the base estimator is a RF." 
469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 11, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "Xdotdot = pd.concat([X, Xdot], axis=1)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "test_data_func(Xdotdot, cont, '_8.2a')" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "![title](img/MDI_feat_imp_8.2a.png)\n", 494 | "![title](img/MDA_feat_imp_8.2a.png)\n", 495 | "![title](img/SFI_feat_imp_8.2a.png)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "# 8.2b\n", 503 | "\n", 504 | "From exercise 1, generate a new dataset $(\\ddot X, y)$, where $\\ddot X$ is a feature union of $X$ and $\\dot X$.\n", 505 | "\n", 506 | "Do the three methods agree on what features are important? Why?" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "**A: MDI & SFI rank untransformed informative & redundant features above noisy ones and the first principal components over latter ones. MDA in this case does not seem to be able to rank the features correctly.**" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "# 8.3a\n", 521 | "\n", 522 | "Take the results from exercise 2: \n", 523 | "\n", 524 | "Drop the most important features according to each method, resulting in a features matrix $\\dddot X$." 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 12, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "most_important_features = ['I_2', 'PCA_1', 'PCA_0', 'R_2', 'I_1']\n", 534 | "Xdotdotdot = Xdotdot.loc[:, ~Xdotdot.columns.isin(most_important_features)]" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "# 8.3b\n", 542 | "\n", 543 | "Take the results from exercise 2: \n", 544 | "\n", 545 | "Compute MDI, MDA, and SFI feature importance on $(\\dddot X, y)$, where the base estimator is a RF." 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "test_data_func(Xdotdotdot, cont, '_8.3b')" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "# 8.3c\n", 562 | "\n", 563 | "Take the results from exercise 2: \n", 564 | "\n", 565 | "Do you appreciate significant changes in the rankings of important features, relative to the results from exercise 2?" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "![title](img/MDI_feat_imp_8.3b.png)\n", 573 | "![title](img/MDA_feat_imp_8.3b.png)\n", 574 | "![title](img/SFI_feat_imp_8.3b.png)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "**A: MDI & SFI seem unperturbed, while MDA has shifted a lot and now assigns positive feature importance to all remaining first principal components and informative and redundant features.**\n" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "# 8.4a\n", 589 | "\n", 590 | "Using the code presented in Section 8.6:\n", 591 | "\n", 592 | "Generate a dataset $(X, y)$ of 1E6 observations, where 5 features are informative, 5 are redundant and 10 are noise." 
593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 4, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "n_samples = 10000\n", 602 | "X, cont = get_test_data(n_features=20, n_informative=5, n_redundant=5, n_samples=n_samples)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "# 8.4b\n", 610 | "\n", 611 | "Using the code presented in Section 8.6:\n", 612 | "\n", 613 | "Split $(X, y)$ into 10 datasets, each of 1E5 observations." 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "**A: Implemented in the next answer.**" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "# 8.4c\n", 628 | "\n", 629 | "Using the code presented in Section 8.6:\n", 630 | "\n", 631 | "Compute the parallelized feature importance on each of the 10 datasets." 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 5, 637 | "metadata": { 638 | "scrolled": false 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "def combine_imps(imps):\n", 643 | " return pd.DataFrame({\n", 644 | " 'mean': pd.concat([x['mean'] for x in imps], axis=1).mean(axis=1),\n", 645 | " 'std': pd.concat([x['std'] for x in imps], axis=1).mean(axis=1),\n", 646 | " })\n", 647 | "\n", 648 | "def chunked_test_data_func(X, cont, n_chunks=1, run='', allow_masking_effects=False, methods=['MDI', 'MDA', 'SFI']):\n", 649 | " from feature_importances_mp import feature_importances\n", 650 | " chunks = np.array_split(X.index, n_chunks)\n", 651 | "\n", 652 | " for method in methods:\n", 653 | " jobs = [{\n", 654 | " 'func': feature_importances,\n", 655 | " 'X': X.loc[chunk],\n", 656 | " 'cont': cont.loc[chunk],\n", 657 | " 'method': method,\n", 658 | " 'allow_masking_effects': allow_masking_effects,\n", 659 | " } for chunk in chunks]\n", 660 | "\n", 661 | " results = process_jobs(jobs, num_threads=32)\n", 662 | " \n", 663 | " imps, oobs, ooss = zip(*results)\n", 664 | "\n", 665 | " feature_imp = combine_imps(imps)\n", 666 | " oob_score, oos_score = pd.Series(oobs).mean(), pd.Series(ooss).mean()\n", 667 | "\n", 668 | " plot_feature_importance(\n", 669 | " feature_imp, oob_score=oob_score, oos_score=oos_score,\n", 670 | " savefig=True, output_path='img/{}_feat_imp{}.png'.format(method, run)\n", 671 | " )\n", 672 | "\n" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "chunked_test_data_func(X, cont, n_chunks=10, run='_8.4c_10chunks')" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "#### Parallelized feature importance:" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "![title](img/MDI_feat_imp_8.4c_10chunks.png)\n", 696 | "![title](img/MDA_feat_imp_8.4c_10chunks.png)\n", 697 | "![title](img/SFI_feat_imp_8.4c_10chunks.png)" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": {}, 703 | "source": [ 704 | "# 8.4d\n", 705 | "\n", 706 | "Using the code presented in Section 8.6:\n", 707 | "\n", 708 | "Compute the stacked feature importance on the combined dataset $(X, y)$." 
709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "test_data_func(X, cont, run='_8.4c_1chunk')" 718 | ] 719 | }, 720 | { 721 | "cell_type": "markdown", 722 | "metadata": {}, 723 | "source": [ 724 | "#### Stacked feature importance:" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "![title](img/MDI_feat_imp_8.4c_1chunk.png)\n", 732 | "![title](img/MDA_feat_imp_8.4c_1chunk.png)\n", 733 | "![title](img/SFI_feat_imp_8.4c_1chunk.png)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "# 8.4e \n", 741 | "\n", 742 | "Using the code presented in Section 8.6:\n", 743 | "\n", 744 | "What causes the discrepancy between the two? Which one is more reliable?" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "**A: Both methods generate similar rankings, with informative and redundant features above noisy ones, while the more computationally intensive (stacked) does so by a much wider margin.**" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "# 8.5\n", 759 | "\n", 760 | "Repeat all MDI calculations from exercises 1-4, but this time allow for masking effects. That means, do not set `max_features=int(1)` in Snippet 8.2. How do results differ as a consequence of this change? Why?" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": { 767 | "scrolled": false 768 | }, 769 | "outputs": [], 770 | "source": [ 771 | "X, cont = get_test_data(n_features=12, n_informative=4, n_redundant=4, n_samples=5000)\n", 772 | "Xdot = pd.DataFrame(get_orthogonal_features(X), index=X.index).add_prefix(\"PCA_\")\n", 773 | "test_data_func(Xdot, cont, '_8.5_1', allow_masking_effects=True, methods=['MDI'])" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "![title](img/MDI_feat_imp_8.5_1_2.png)" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": {}, 787 | "outputs": [], 788 | "source": [ 789 | "Xdotdot = pd.concat([X, Xdot], axis=1)\n", 790 | "test_data_func(Xdotdot, cont, '_8.5_2', allow_masking_effects=True, methods=['MDI'])" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "metadata": {}, 796 | "source": [ 797 | "![title](img/MDI_feat_imp_8.5_2.png)" 798 | ] 799 | }, 800 | { 801 | "cell_type": "markdown", 802 | "metadata": {}, 803 | "source": [ 804 | "**A: There is little change for the PCA-transformed features, while MDI seems to perform a lot better on the union of transformed and non-transformed features when allowing for masking effects.**" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [ 813 | "most_important_features = ['I_2', 'PCA_1', 'I_1', 'PCA_0', 'R_3']\n", 814 | "\n", 815 | "Xdotdotdot = Xdotdot.loc[:, ~Xdotdot.columns.isin(most_important_features)]\n", 816 | "test_data_func(Xdotdotdot, cont, '_8.5_3', allow_masking_effects=True, methods=['MDI'])" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "metadata": {}, 822 | "source": [ 823 | "![title](img/MDI_feat_imp_8.5_3.png)" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": {}, 830 | "outputs": [], 831 | "source": [ 832 | "n_samples = 10000\n", 
833 | "X, cont = get_test_data(n_features=20, n_informative=5, n_redundant=5, n_samples=n_samples)\n", 834 | "\n", 835 | "chunked_test_data_func(X, cont, n_chunks=10, run='_8.5_4', allow_masking_effects=True, methods=['MDI'])" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": {}, 841 | "source": [ 842 | "![title](img/MDI_feat_imp_8.5_4.png)" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "test_data_func(X, cont, '_8.5_5', allow_masking_effects=True, methods=['MDI'])" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "![title](img/MDI_feat_imp_8.5_5.png)\n" 859 | ] 860 | }, 861 | { 862 | "cell_type": "markdown", 863 | "metadata": {}, 864 | "source": [ 865 | "**A: Allowing for masking effects still manages to rank PCA features correctly, however when untransformed redundant and noisy features are introduced, the feature importance methods quickly produce much worse results than when run when not allowing for masking effects. While stacked feature importance still does OK, parallelized feature importance also ranks some noisy above informative, and some redundant below most other features.**" 866 | ] 867 | } 868 | ], 869 | "metadata": { 870 | "kernelspec": { 871 | "display_name": "Python 3", 872 | "language": "python", 873 | "name": "python3" 874 | }, 875 | "language_info": { 876 | "codemirror_mode": { 877 | "name": "ipython", 878 | "version": 3 879 | }, 880 | "file_extension": ".py", 881 | "mimetype": "text/x-python", 882 | "name": "python", 883 | "nbconvert_exporter": "python", 884 | "pygments_lexer": "ipython3", 885 | "version": "3.6.7" 886 | } 887 | }, 888 | "nbformat": 4, 889 | "nbformat_minor": 2 890 | } 891 | -------------------------------------------------------------------------------- /Chapter 09 - Hyper-Parameter Tuning with Cross-Validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 49, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "import matplotlib.pyplot as mpl\n", 13 | "\n", 14 | "from collections import defaultdict\n", 15 | "from functools import reduce\n", 16 | "from path import Path\n", 17 | "from pprint import pprint\n", 18 | "\n", 19 | "%matplotlib inline\n", 20 | "mpl.style.use('ggplot')\n", 21 | "mpl.rcParams['figure.figsize'] = 16,6" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 50, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from sklearn.pipeline import Pipeline\n", 31 | "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", 32 | "from sklearn.ensemble import BaggingClassifier\n", 33 | "from scipy.stats import rv_continuous, kstest\n", 34 | "from cv import PurgedKFold\n", 35 | "\n", 36 | "# Code from Chapter 9\n", 37 | "\n", 38 | "class TheNewPipe(Pipeline):\n", 39 | " def fit(self, X, y, sample_weight=None, **fit_params):\n", 40 | " if sample_weight is not None:\n", 41 | " fit_params[self.steps[-1][0] + '__sample_weight'] = sample_weight\n", 42 | " return super(TheNewPipe, self).fit(X, y, **fit_params)\n", 43 | "\n", 44 | "def clfHyperFit(feat, lbl, t1, pipe_clf, param_grid, cv=3, bagging=[0, None, 1.0],\n", 45 | " rndSearchIter=0, n_jobs=-1, pctEmbargo=0, **fit_params):\n", 46 | " if set(lbl.values) == {0, 
1}:\n", 47 | " scoring = 'f1' # f1 for meta-labeling\n", 48 | " else:\n", 49 | " scoring = 'neg_log_loss' # symmetric towards all classes\n", 50 | " \n", 51 | " # 1) hyperparameter searching, on train data\n", 52 | " inner_cv = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo)\n", 53 | " if rndSearchIter == 0:\n", 54 | " gs = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False)\n", 55 | " else:\n", 56 | " gs = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_grid, scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False, n_iter=rndSearchIter)\n", 57 | " gs = gs.fit(feat, lbl, **fit_params).best_estimator_\n", 58 | " # 2) fit validated model on the entirety of the data\n", 59 | " if bagging[1] > 0:\n", 60 | " gs = BaggingClassifier(bare_estimator=TheNewPipe(gs.steps), n_estimators=int(bagging[0]), max_samples=float(bagging[1]),\n", 61 | " max_features=float(bagging[2]), n_jobs=n_jobs)\n", 62 | " gs = gs.fit(feat, lbl, sample_weight=fit_params[gs.base_estimator.steps[-1][0] + '__sample_weight'])\n", 63 | " gs = Pipeline([('bag', gs)])\n", 64 | " return gs\n", 65 | " \n", 66 | "class logUniform_gen(rv_continuous):\n", 67 | " # random numbers log-uniformly distributed between 1 and e\n", 68 | " def _cdf(self, x):\n", 69 | " return np.log(x / self.a) / np.log(self.b / self.a)\n", 70 | " \n", 71 | "def logUniform(a=1, b=np.exp(1)):\n", 72 | " return logUniform_gen(a=a, b=b, name='logUniform')\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# 9.1a\n", 80 | "\n", 81 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 82 | "\n", 83 | "Use `GridSearchCV` on 10-fold-CV to find the `C, gamma` optimal hyper-parameters on a SVC with RBF kernel, where `param_grid={'C': [1E-2, 1E-1, 1, 10, 100], 'gamma': [1E-2, 1E-1, 1, 10, 100]}` and the scoring function is `neg_log_loss`" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 51, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from feature_imp import getTestData\n", 93 | "from sklearn.svm import SVC\n", 94 | "\n", 95 | "param_grid = {'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [1e-2, 1e-1, 1, 10, 100]}\n", 96 | "\n", 97 | "testing = False\n", 98 | "n_samples = 1000 if testing else 10000\n", 99 | "n_splits = 3 if testing else 10\n", 100 | "\n", 101 | "trnsX, cont = getTestData(n_features=10, n_informative=5, n_redundant=0, n_samples=n_samples)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 52, 107 | "metadata": { 108 | "scrolled": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "pipe_clf = SVC(probability=True)\n", 113 | "\n", 114 | "inner_cv = PurgedKFold(n_splits=n_splits, t1=cont.index.to_series())\n", 115 | "gs1 = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring='neg_log_loss', cv=inner_cv,\n", 116 | " n_jobs=-1, iid=False, return_train_score=True)\n", 117 | "\n", 118 | "gs1 = gs1.fit(X=trnsX, y=cont['bin'])\n", 119 | "gs1_results = pd.DataFrame(gs1.cv_results_)\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# 9.1b\n", 127 | "\n", 128 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 129 | "\n", 130 | "How many nodes are there in the grid?" 
131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "# 9.1c\n", 138 | "\n", 139 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 140 | "\n", 141 | "How many fits did it take to find the optimal solution?" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 54, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "It took 25 fits\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "print(\"It took %s fits\" % len(gs1_results))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# 9.1d\n", 166 | "\n", 167 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 168 | "\n", 169 | "How long did it take to find this solution?" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 71, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "It took 328 seconds.\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "print(\"It took {:.0f} seconds.\".format(gs1_results['mean_fit_time'].sum()))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "# 9.1e\n", 194 | "\n", 195 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 196 | "\n", 197 | "How can you access the optimal result?" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 56, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,\n", 209 | " decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',\n", 210 | " max_iter=-1, probability=True, random_state=None, shrinking=True,\n", 211 | " tol=0.001, verbose=False)" 212 | ] 213 | }, 214 | "execution_count": 56, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "be1 = gs1.best_estimator_\n", 221 | "be1" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "# 9.1f\n", 229 | "\n", 230 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 231 | "\n", 232 | "What is the CV score of the optimal parameter combination?" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 77, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/html": [ 243 | "
\n", 244 | "\n", 257 | "\n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | "
16
mean_test_score-0.288744
params{'C': 10, 'gamma': 0.1}
\n", 275 | "
" 276 | ], 277 | "text/plain": [ 278 | " 16\n", 279 | "mean_test_score -0.288744\n", 280 | "params {'C': 10, 'gamma': 0.1}" 281 | ] 282 | }, 283 | "execution_count": 77, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "best1_idx = gs1_results['mean_test_score'].idxmax()\n", 290 | "best1 = gs1_results['mean_test_score'].max()\n", 291 | "gs1_results.iloc[best1_idx][['mean_test_score', 'params']].to_frame()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 78, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "The CV score is -0.289\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "print(\"The CV score is {:.3f}\".format(best1))" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "# 9.1g\n", 316 | "\n", 317 | "Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. \n", 318 | "\n", 319 | "How can you pass sample weights to the SVC?" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "# 9.2a\n", 327 | "\n", 328 | "Using the same dataset from exercise 1,\n", 329 | "\n", 330 | "Use `RandomizedSearchCV` on 10-fold-CV to find the `C, gamma` optimal hyper-parameters on a SVC with RBF kernel, where `param_distributions={'C': logUniform(a=1E-2, b=1E2), 'gamma': logUniform(a=1E-2, b=1E2)}, n_iter=25` and the scoring function is `neg_log_loss`" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 58, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "inner_cv = PurgedKFold(n_splits=n_splits, t1=cont.index.to_series())\n", 340 | "param_distributions = {'C': logUniform(a=1e-2, b=1e2), 'gamma': logUniform(a=1e-2, b=1e2)}\n", 341 | "n_iter = 25\n", 342 | "gs2 = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_distributions, scoring='neg_log_loss',\n", 343 | " cv=inner_cv, n_jobs=-1, iid=False, n_iter=n_iter, return_train_score=True)\n", 344 | "\n", 345 | "gs2 = gs2.fit(X=trnsX, y=cont['bin'])\n", 346 | "gs2_results = pd.DataFrame(gs2.cv_results_)\n" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "# 9.2b\n", 354 | "\n", 355 | "Using the same dataset from exercise 1,\n", 356 | "\n", 357 | "How long did it take to find this solution?" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 74, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "It took 328 seconds.\n" 370 | ] 371 | } 372 | ], 373 | "source": [ 374 | "print(\"It took {:.0f} seconds.\".format(gs1_results['mean_fit_time'].sum()))" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# 9.2c\n", 382 | "\n", 383 | "Using the same dataset from exercise 1,\n", 384 | "\n", 385 | "Is the optimal parameter combination similar to the one found in exercise 1?" 
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 60, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "SVC(C=10.7875109732391, cache_size=200, class_weight=None, coef0=0.0,\n", 397 | " decision_function_shape='ovr', degree=3, gamma=0.07549547952136182,\n", 398 | " kernel='rbf', max_iter=-1, probability=True, random_state=None,\n", 399 | " shrinking=True, tol=0.001, verbose=False)" 400 | ] 401 | }, 402 | "execution_count": 60, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "be2 = gs2.best_estimator_\n", 409 | "be2" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "# 9.2d\n", 417 | "\n", 418 | "Using the same dataset from exercise 1,\n", 419 | "\n", 420 | "What is the CV score of the optimal parameter combination? How does it compare to the CV score from exercise 1?" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 79, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "The CV score is -0.282, and therefore higher than -0.289 from the first exercise.\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "best2 = gs2_results['mean_test_score'].max()\n", 438 | "print(\"The CV score is {:.3f}, and therefore higher than {:.3f} from the first exercise.\".format(best2, best1))" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "# 9.3a\n", 446 | "\n", 447 | "From exercise 1,\n", 448 | "\n", 449 | "Compute the Sharpe ratio of the resulting in-sample forecasts, from point 1.a " 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 80, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "The Sharpe ratio is 0.85.\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "def sharpe(r):\n", 467 | " return r.mean() / r.std()\n", 468 | "\n", 469 | "predictions1 = be1.predict(trnsX)\n", 470 | "bin_returns = cont['bin'] * 2 - 1\n", 471 | "\n", 472 | "print(\"The Sharpe ratio is {:.2f}.\".format(sharpe(predictions1 * bin_returns)))" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "# 9.3b\n", 480 | "\n", 481 | "From exercise 1,\n", 482 | "\n", 483 | "Repeat point 1.a, this time with `accuracy` as the scoring function. Compute the in-sample forecasts derived from the hyper-tuned parameters." 
484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 82, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "The Sharpe ratio is 0.85.\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "inner_cv = PurgedKFold(n_splits=10, t1=cont.index.to_series())\n", 501 | "gs3 = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring='accuracy', cv=inner_cv,\n", 502 | " n_jobs=-1, iid=False, return_train_score=True)\n", 503 | "\n", 504 | "gs3 = gs3.fit(X=trnsX, y=cont['bin'])\n", 505 | "gs3_results = pd.DataFrame(gs3.cv_results_)\n", 506 | "be3 = gs3.best_estimator_\n", 507 | "\n", 508 | "predictions3 = be3.predict(trnsX)\n", 509 | "\n", 510 | "print(\"The Sharpe ratio is {:.2f}.\".format(sharpe(predictions3 * bin_returns)))" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "# 9.3c\n", 518 | "\n", 519 | "What scoring method leads to higher (in-sample) Sharpe ratio?" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "**A: In this instance GridSearchCV with either accuracy or neg_log_loss picks the same set of parameters.**" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "# 9.4a\n", 534 | "\n", 535 | "From exercise 2,\n", 536 | "\n", 537 | "Compute the Sharpe ratio of the resulting in-sample forecasts, from point 2.a " 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 83, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "The Sharpe ratio is 0.81.\n" 550 | ] 551 | } 552 | ], 553 | "source": [ 554 | "predictions2 = be2.predict(trnsX)\n", 555 | "\n", 556 | "print(\"The Sharpe ratio is {:.2f}.\".format(sharpe(predictions2 * bin_returns)))" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "# 9.4b\n", 564 | "\n", 565 | "From exercise 2,\n", 566 | "\n", 567 | "Repeat point 2.a, this time with `accuracy` as the scoring function. Compute the in-sample forecasts derived from the hyper-tuned parameters." 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 84, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "The Sharpe ratio is 0.78.\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "gs4 = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_distributions, scoring='accuracy', cv=inner_cv, n_jobs=-1, iid=False, n_iter=n_iter)\n", 585 | "\n", 586 | "gs4 = gs4.fit(X=trnsX, y=cont['bin'])\n", 587 | "be4 = gs4.best_estimator_\n", 588 | "\n", 589 | "predictions4 = be4.predict(trnsX)\n", 590 | "\n", 591 | "sharpe(predictions4 * bin_returns)\n", 592 | "print(\"The Sharpe ratio is {:.2f}.\".format(sharpe(predictions4 * bin_returns)))" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "# 9.4c\n", 600 | "\n", 601 | "From exercise 2,\n", 602 | "\n", 603 | "What scoring method leads to higher (in-sample) Sharpe ratio?" 
604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "**A: For randomized search, negative log-loss leads to higher in-sample Sharpe ratio.**" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "# 9.5a\n", 618 | "\n", 619 | "Read the definition of log loss, $L[Y,P]$.\n", 620 | "\n", 621 | "Why is the scoring function `neg_log_loss` defined as the negative log loss, $-L[Y,P]$?" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "**A: Because for most it's more intuitive to maximize a scoring function.**" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "# 9.5b\n", 636 | "\n", 637 | "Read the definition of log loss, $L[Y,P]$.\n", 638 | "\n", 639 | "What would be the outcome of maximizing the log loss, rather than the negitive log loss?" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "**A: I'd expect this to select for the model with the least predictive power.**" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "# 9.6\n", 654 | "\n", 655 | "Consider an investment strategy that sizes its bets equally, regardless of the forecast's confidence. In this case, what is the more appropriate scoring function for hyper-parameter tuning, accuracy or cross-entropy loss?" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [ 662 | "**A: Accuracy accounts equally for erronous predictions with high or low probabilities while Log loss computes the log-likelihood of the classifier given the true label, which takes predictions' probabilities into account.**\n", 663 | "\n" 664 | ] 665 | } 666 | ], 667 | "metadata": { 668 | "kernelspec": { 669 | "display_name": "Python 3", 670 | "language": "python", 671 | "name": "python3" 672 | }, 673 | "language_info": { 674 | "codemirror_mode": { 675 | "name": "ipython", 676 | "version": 3 677 | }, 678 | "file_extension": ".py", 679 | "mimetype": "text/x-python", 680 | "name": "python", 681 | "nbconvert_exporter": "python", 682 | "pygments_lexer": "ipython3", 683 | "version": "3.5.6" 684 | } 685 | }, 686 | "nbformat": 4, 687 | "nbformat_minor": 2 688 | } 689 | -------------------------------------------------------------------------------- /Chapter 11 - The Dangers of Backtesting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 11.1\n", 8 | "\n", 9 | "An analyst fits an RF classifier where some of the features include seasonally adjusted employment data. He aligns with January data the seasonally adjusted value of January, etc. What \"sin\" has he committed?" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "**A: January data is released in February and generally revised 3 times thereafter, i.e. the lookahead sin.**" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# 11.2\n", 24 | "\n", 25 | "An analyst develops an ML algorithm where he generates a signal using closing prices, and exectude at the close. What's the sin?" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "**A: Closing prices aren't known until close, i.e. 
the lookahead & transaction cost sins.**" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "# 11.3\n", 40 | "\n", 41 | "There is a 98.51% correlation between total revenue generated by arcades and computer science doctorates awarded in the United States. As the number of doctorates is expected to grow, should we invest in arcades companies? If not, what's the sin?" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "**A: If there's no plausible way to explain why these are correlated, it's probably just a coincidence, i.e. the storytelling sin.**" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# 11.4\n", 56 | "\n", 57 | "The *Wall Street Journal* has reported that September is the only month of the year that has negative stock returns, looking back 20, 50, and 100 years. Should we sell stock at the end of August? If not, what's the sin?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "**A: I don't think there's much of a sin here, seasonal patters (across all timeframes) such as these are very real in the markets.**" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# 11.5\n", 72 | "\n", 73 | "We download P/E ratios from Bloomberg, rank stocks every month, sell the top quartile, and buy the ~~long~~ bottom quartile. Performance is amazing. What's the sin?" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "**A: Knowing whether we would've been able to and how much it would've cost to short these issues is hard, i.e. shorting sin.**" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.5.6" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /Chapter 12 - Backtesting through Cross-Validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.1\n", 8 | "\n", 9 | "Suppose that you develop a momentum strategy on a futures contract, where the forecast is based on an AR(1) process. You backtest this strategy using the WF method, and the Sharpe ratio is 1.5. You then repeat the backtest on the reversed series and achieve a Sharpe ratio of -1.5. What would be the mathematical grounds for disregarding the second result, if any?" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "**A: While the second result is certainly not encouring, it also -- like the first one -- only tests a single scenario, and only that one specific path.**" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# 12.2\n", 24 | "\n", 25 | "You develop a mean-reverting strategy on a futures contract. Your WF backtest achieves a Sharpe ratio of 1.5. You increase the length of the warm-up period, and the Sharpe ration drops to 0.7. 
You go ahead and present only the result with the higher Sharpe ratio, arguing that a strategy with a shorter warm-up is more realistic. Is this selection bias?" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "**A: WF's high variance leads to false discoveries, since, as we can see here, researchers will select backtests with the maximum expected SR.**" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "# 12.3\n", 40 | "\n", 41 | "Your strategy achieves a Sharpe ratio of 1.5 on a WF backtest, but a Sharpe ratio of 0.7 on a CV backtest. You go ahead and present only the result with the higher Sharpe ratio, arguing that the WF backtest is historically accurate, while the CV backtest is a scenario simulation, or an inferential exercise. Is this selection bias?" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "**A: I would argue, yes for the reasons above.**" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# 12.4\n", 56 | "\n", 57 | "Your strategy produces 100,000 forecasts over time. You would like to derive the CPCV distribution of Sharpe ratios by generating 1,000 paths. What are the possible combinations of parameters ($N,k$) that will allow you to achieve that?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 48, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "1001.0" 69 | ] 70 | }, 71 | "execution_count": 48, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "from scipy.special import comb\n", 78 | "\n", 79 | "N = 15\n", 80 | "k = 5\n", 81 | "k / N * comb(N, N - k)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "**A: One solution would be (N, k) = (15, 5)**" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "# 12.5\n", 96 | "\n", 97 | "You discover a strategy that achieves a Sharpe ratio of 1.5 in a WF backtest. You write a paper explaining the theory that would justify such result, and submit to an academic journal. The editor replies that one referee has requested you repeat your backtest using a CPCT method with $N$ = 100 and $k$ = 2, including your code and full datasets. You follow these instructions, and the mean Sharpe ratio is -1 with a standard deviation of 0.5. Furious, you do not reply, but instead withdraw your submission, and resubmit in a different journal of higher impact factor. After 6 months, your paper is accepted. You appease your consience thinking that, if the discovery is false, it is the journal's fault for not having requested a CPCV test. You think, \"It cannot be unethical, since it is permitted, and everybody does it.\" What are the arguments, scientific or ethical, to justify your actions?" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "**A: While my knowledge of publishing processes in academia is limited, I would assign the responsibility of quality control to the journal. 
Fiction is a viable literary genre and probably shouldn't be completely abolished.**" 105 | ] 106 | } 107 | ], 108 | "metadata": { 109 | "kernelspec": { 110 | "display_name": "Python 3", 111 | "language": "python", 112 | "name": "python3" 113 | }, 114 | "language_info": { 115 | "codemirror_mode": { 116 | "name": "ipython", 117 | "version": 3 118 | }, 119 | "file_extension": ".py", 120 | "mimetype": "text/x-python", 121 | "name": "python", 122 | "nbconvert_exporter": "python", 123 | "pygments_lexer": "ipython3", 124 | "version": "3.5.6" 125 | } 126 | }, 127 | "nbformat": 4, 128 | "nbformat_minor": 2 129 | } 130 | -------------------------------------------------------------------------------- /Chapter 13 - Backtesting on Synthetic Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from random import gauss\n", 12 | "from itertools import product\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "\n", 16 | "%load_ext autoreload\n", 17 | "%autoreload 2\n", 18 | "\n", 19 | "def main():\n", 20 | " rPT = rSLm = np.linspace(0, 10, 21)\n", 21 | " count = 0\n", 22 | " for prod_ in product([10, 5, 0, -5, -10], [5, 10, 25, 50, 100]):\n", 23 | " count += 1\n", 24 | " coeffs = {'forecast': prod_[0], 'hl': prod_[1], 'sigma': 1}\n", 25 | " output = batch(coeffs, nIter=1e5, maxHP=100, rPT=rPT, rSLm=rSLm)\n", 26 | " return output\n", 27 | "\n", 28 | "def batch(coeffs, nIter=1e5, maxHP=100, rPT=np.linspace(0.5, 10, 20), rSLm=np.linspace(0.5, 10, 20), seed=0):\n", 29 | " phi = 2 ** (-1.0 / coeffs['hl'])\n", 30 | " output1 = []\n", 31 | " for comb_ in product(rPT, rSLm):\n", 32 | " output2 = []\n", 33 | " for iter_ in range(int(nIter)):\n", 34 | " p, hp, count = seed, 0, 0\n", 35 | " while True:\n", 36 | " p = (1 - phi) * coeffs['forecast'] + phi * p + coeffs['sigma'] * gauss(0, 1)\n", 37 | " cP = p - seed\n", 38 | " hp += 1\n", 39 | " if cP > comb_[0] or cP < -comb_[1] or hp > maxHP:\n", 40 | " output2.append(cP)\n", 41 | " break\n", 42 | " mean, std = np.mean(output2), np.std(output2)\n", 43 | " print(comb_[0], comb_[1], mean, std, mean / std)\n", 44 | " output1.append((comb_[0], comb_[1], mean, std, mean / std))\n", 45 | " return output1\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "# 13.1.a\n", 53 | "\n", 54 | "Suppose you are an execution trader. A client class you with an order to cover a short position she entered at a price of 100. She gives you two exit conditions: profit-taking at 90 and a stop-loss at 105.\n", 55 | "\n", 56 | "Assuming the client belieces the price follows an O-U process, are these levels reasonable? " 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "**A: These levels appear to work well for Forecast values of -5 and all H-L values and Forecast values of -10 with H-L values between 25 and 100.**" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# 13.1.b\n", 71 | "\n", 72 | "Suppose you are an execution trader. A client class you with an order to cover a short position she entered at a price of 100. 
She gives you two exit conditions: profit-taking at 90 and a stop-loss at 105.\n", 73 | "\n", 74 | "Can you think of an alternative stochastic process under which these levels make sense?" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# 13.2.a\n", 82 | "\n", 83 | "Fit the time series of dollar bars of E-mini S&P 500 futures to an O-U process. Given those parameters:\n", 84 | "\n", 85 | "Produce a heat-map of Sharpe ratios for various profit-taking and stop-loss levels.\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 2, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "def gen_the_heat(forecast, hl, sigma=1):\n", 95 | " rPT = rSLm = np.linspace(0, 10, 21)\n", 96 | " coeffs = {'forecast': forecast, 'hl': hl, 'sigma': sigma}\n", 97 | " output = batch(coeffs, nIter=1e5, maxHP=100, rPT=rPT, rSLm=rSLm)\n", 98 | " return output\n", 99 | "\n", 100 | "def pump_heatmap(coeffs, outputs):\n", 101 | " heatdf = pd.DataFrame(outputs)\n", 102 | " heatdfp = heatdf.pivot(1, 0, 4)\n", 103 | " plt.subplots() # give us a new figure\n", 104 | " return sns.heatmap(heatdfp.sort_index(ascending=False))\n", 105 | "\n", 106 | " " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stderr", 118 | "output_type": "stream", 119 | "text": [ 120 | "2019-11-22 10:33:51.311041 50.0% bbatch done after 2.48 minutes. Remaining 2.48 minutes..\r" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "from multiprocess import mpPandasObj\n", 126 | "from synthetic_data import process_batch\n", 127 | "runs = range(1,7)\n", 128 | "rPT = rSLm = np.linspace(0, 10, 21)\n", 129 | "coeffs_list = [{'forecast': x, 'hl': x, 'sigma': 1} for x in runs]\n", 130 | "\n", 131 | "ret = mpPandasObj(process_batch, ('coeffs_list', coeffs_list), numThreads=6, linMols=True, nIter=1e4, maxHP=100, rPT=rPT, rSLm=rSLm)\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "for coeffs, outputs in [x[0] for x in ret[1:]]:\n", 141 | " ax = pump_heatmap(coeffs, outputs)\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "# 13.2.b\n", 149 | "\n", 150 | "Fit the time series of dollar bars of E-mini S&P 500 futures to an O-U process. Given those parameters:\n", 151 | "\n", 152 | "What is the OTR?\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "**A: TODO: Figure out a good way of fitting to an O-U process.**\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "# 13.3.a\n", 167 | "\n", 168 | "Repeat exercise 2, this time on a time series of dollar bars of\n", 169 | "\n", 170 | "10-year U.S. Treasury Note futures\n" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# 13.3.b\n", 178 | "\n", 179 | "Repeat exercise 2, this time on a time series of dollar bars of\n", 180 | "\n", 181 | "WTI Crude Oil futures\n" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# 13.3.c\n", 189 | "\n", 190 | "Repeat exercise 2, this time on a time series of dollar bars of\n", 191 | "\n", 192 | "Are the results significantly different? 
Does this justify having execution traders specialized by product?\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "# 13.4.a\n", 200 | "\n", 201 | "Repeat exercise 2 after splitting the time series into two parts:\n", 202 | "\n", 203 | "The first time series ends on 3/15/2009.\n" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# 13.4.b\n", 211 | "\n", 212 | "Repeat exercise 2 after splitting the time series into two parts:\n", 213 | "\n", 214 | "The second time series starts on 3/16/2009.\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "# 13.4.c\n", 222 | "\n", 223 | "Repeat exercise 2 after splitting the time series into two parts:\n", 224 | "\n", 225 | "Are the OTRs significantly different?\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "# 13.5\n", 233 | "\n", 234 | "How long do you estimate it would take to derive OTRs on the 100 most liquid futures contracts worldwide? Considering the results from exercise 4, how often do you think you may have to re-calibrate the OTRs? Does it make sense to precompute this data?" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "# 13.6\n", 242 | "\n", 243 | "Parallelize Snippets 13.1 and 13.2 using the `mpEngine` module described in Chapter 20." 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "**A: Code in synthetic_data.py**" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.5.6" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | -------------------------------------------------------------------------------- /Chapter 15 - Understanding Strategy Risk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib\n", 12 | "import matplotlib.pyplot as mpl\n", 13 | "\n", 14 | "%matplotlib inline\n", 15 | "mpl.style.use('ggplot')\n", 16 | "mpl.rcParams['figure.figsize'] = 16,6\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# 15.1.a\n", 24 | "\n", 25 | "A portfolio manager intends to launch a strategy that targets an annualized SR of 2. Bets have a precision rate of 60%, with weekly frequency. The exit conditions are 2% for profit taking and -2% for stop-loss.\n", 26 | "\n", 27 | "Is this strategy viable?" 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def run_sr_trials(p, pt=1, sl=1, trials=100000):\n", 37 | " out = []\n", 38 | " for i in range(trials):\n", 39 | " rnd = np.random.binomial(n=1, p=p)\n", 40 | " x = (pt if rnd == 1 else -sl)\n", 41 | " out.append(x)\n", 42 | " return np.mean(out) / np.std(out)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 32, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "As stated the strategy is expected to achieve an annualized SR of ~1.488.\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "p = 0.6\n", 60 | "freq = 52\n", 61 | "sr = run_sr_trials(p, trials=100000)\n", 62 | "print(\"As stated the strategy is expected to achieve an annualized SR of ~{:.3f}.\".format(float(np.sqrt(freq) * sr)))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# 15.1.b\n", 70 | "\n", 71 | "A portfolio manager intends to launch a strategy that targets an annualized SR of 2. Bets have a precision rate of 60%, with weekly frequency. The exit conditions are 2% for profit taking and -2% for stop-loss.\n", 72 | "\n", 73 | "*Ceteris paribus*, what is the required precision rate that would make the strategy profitable?" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 12, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Achieving an annualized SR of 2 while keeping the other values the same would require a precision rate of 0.642.\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "for p_ in np.linspace(0.55, 1.0, 50):\n", 91 | " sr = run_sr_trials(p_, trials=100000)\n", 92 | " if np.sqrt(freq) * sr > 2:\n", 93 | " break\n", 94 | " \n", 95 | "print(\"Achieving an annualized SR of 2 while keeping the other values the same would require a precision rate of {:.3f}.\".format(p_))\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# 15.1.c\n", 103 | "\n", 104 | "A portfolio manager intends to launch a strategy that targets an annualized SR of 2. Bets have a precision rate of 60%, with weekly frequency. The exit conditions are 2% for profit taking and -2% for stop-loss.\n", 105 | "\n", 106 | "For what betting frequency is the target achievable?" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 16, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Achieving an annualized SR of 2 while keeping the other values the same would require a betting frequency of 96 trades per year.\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "sr = run_sr_trials(p, trials=1000000)\n", 124 | "target_sr = 2\n", 125 | "freq_ = (target_sr / sr) ** 2\n", 126 | "print(\"Achieving an annualized SR of 2 while keeping the other values the same would require a betting frequency of {:.0f} trades per year.\".format(freq_))\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "# 15.1.d\n", 134 | "\n", 135 | "A portfolio manager intends to launch a strategy that targets an annualized SR of 2. Bets have a precision rate of 60%, with weekly frequency. The exit conditions are 2% for profit taking and -2% for stop-loss.\n", 136 | "\n", 137 | "For what profit-taking threshold is the target achievable?" 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 19, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Achieving an annualized SR of 2 while keeping the other values the same would require a profit take limit of 2.30%.\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "pt = 2\n", 155 | "sl = 2\n", 156 | "for pt_ in np.linspace(2, 4, 200):\n", 157 | " sr = run_sr_trials(p, pt_, sl, 100000)\n", 158 | " if np.sqrt(freq) * sr > 2:\n", 159 | " break\n", 160 | " \n", 161 | "print(\"Achieving an annualized SR of 2 while keeping the other values the same would require a profit take limit of {:.2f}%.\".format(pt_))\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# 15.1.e\n", 169 | "\n", 170 | "A portfolio manager intends to launch a strategy that targets an annualized SR of 2. Bets have a precision rate of 60%, with weekly frequency. The exit conditions are 2% for profit taking and -2% for stop-loss.\n", 171 | "\n", 172 | "What would be an alternative stop-loss?" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 20, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Achieving an annualized SR of 2 while keeping the other values the same would require a stop loss limit of 1.74%.\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "for sl_ in np.linspace(2, 1, 200):\n", 190 | " sr = run_sr_trials(p, pt, sl_, 100000)\n", 191 | " if np.sqrt(freq) * sr > 2:\n", 192 | " break\n", 193 | " \n", 194 | "print(\"Achieving an annualized SR of 2 while keeping the other values the same would require a stop loss limit of {:.2f}%.\".format(sl_))\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "# 15.2.a\n", 202 | "\n", 203 | "Following up on the strategy from exercise 1.\n", 204 | "\n", 205 | "What is the sensitivity of SR to a 1% change in each parameter?" 
206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 11, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "freq 0.003589926917184778\n", 218 | "p 0.02066441077148859\n", 219 | "pt 0.007574011138762322\n", 220 | "sl 0.006222697554722991\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "p = 0.55\n", 226 | "pt = 2\n", 227 | "sl = 2\n", 228 | "freq = 52\n", 229 | "\n", 230 | "def jiggle(v):\n", 231 | " return [v * 0.99, v, v * 1.01]\n", 232 | "\n", 233 | "print('freq', pd.Series([np.sqrt(freq_) * run_sr_trials(p, pt, sl, 10000000) for freq_ in jiggle(freq)]).pct_change().std())\n", 234 | "print('p ', pd.Series([np.sqrt(freq) * run_sr_trials(p_, pt, sl, 10000000) for p_ in jiggle(p)]).pct_change().std())\n", 235 | "print('pt ', pd.Series([np.sqrt(freq) * run_sr_trials(p, pt_, sl, 10000000) for pt_ in jiggle(pt)]).pct_change().std())\n", 236 | "print('sl ', pd.Series([np.sqrt(freq) * run_sr_trials(p, pt, sl_, 10000000) for sl_ in jiggle(sl)]).pct_change().std())\n", 237 | "\n" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 144, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "The SR is most sensitive to changes in precision\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "print(\"The SR is most sensitive to changes in precision.\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "# 15.2.b\n", 262 | "\n", 263 | "Following up on the strategy from exercise 1.\n", 264 | "\n", 265 | "Given these sensitivies, and assuming that all parameters are equally hard to improve, which one offers the lowest hanging fruit?" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "**A: Under these assumptions -- precision.**" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "# 15.2.c\n", 280 | "\n", 281 | "Following up on the strategy from exercise 1.\n", 282 | "\n", 283 | "Does changing any of the parameters in exercise 1 impact the others? For example, does changing the betting frequency modify the precision rate, etc.?" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "**A: In this experiment the parameters function independent of one another. In real life, changing profit-take & stop-loss parameters would very likely impact precision and betting frequency.**" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# 15.3.a\n", 298 | "\n", 299 | "Suppose a strategy that generates monthly bets over two years, with returns following a mixture of two Gaussians distributions. The first distribution has a mean of -0.1 and a standard deviation of 0.12. The second distribution has a mean of 0.06 and a standard deviation of 0.03. The probability that a draw comes from the first distribution is 0.15.\n", 300 | "\n", 301 | "Following Lopez de Prado and Peijan [2004] and Lopez de Prado and Foreman [2014], derive the first 4 moments for the mixture's returns." 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "# 15.3.b\n", 309 | "\n", 310 | "Suppose a strategy that generates monthly bets over two years, with returns following a mixture of two Gaussians distributions. 
The first distribution has a mean of -0.1 and a standard deviation of 0.12. The second distribution has a mean of 0.06 and a standard deviation of 0.03. The probability that a draw comes from the first distribution is 0.15.\n", 311 | "\n", 312 | "What is the annualized SR?" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 22, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "import scipy.stats as ss\n", 322 | "\n", 323 | "def binHR(sl, pt, freq, tSR):\n", 324 | " '''\n", 325 | " Given a trading rule characterized by the parameters {sl, pt, freq},\n", 326 | " what's the min precision p required to achieve a Sharpe ratio tSR?\n", 327 | " 1) Inputs\n", 328 | " sl: stop loss threshold\n", 329 | " pt: profit taking threshold\n", 330 | " freq: number of bets per year\n", 331 | " tSR: target annual Sharpe ratio\n", 332 | " 2) Output\n", 333 | " p: the min precision rate p required to achieve tSR\n", 334 | " '''\n", 335 | " a = (freq + tSR ** 2) * (pt - sl) ** 2\n", 336 | " b = (2 * freq * sl - tSR ** 2 * (pt - sl)) * (pt - sl)\n", 337 | " c = freq * sl ** 2\n", 338 | " p = (-b + (b ** 2 - 4 * a * c) ** 0.5) / (2.0 * a)\n", 339 | " return p\n", 340 | "\n", 341 | "def binFreq(sl, pt, p, tSR):\n", 342 | " ''' \n", 343 | " Given a trading rule characterized by the parameters {sl, pt, freq}, what's the number\n", 344 | " of bets/year needed to achieve a Sharpe ratio tSR with precision rate p?\n", 345 | " Note: Equation with radicals, check for extraneous solution. \n", 346 | " 1) Inputs\n", 347 | " sl: stop loss threshold\n", 348 | " pt: profit taking threshold\n", 349 | " p: precision rate\n", 350 | " tSR: target annual Sharpe ratio\n", 351 | " 2) Output\n", 352 | " freq: number of bets per year needed\n", 353 | " '''\n", 354 | " freq = (tSR * (pt - sl)) ** 2 * p * (1 - p) / ((pt - sl) * p + sl) ** 2 # possible extraneous\n", 355 | " if not np.isclose(binSR(sl, pt, freq, p), tSR):\n", 356 | " return\n", 357 | " return freq\n", 358 | "\n", 359 | "def mixGaussians(mu1, mu2, sigma1, sigma2, prob1, nObs):\n", 360 | " # Random draws from a mixture of gaussians\n", 361 | " ret1 = np.random.normal(mu1, sigma1, size=int(nObs * prob1))\n", 362 | " ret2 = np.random.normal(mu2, sigma2, size=int(nObs) - ret1.shape[0])\n", 363 | " ret = np.append(ret1, ret2, axis=0)\n", 364 | " np.random.shuffle(ret)\n", 365 | " return ret\n", 366 | "\n", 367 | "def probFailure(ret, freq, tSR):\n", 368 | " # Derive probability that the strategy may fail\n", 369 | " rPos, rNeg = ret[ret > 0].mean(), ret[ret <= 0].mean()\n", 370 | " p = ret[ret > 0].shape[0] / float(ret.shape[0])\n", 371 | " thresP = binHR(rNeg, rPos, freq, tSR)\n", 372 | " risk = ss.norm.cdf(thresP, p, p * (1 - p))\n", 373 | " return risk\n", 374 | "\n", 375 | "mu1, mu2, sigma1, sigma2, prob1, nObs = -0.1, 0.06, 0.12, 0.03, 0.15, 100000\n", 376 | "ret = mixGaussians(mu1, mu2, sigma1, sigma2, prob1, nObs)\n" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 23, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "The annualized SR is 1.58\n" 389 | ] 390 | } 391 | ], 392 | "source": [ 393 | "sr = np.mean(ret) / np.std(ret)\n", 394 | "asr = np.sqrt(12) * sr\n", 395 | "print(\"The annualized SR is {:.2f}.\".format(asr))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "# 15.3.c\n", 403 | "\n", 404 | "Suppose a strategy that generates monthly bets over two years, with 
returns following a mixture of two Gaussians distributions. The first distribution has a mean of -0.1 and a standard deviation of 0.12. The second distribution has a mean of 0.06 and a standard deviation of 0.03. The probability that a draw comes from the first distribution is 0.15.\n", 405 | "\n", 406 | "Using these moments, compute PSR[1] (see Chapter 14). At a 95% confidence interval, would you discard this strategy?" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 33, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "The PSR is 1.0.\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "from stats import psr\n", 424 | "ret = pd.Series(ret)\n", 425 | "print(\"The PSR is {}.\".format(psr(sr, len(ret), ret.skew(), ret.kurtosis())))" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "# 15.4\n", 433 | "\n", 434 | "Using Snippet 15.5, compute $P[p \\lt p_{\\theta*=1}]$ for the strategy described in exercise 3. At a significance level of 0.05, would you discard this strategy? Is this result consistent with $PSR[\\theta*]$?" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 31, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "The strategy is estimated to fail 41.09% of the time.\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "# 1) Parameters\n", 452 | "mu1, mu2, sigma1, sigma2, prob1, nObs = -0.1, 0.06, 0.12, 0.03, 0.15, 100000\n", 453 | "tSR, freq = asr, 12\n", 454 | "# 2) Generate sample from mixture\n", 455 | "ret = mixGaussians(mu1, mu2, sigma1, sigma2, prob1, nObs)\n", 456 | "# 3) Compute prob failure\n", 457 | "probF = probFailure(ret, freq, tSR)\n", 458 | "print(\"The strategy is estimated to fail {:.2%} of the time.\".format(probF))\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "# 15.5\n", 466 | "\n", 467 | "In general what result do you expect to be more accurate, $PSR[\\theta*]$ or $P[p \\lt p_{\\theta*=1}]$? How are these two methods complementary?" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "# 15.6.a\n", 475 | "\n", 476 | "Re-examine the results from Chapter 13, in light of what you have learned in this chapter. \n", 477 | "\n", 478 | "Does the asymmetry between profit taking and stop-loss thresholds in OTRs make sense?" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "# 15.6.b\n", 486 | "\n", 487 | "Re-examine the results from Chapter 13, in light of what you have learned in this chapter. \n", 488 | "\n", 489 | "What is the range of $p$ implied by Figure 13.1, for a daily betting frequency?" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "# 15.6.c\n", 497 | "\n", 498 | "Re-examine the results from Chapter 13, in light of what you have learned in this chapter. \n", 499 | "\n", 500 | "What is the range of $p$ implied by Figure 13.5, for a weekly betting frequency?" 
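Exercise 15.3.a above is left without a solution cell. For reference, the first four moments of a two-component Gaussian mixture follow in closed form from the components' raw moments; a minimal sketch, where the `mixture_moments` helper is an illustrative assumption rather than part of the repository:

```python
import numpy as np

def mixture_moments(mu, sigma, prob):
    # First four moments of a Gaussian mixture, from the components' raw moments:
    # E[X^k] = sum_i prob_i * E[X_i^k] for each component X_i ~ N(mu_i, sigma_i^2).
    mu, sigma, prob = map(np.asarray, (mu, sigma, prob))
    raw = np.array([
        mu,                                                   # E[X_i]
        mu ** 2 + sigma ** 2,                                 # E[X_i^2]
        mu ** 3 + 3 * mu * sigma ** 2,                        # E[X_i^3]
        mu ** 4 + 6 * mu ** 2 * sigma ** 2 + 3 * sigma ** 4,  # E[X_i^4]
    ])
    m1, m2, m3, m4 = raw @ prob
    var = m2 - m1 ** 2
    mu3 = m3 - 3 * m1 * m2 + 2 * m1 ** 3                      # third central moment
    mu4 = m4 - 4 * m1 * m3 + 6 * m1 ** 2 * m2 - 3 * m1 ** 4   # fourth central moment
    return m1, var, mu3 / var ** 1.5, mu4 / var ** 2          # mean, variance, skew, kurtosis

print(mixture_moments(mu=[-0.1, 0.06], sigma=[0.12, 0.03], prob=[0.15, 0.85]))
```

With the exercise's parameters this gives a monthly mean of about 0.036 and a standard deviation of about 0.079, so roughly sqrt(12) * 0.036 / 0.079 ≈ 1.58 annualized, consistent with the simulated SR in 15.3.b above, together with pronounced negative skewness (≈ -2.4) and heavy tails (kurtosis ≈ 10), which is why the PSR and failure-probability checks in 15.3.c and 15.4 matter.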
501 | ] 502 | } 503 | ], 504 | "metadata": { 505 | "kernelspec": { 506 | "display_name": "Python 3", 507 | "language": "python", 508 | "name": "python3" 509 | }, 510 | "language_info": { 511 | "codemirror_mode": { 512 | "name": "ipython", 513 | "version": 3 514 | }, 515 | "file_extension": ".py", 516 | "mimetype": "text/x-python", 517 | "name": "python", 518 | "nbconvert_exporter": "python", 519 | "pygments_lexer": "ipython3", 520 | "version": "3.5.6" 521 | } 522 | }, 523 | "nbformat": 4, 524 | "nbformat_minor": 2 525 | } 526 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Notes & Solutions to Advances in Financial Machine Learning 2 | 3 | I've been playing around with various trading strategies recently and one day stumbled upon [Advances in Financial Machine Learning my Marcos Lopez de Prado](https://www.amazon.com/Advances-Financial-Machine-Learning-Marcos/dp/1119482089): 4 | 5 | ![](https://i.imgur.com/QZYNwxx.jpg) 6 | 7 | 8 | Early on in the book, Marcos writes: 9 | 10 | >Investment management is one of the most multi-disciplinary areas of research, and this book reflects that fact. Understanding the various sections requires a practical knowledge of ML, market microstructure, portfolio management, mathematical finance, statistics, econometrics, linear algebra, convex optimization, discrete math, signal processing, information theory, object-oriented programming, parallel processing and supercomputing. 11 | 12 | And so while I can't quite claim all these competences (I would consider myself a beginner in 10 out of these 14 areas), I was still super-curious how one might go about algorithmic trading if they did and so embarked upon solving the exercises presented at the end of each chapter. The notebooks above contain these attempts at solutions. 13 | 14 | Thanks: 15 | -------------------------------- 16 | 17 | - **Huge thanks to Marcos for writing this book**. It's hard to explain how valuable it is, especially for someone like me. Marcos has synthesized 20 years of experience in financial mathematics and computer science into the most important & effective areas and then provided great code and guidance within them. In my view it stands completely alone in an industry shrouded in secrecy and elitism. He has helped me upgrade my thinking and toolkit 10x within just 2 weeks of working through the material. There were too many a-ha moments to count and I now have a much better picture of what I need to further learn & do to succeed in algorithmic trading. ❤️ 18 | - Many thanks also to https://github.com/hudson-and-thames/ for their [solutions](https://github.com/hudson-and-thames/research) and [mlfinlab package](https://github.com/hudson-and-thames/mlfinlab). Sometimes when nothing seemed to be working I was able to fall back on their solutions (where available) and implementations to sanity-check whether the bug was in my data, my understanding of the problem, my code or Marcos' code. (it was mostly #2 or #3) 19 | 20 | Notes: 21 | -------------------------------- 22 | 23 | - All of the questions and most of the code was transcribed from the book, slightly modified in places and made PEP-8 compliant. Most everything `camelCase` is Marcos' code, while `snake_case` is mine. 
24 | - While I have a lot of experience in Python, I do not in finance or math, so there are likely bugs in my results somewhere :) This was exacerbated by the ridiculous pace I put on myself to work through the chapters (about 1 per day). 25 | 26 | **Current state of completion:** 27 | 28 | Begun or Done 29 | - [ ] `[3/5]` Chapter 2 - Financial Data Structures 30 | - [x] `[5/5]` Chapter 3 - Meta-Labeling 31 | - [x] `[7/7]` Chapter 4 - Sample Weights 32 | - [ ] `[5/6]` Chapter 5 - Fractionally Differentiated Features 33 | - [x] `[5/5]` Chapter 6 - Ensemble Methods 34 | - [x] `[5/5]` Chapter 7 - Cross-Validation in Finance 35 | - [x] `[5/5]` Chapter 8 - Feature Importance 36 | - [x] `[6/6]` Chapter 9 - Hyper-Parameter Tuning with Cross-Validation 37 | - [ ] `[4/7]` Chapter 10 - Bet Sizing 38 | - [x] `[5/5]` Chapter 11 - The Dangers of Backtesting 39 | - [x] `[5/5]` Chapter 12 - Backtesting through Cross-Validation 40 | - [ ] `[2/6]` Chapter 13 - Backtesting on Synthetic Data 41 | - [x] `[7/7]` Chapter 14 - Backtest Statistics 42 | - [ ] `[4/6]` Chapter 15 - Understanding Strategy Risk 43 | - [ ] `[3/5]` Chapter 16 - Machine Learning Asset Allocation 44 | 45 | Open 46 | - [ ] `[0/5]` Chapter 17 - Structural Breaks 47 | - [ ] `[0/5]` Chapter 18 - Entropy Features 48 | - [ ] `[0/12]` Chapter 19 - Microstructural Effects 49 | - [ ] `[0/6]` Chapter 20 - Multiprocessing and Vectorization 50 | 51 | 52 | -------------------------------------------------------------------------------- /active_signals.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code from Chapter 10 of Advances in Financial Machine Learning 3 | ''' 4 | 5 | import pandas as pd 6 | from multiprocess import mpPandasObj 7 | 8 | def avgActiveSignals(signals, numThreads): 9 | # compute the average signal among those active 10 | # 1) time points where signals change (either one starts or one ends) 11 | tPnts = set(signals['t1'].dropna().values) 12 | tPnts = tPnts.union(signals.index.values) 13 | tPnts = list(tPnts) 14 | tPnts.sort() 15 | out = mpPandasObj(mpAvgActiveSignals, ('molecule', tPnts), numThreads, signals=signals) 16 | return out 17 | 18 | def mpAvgActiveSignals(signals, molecule): 19 | ''' 20 | At time loc, average signal among those still active. 21 | Signal is active if: 22 | a) issued before or at loc AND 23 | b) loc before signal's endtime, or endtime is still unknown (NaT). 24 | ''' 25 | out = pd.Series() 26 | for loc in molecule: 27 | df0 = (signals.index.values <= loc) & ((loc < signals['t1']) | pd.isnull(signals['t1'])) 28 | act = signals[df0].index 29 | if len(act) > 0: 30 | out[loc] = signals.loc[act, 'signal'].mean() 31 | else: 32 | out[loc] = 0 33 | return out -------------------------------------------------------------------------------- /cla_mlf.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/hudson-and-thames/mlfinlab 2 | # license: https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt 3 | 4 | ''' 5 | This module implements the famous Critical Line Algorithm for mean-variance portfolio 6 | optimisation. It is reproduced with modification from the following paper: 7 | `D.H. Bailey and M.L. Prado “An Open-Source Implementation of the Critical- Line Algorithm for 8 | Portfolio Optimization”,Algorithms, 6 (2013), 169-196. 
`_ 9 | ''' 10 | 11 | import numbers 12 | from math import log, ceil 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | class CLA: 18 | # pylint: disable=too-many-instance-attributes 19 | ''' 20 | CLA is a famous portfolio optimisation algorithm used for calculating the optimal allocation weights for a given 21 | portfolio. It solves the optimisation problem with constraints on each weight - lower and upper bounds on the weight 22 | value. This class can compute multiple types of solutions - the normal cla solution, minimum variance solution, 23 | maximum sharpe solution and finally the solution to the efficient frontier. 24 | ''' 25 | 26 | def __init__(self, weight_bounds=(0, 1), calculate_returns="mean"): 27 | ''' 28 | Initialise the storage arrays and some preprocessing. 29 | 30 | :param weight_bounds: (tuple) a tuple specifying the lower and upper bound ranges for the portfolio weights 31 | :param calculate_returns: (str) the method to use for calculation of expected returns. 32 | Currently supports "mean" and "exponential" 33 | ''' 34 | 35 | self.weight_bounds = weight_bounds 36 | self.calculate_returns = calculate_returns 37 | self.weights = list() 38 | self.lambdas = list() 39 | self.gammas = list() 40 | self.free_weights = list() 41 | self.expected_returns = None 42 | self.cov_matrix = None 43 | self.lower_bounds = None 44 | self.upper_bounds = None 45 | self.max_sharpe = None 46 | self.min_var = None 47 | self.efficient_frontier_means = None 48 | self.efficient_frontier_sigma = None 49 | 50 | @staticmethod 51 | def _infnone(number): 52 | ''' 53 | Converts a Nonetype object to inf 54 | 55 | :param number: (int/float/None) a number 56 | :return: (float) -inf or number 57 | ''' 58 | return float("-inf") if number is None else number 59 | 60 | def _init_algo(self): 61 | ''' 62 | Initial setting up of the algorithm. Calculates the first free weight of the first turning point. 63 | 64 | :return: (list, list) asset index and the corresponding free weight value 65 | ''' 66 | 67 | # Form structured array 68 | structured_array = np.zeros((self.expected_returns.shape[0]), dtype=[("id", int), ("mu", float)]) 69 | expected_returns = [self.expected_returns[i][0] for i in range(self.expected_returns.shape[0])] # dump array into list 70 | 71 | # Fill structured array 72 | structured_array[:] = list(zip(list(range(self.expected_returns.shape[0])), expected_returns)) 73 | 74 | # Sort structured array based on increasing return value 75 | expected_returns = np.sort(structured_array, order="mu") 76 | 77 | # First free weight 78 | index, weights = expected_returns.shape[0], np.copy(self.lower_bounds) 79 | while np.sum(weights) < 1: 80 | index -= 1 81 | 82 | # Set weights one by one to the upper bounds 83 | weights[expected_returns[index][0]] = self.upper_bounds[expected_returns[index][0]] 84 | weights[expected_returns[index][0]] += 1 - np.sum(weights) 85 | return [expected_returns[index][0]], weights 86 | 87 | @staticmethod 88 | def _compute_bi(c_final, asset_bounds_i): 89 | ''' 90 | Calculates which bound value to assign to a bounded asset - lower bound or upper bound. 91 | 92 | :param c_final: (float) a value calculated using the covariance matrices of free weights. 93 | Refer to https://pdfs.semanticscholar.org/4fb1/2c1129ba5389bafe47b03e595d098d0252b9.pdf for 94 | more information. 
95 | :param asset_bounds_i: (list) a list containing the lower and upper bound values for the ith weight 96 | :return: bounded weight value 97 | ''' 98 | 99 | if c_final > 0: 100 | return asset_bounds_i[1][0] 101 | return asset_bounds_i[0][0] 102 | 103 | def _compute_w(self, covar_f_inv, covar_fb, mean_f, w_b): 104 | ''' 105 | Compute the turning point associated with the current set of free weights F 106 | 107 | :param covar_f_inv: (np.array) inverse of covariance matrix of free assets 108 | :param covar_fb: (np.array) covariance matrix between free assets and bounded assets 109 | :param mean_f: (np.array) expected returns of free assets 110 | :param w_b: (np.array) bounded asset weight values 111 | 112 | :return: (array, float) list of turning point weights and gamma value from the langrange equation 113 | ''' 114 | 115 | # Compute gamma 116 | ones_f = np.ones(mean_f.shape) 117 | g_1 = np.dot(np.dot(ones_f.T, covar_f_inv), mean_f) 118 | g_2 = np.dot(np.dot(ones_f.T, covar_f_inv), ones_f) 119 | if w_b is None: 120 | g_final, w_1 = float(-self.lambdas[-1] * g_1 / g_2 + 1 / g_2), 0 121 | else: 122 | ones_b = np.ones(w_b.shape) 123 | g_3 = np.dot(ones_b.T, w_b) 124 | g_4 = np.dot(covar_f_inv, covar_fb) 125 | w_1 = np.dot(g_4, w_b) 126 | g_4 = np.dot(ones_f.T, w_1) 127 | g_final = float(-self.lambdas[-1] * g_1 / g_2 + (1 - g_3 + g_4) / g_2) 128 | 129 | # Compute weights 130 | w_2 = np.dot(covar_f_inv, ones_f) 131 | w_3 = np.dot(covar_f_inv, mean_f) 132 | free_asset_weights = -1*w_1 + g_final * w_2 + self.lambdas[-1] * w_3 133 | return free_asset_weights, g_final 134 | 135 | def _compute_lambda(self, covar_f_inv, covar_fb, mean_f, w_b, asset_index, b_i): 136 | ''' 137 | Calculate the lambda value in the langrange optimsation equation 138 | 139 | :param covar_f_inv: (np.array) inverse of covariance matrix of free assets 140 | :param covar_fb: (np.array) covariance matrix between free assets and bounded assets 141 | :param mean_f: (np.array) expected returns of free assets 142 | :param w_b: (np.array) bounded asset weight values 143 | :param asset_index: (int) index of the asset in the portfolio 144 | :param b_i: (list) list of upper and lower bounded weight values 145 | :return: (float) lambda value 146 | ''' 147 | 148 | # Compute C 149 | ones_f = np.ones(mean_f.shape) 150 | c_1 = np.dot(np.dot(ones_f.T, covar_f_inv), ones_f) 151 | c_2 = np.dot(covar_f_inv, mean_f) 152 | c_3 = np.dot(np.dot(ones_f.T, covar_f_inv), mean_f) 153 | c_4 = np.dot(covar_f_inv, ones_f) 154 | c_final = -1*c_1 * c_2[asset_index] + c_3 * c_4[asset_index] 155 | if c_final == 0: 156 | return None, None 157 | 158 | # Compute bi 159 | if isinstance(b_i, list): 160 | b_i = self._compute_bi(c_final, b_i) 161 | 162 | # Compute Lambda 163 | if w_b is None: 164 | 165 | # All free assets 166 | return float((c_4[asset_index] - c_1 * b_i) / c_final), b_i 167 | 168 | ones_b = np.ones(w_b.shape) 169 | l_1 = np.dot(ones_b.T, w_b) 170 | l_2 = np.dot(covar_f_inv, covar_fb) 171 | l_3 = np.dot(l_2, w_b) 172 | l_2 = np.dot(ones_f.T, l_3) 173 | lambda_value = float(((1 - l_1 + l_2) * c_4[asset_index] - c_1 * (b_i + l_3[asset_index])) / c_final) 174 | return lambda_value, b_i 175 | 176 | def _get_matrices(self, free_weights): 177 | ''' 178 | Calculate the required matrices between free and bounded assets 179 | 180 | :param free_weights: (list) list of free assets/weights 181 | :return: (tuple of np.array matrices) the corresponding matrices 182 | ''' 183 | 184 | covar_f = self._reduce_matrix(self.cov_matrix, free_weights, free_weights) 185 | 
mean_f = self._reduce_matrix(self.expected_returns, free_weights, [0]) 186 | bounded_weights = self._get_bounded_weights(free_weights) 187 | covar_fb = self._reduce_matrix(self.cov_matrix, free_weights, bounded_weights) 188 | w_b = self._reduce_matrix(self.weights[-1], bounded_weights, [0]) 189 | return covar_f, covar_fb, mean_f, w_b 190 | 191 | def _get_bounded_weights(self, free_weights): 192 | ''' 193 | Compute the list of bounded assets 194 | 195 | :param free_weights: (np.array) list of free weights/assets 196 | :return: (np.array) list of bounded assets/weights 197 | ''' 198 | 199 | return self._diff_lists(list(range(self.expected_returns.shape[0])), free_weights) 200 | 201 | @staticmethod 202 | def _diff_lists(list_1, list_2): 203 | ''' 204 | Calculate the set difference between two lists 205 | 206 | :param list_1: (list) a list of asset indices 207 | :param list_2: (list) another list of asset indices 208 | :return: (list) set difference between the two input lists 209 | ''' 210 | 211 | return list(set(list_1) - set(list_2)) 212 | 213 | @staticmethod 214 | def _reduce_matrix(matrix, row_indices, col_indices): 215 | ''' 216 | Reduce a matrix to the provided set of rows and columns 217 | 218 | :param matrix: (np.array) a matrix whose subset of rows and columns we need 219 | :param row_indices: (list) list of row indices for the matrix 220 | :param col_indices: (list) list of column indices for the matrix 221 | :return: (np.array) subset of input matrix 222 | ''' 223 | 224 | return matrix[np.ix_(row_indices, col_indices)] 225 | 226 | def _purge_num_err(self, tol): 227 | ''' 228 | Purge violations of inequality constraints (associated with ill-conditioned cov matrix) 229 | 230 | :param tol: (float) tolerance level for purging 231 | ''' 232 | 233 | index_1 = 0 234 | while True: 235 | flag = False 236 | if index_1 == len(self.weights): 237 | break 238 | if abs(sum(self.weights[index_1]) - 1) > tol: 239 | flag = True 240 | else: 241 | for index_2 in range(self.weights[index_1].shape[0]): 242 | if ( 243 | self.weights[index_1][index_2] - self.lower_bounds[index_2] < -tol 244 | or self.weights[index_1][index_2] - self.upper_bounds[index_2] > tol 245 | ): 246 | flag = True 247 | break 248 | if flag is True: 249 | del self.weights[index_1] 250 | del self.lambdas[index_1] 251 | del self.gammas[index_1] 252 | del self.free_weights[index_1] 253 | else: 254 | index_1 += 1 255 | 256 | def _purge_excess(self): 257 | ''' 258 | Remove violations of the convex hull 259 | ''' 260 | 261 | index_1, repeat = 0, False 262 | while True: 263 | if repeat is False: 264 | index_1 += 1 265 | if index_1 >= len(self.weights) - 1: 266 | break 267 | weights = self.weights[index_1] 268 | mean = np.dot(weights.T, self.expected_returns)[0, 0] 269 | index_2, repeat = index_1 + 1, False 270 | while True: 271 | if index_2 == len(self.weights): 272 | break 273 | weights = self.weights[index_2] 274 | mean_ = np.dot(weights.T, self.expected_returns)[0, 0] 275 | if mean < mean_: 276 | del self.weights[index_1] 277 | del self.lambdas[index_1] 278 | del self.gammas[index_1] 279 | del self.free_weights[index_1] 280 | repeat = True 281 | break 282 | index_2 += 1 283 | 284 | @staticmethod 285 | def _golden_section(obj, left, right, **kwargs): 286 | ''' 287 | Golden section method. Maximum if kargs['minimum']==False is passed 288 | 289 | :param obj: (function) The objective function on which the extreme will be found. 
290 | :param left: (float) The leftmost extreme of search 291 | :param right: (float) The rightmost extreme of search 292 | ''' 293 | 294 | tol, sign, args = 1.0e-9, -1, None 295 | args = kwargs.get("args", None) 296 | num_iterations = int(ceil(-2.078087 * log(tol / abs(right - left)))) 297 | gs_ratio = 0.618033989 298 | complementary_gs_ratio = 1.0 - gs_ratio 299 | 300 | # Initialize 301 | x_1 = gs_ratio * left + complementary_gs_ratio * right 302 | x_2 = complementary_gs_ratio * left + gs_ratio * right 303 | f_1 = sign * obj(x_1, *args) 304 | f_2 = sign * obj(x_2, *args) 305 | 306 | # Loop 307 | for _ in range(num_iterations): 308 | if f_1 > f_2: 309 | left = x_1 310 | x_1 = x_2 311 | f_1 = f_2 312 | x_2 = complementary_gs_ratio * left + gs_ratio * right 313 | f_2 = sign * obj(x_2, *args) 314 | else: 315 | right = x_2 316 | x_2 = x_1 317 | f_2 = f_1 318 | x_1 = gs_ratio * left + complementary_gs_ratio * right 319 | f_1 = sign * obj(x_1, *args) 320 | 321 | if f_1 < f_2: 322 | return x_1, sign * f_1 323 | return x_2, sign * f_2 324 | 325 | def _eval_sr(self, alpha, w_0, w_1): 326 | ''' 327 | Evaluate the sharpe ratio of the portfolio within the convex combination 328 | 329 | :param alpha: (float) convex combination value 330 | :param w_0: (list) first endpoint of convex combination of weights 331 | :param w_1: (list) second endpoint of convex combination of weights 332 | :return: 333 | ''' 334 | 335 | weights = alpha * w_0 + (1 - alpha) * w_1 336 | returns = np.dot(weights.T, self.expected_returns)[0, 0] 337 | volatility = np.dot(np.dot(weights.T, self.cov_matrix), weights)[0, 0] ** 0.5 338 | return returns / volatility 339 | 340 | def _bound_free_weight(self, free_weights): 341 | ''' 342 | Add a free weight to list of bounded weights 343 | 344 | :param free_weights: (list) list of free-weight indices 345 | :return: (float, int, int) lambda value, index of free weight to be bounded, bound weight value 346 | ''' 347 | 348 | lambda_in = None 349 | i_in = None 350 | bi_in = None 351 | if len(free_weights) > 1: 352 | covar_f, covar_fb, mean_f, w_b = self._get_matrices(free_weights) 353 | covar_f_inv = np.linalg.inv(covar_f) 354 | j = 0 355 | for i in free_weights: 356 | lambda_i, b_i = self._compute_lambda( 357 | covar_f_inv, covar_fb, mean_f, w_b, j, [self.lower_bounds[i], self.upper_bounds[i]] 358 | ) 359 | if self._infnone(lambda_i) > self._infnone(lambda_in): 360 | lambda_in, i_in, bi_in = lambda_i, i, b_i 361 | j += 1 362 | return lambda_in, i_in, bi_in 363 | 364 | def _free_bound_weight(self, free_weights): 365 | ''' 366 | Add a bounded weight to list of free weights 367 | 368 | :param free_weights: (list) list of free-weight indices 369 | :return: (float, int) lambda value, index of the bounded weight to be made free 370 | ''' 371 | 372 | lambda_out = None 373 | i_out = None 374 | if len(free_weights) < self.expected_returns.shape[0]: 375 | bounded_weight_indices = self._get_bounded_weights(free_weights) 376 | for i in bounded_weight_indices: 377 | covar_f, covar_fb, mean_f, w_b = self._get_matrices(free_weights + [i]) 378 | covar_f_inv = np.linalg.inv(covar_f) 379 | lambda_i, _ = self._compute_lambda( 380 | covar_f_inv, 381 | covar_fb, 382 | mean_f, 383 | w_b, 384 | mean_f.shape[0] - 1, 385 | self.weights[-1][i], 386 | ) 387 | if (self.lambdas[-1] is None or lambda_i < self.lambdas[-1]) and lambda_i > self._infnone(lambda_out): 388 | lambda_out, i_out = lambda_i, i 389 | return lambda_out, i_out 390 | 391 | def _initialise(self, asset_prices, resample_by): 392 | ''' 393 | Initialise 
covariances, upper-counds, lower-bounds and storage buffers 394 | 395 | :param asset_prices: (pd.Dataframe) dataframe of asset prices 396 | :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. Defaults to 397 | 'B' meaning daily business days which is equivalent to no resampling 398 | ''' 399 | 400 | # Initial checks 401 | if not isinstance(asset_prices, pd.DataFrame): 402 | raise ValueError("Asset prices matrix must be a dataframe") 403 | if not isinstance(asset_prices.index, pd.DatetimeIndex): 404 | raise ValueError("Asset prices dataframe must be indexed by date.") 405 | 406 | # Resample the asset prices 407 | asset_prices = asset_prices.resample(resample_by).last() 408 | 409 | # Calculate the expected returns 410 | if self.calculate_returns == "mean": 411 | self.expected_returns = self._calculate_mean_historical_returns(asset_prices=asset_prices) 412 | elif self.calculate_returns == "exponential": 413 | self.expected_returns = self._calculate_exponential_historical_returns(asset_prices=asset_prices) 414 | else: 415 | raise ValueError("Unknown returns specified. Supported returns - mean, exponential") 416 | self.expected_returns = np.array(self.expected_returns).reshape((len(self.expected_returns), 1)) 417 | if (self.expected_returns == np.ones(self.expected_returns.shape) * self.expected_returns.mean()).all(): 418 | self.expected_returns[-1, 0] += 1e-5 419 | 420 | # Calculate the covariance matrix 421 | self.cov_matrix = np.asarray(asset_prices.cov()) 422 | 423 | # Intialise lower bounds 424 | if isinstance(self.weight_bounds[0], numbers.Real): 425 | self.lower_bounds = np.ones(self.expected_returns.shape) * self.weight_bounds[0] 426 | else: 427 | self.lower_bounds = np.array(self.weight_bounds[0]).reshape(self.expected_returns.shape) 428 | 429 | # Intialise upper bounds 430 | if isinstance(self.weight_bounds[0], numbers.Real): 431 | self.upper_bounds = np.ones(self.expected_returns.shape) * self.weight_bounds[1] 432 | else: 433 | self.upper_bounds = np.array(self.weight_bounds[1]).reshape(self.expected_returns.shape) 434 | 435 | # Initialise storage buffers 436 | self.weights = [] 437 | self.lambdas = [] 438 | self.gammas = [] 439 | self.free_weights = [] 440 | 441 | @staticmethod 442 | def _calculate_mean_historical_returns(asset_prices, frequency=252): 443 | ''' 444 | Calculate the annualised mean historical returns from asset price data 445 | 446 | :param asset_prices: (pd.DataFrame) asset price data 447 | :return: (np.array) returns per asset 448 | ''' 449 | 450 | returns = asset_prices.pct_change().dropna(how="all") 451 | returns = returns.mean() * frequency 452 | return returns 453 | 454 | @staticmethod 455 | def _calculate_exponential_historical_returns(asset_prices, frequency=252, span=500): 456 | ''' 457 | Calculate the exponentially-weighted mean of (daily) historical returns, giving 458 | higher weight to more recent data. 459 | 460 | :param asset_prices: (pd.DataFrame) asset price data 461 | :return: (np.array) returns per asset 462 | ''' 463 | 464 | returns = asset_prices.pct_change().dropna(how="all") 465 | returns = returns.ewm(span=span).mean().iloc[-1] * frequency 466 | return returns 467 | 468 | def allocate(self, asset_prices, solution="cla_turning_points", resample_by="B"): 469 | # pylint: disable=consider-using-enumerate,too-many-locals,too-many-branches,too-many-statements 470 | ''' 471 | Calculate the portfolio asset allocations using the method specified. 
472 | 473 | :param asset_prices: (pd.Dataframe) a dataframe of historical asset prices (adj closed) 474 | :param solution: (str) specify the type of solution to compute. Options are: cla_turning_points, max_sharpe, 475 | min_volatility, efficient_frontier 476 | :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. Defaults to 477 | 'B' meaning daily business days which is equivalent to no resampling 478 | ''' 479 | 480 | # Some initial steps before the algorithm runs 481 | self._initialise(asset_prices=asset_prices, resample_by=resample_by) 482 | assets = asset_prices.columns 483 | 484 | # Compute the turning points, free sets and weights 485 | free_weights, weights = self._init_algo() 486 | self.weights.append(np.copy(weights)) # store solution 487 | self.lambdas.append(None) 488 | self.gammas.append(None) 489 | self.free_weights.append(free_weights[:]) 490 | while True: 491 | 492 | # 1) Bound one free weight 493 | lambda_in, i_in, bi_in = self._bound_free_weight(free_weights) 494 | 495 | # 2) Free one bounded weight 496 | lambda_out, i_out = self._free_bound_weight(free_weights) 497 | 498 | # 3) Compute minimum variance solution 499 | if (lambda_in is None or lambda_in < 0) and (lambda_out is None or lambda_out < 0): 500 | self.lambdas.append(0) 501 | covar_f, covar_fb, mean_f, w_b = self._get_matrices(free_weights) 502 | covar_f_inv = np.linalg.inv(covar_f) 503 | mean_f = np.zeros(mean_f.shape) 504 | 505 | # 4) Decide whether to free a bounded weight or bound a free weight 506 | else: 507 | if self._infnone(lambda_in) > self._infnone(lambda_out): 508 | self.lambdas.append(lambda_in) 509 | free_weights.remove(i_in) 510 | weights[i_in] = bi_in # set value at the correct boundary 511 | else: 512 | self.lambdas.append(lambda_out) 513 | free_weights.append(i_out) 514 | covar_f, covar_fb, mean_f, w_b = self._get_matrices(free_weights) 515 | covar_f_inv = np.linalg.inv(covar_f) 516 | 517 | # 5) Compute solution vector 518 | w_f, gamma = self._compute_w(covar_f_inv, covar_fb, mean_f, w_b) 519 | for i in range(len(free_weights)): 520 | weights[free_weights[i]] = w_f[i] 521 | self.weights.append(np.copy(weights)) # store solution 522 | self.gammas.append(gamma) 523 | self.free_weights.append(free_weights[:]) 524 | if self.lambdas[-1] == 0: 525 | break 526 | 527 | # 6) Purge turning points 528 | self._purge_num_err(10e-10) 529 | self._purge_excess() 530 | 531 | # Compute the specified solution 532 | self._compute_solution(assets=assets, solution=solution) 533 | 534 | def _compute_solution(self, assets, solution): 535 | ''' 536 | Compute the desired solution to the portfolio optimisation problem 537 | 538 | :param assets: (list) a list of asset names 539 | :param solution: (str) specify the type of solution to compute. 
Options are: cla_turning_points, max_sharpe, 540 | min_volatility, efficient_frontier 541 | ''' 542 | 543 | if solution == "max_sharpe": 544 | self.max_sharpe, self.weights = self._max_sharpe() 545 | self.weights = pd.DataFrame(self.weights) 546 | self.weights.index = assets 547 | self.weights = self.weights.T 548 | elif solution == "min_volatility": 549 | self.min_var, self.weights = self._min_volatility() 550 | self.weights = pd.DataFrame(self.weights) 551 | self.weights.index = assets 552 | self.weights = self.weights.T 553 | elif solution == "efficient_frontier": 554 | self.efficient_frontier_means, self.efficient_frontier_sigma, self.weights = self._efficient_frontier() 555 | weights_copy = self.weights.copy() 556 | for i, turning_point in enumerate(weights_copy): 557 | self.weights[i] = turning_point.reshape(1, -1)[0] 558 | self.weights = pd.DataFrame(self.weights, columns=assets) 559 | elif solution == "cla_turning_points": 560 | # Reshape the weight matrix 561 | weights_copy = self.weights.copy() 562 | for i, turning_point in enumerate(weights_copy): 563 | self.weights[i] = turning_point.reshape(1, -1)[0] 564 | self.weights = pd.DataFrame(self.weights, columns=assets) 565 | else: 566 | raise ValueError("Unknown solution string specified. Supported solutions - cla_turning_points, " 567 | "efficient_frontier, min_volatility, max_sharpe") 568 | 569 | def _max_sharpe(self): 570 | ''' 571 | Compute the maximum sharpe portfolio allocation 572 | 573 | :return: (float, np.array) tuple of max. sharpe value and the set of weight allocations 574 | ''' 575 | 576 | # 1) Compute the local max SR portfolio between any two neighbor turning points 577 | w_sr, sharpe_ratios = [], [] 578 | for i in range(len(self.weights) - 1): 579 | w_0 = np.copy(self.weights[i]) 580 | w_1 = np.copy(self.weights[i + 1]) 581 | kwargs = {"minimum": False, "args": (w_0, w_1)} 582 | alpha, sharpe_ratio = self._golden_section(self._eval_sr, 0, 1, **kwargs) 583 | w_sr.append(alpha * w_0 + (1 - alpha) * w_1) 584 | sharpe_ratios.append(sharpe_ratio) 585 | 586 | maximum_sharp_ratio = max(sharpe_ratios) 587 | weights_with_max_sharpe_ratio = w_sr[sharpe_ratios.index(maximum_sharp_ratio)] 588 | return maximum_sharp_ratio, weights_with_max_sharpe_ratio 589 | 590 | def _min_volatility(self): 591 | ''' 592 | Compute minimum volatility portfolio allocation 593 | 594 | :return: (float, np.array) tuple of minimum variance value and the set of weight allocations 595 | ''' 596 | 597 | var = [] 598 | for weights in self.weights: 599 | volatility = np.dot(np.dot(weights.T, self.cov_matrix), weights) 600 | var.append(volatility) 601 | min_var = min(var) 602 | return min_var ** .5, self.weights[var.index(min_var)] 603 | 604 | def _efficient_frontier(self, points=100): 605 | # pylint: disable=invalid-name 606 | ''' 607 | Compute the entire efficient frontier solution 608 | 609 | :param points: (int) number of efficient frontier points to be calculated 610 | :return: tuple of mean, variance amd weights of the frontier solutions 611 | ''' 612 | 613 | means, sigma, weights = [], [], [] 614 | 615 | # remove the 1, to avoid duplications 616 | partitions = np.linspace(0, 1, points // len(self.weights))[:-1] 617 | b = list(range(len(self.weights) - 1)) 618 | for i in b: 619 | w_0, w_1 = self.weights[i], self.weights[i + 1] 620 | 621 | if i == b[-1]: 622 | # include the 1 in the last iteration 623 | partitions = np.linspace(0, 1, points // len(self.weights)) 624 | 625 | for j in partitions: 626 | w = w_1 * j + (1 - j) * w_0 627 | 
weights.append(np.copy(w)) 628 | means.append(np.dot(w.T, self.expected_returns)[0, 0]) 629 | sigma.append(np.dot(np.dot(w.T, self.cov_matrix), w)[0, 0] ** 0.5) 630 | return means, sigma, weights 631 | -------------------------------------------------------------------------------- /cv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection._split import _BaseKFold 4 | from sklearn.metrics import log_loss, accuracy_score 5 | 6 | def getTrainTimes(t1, testTimes): 7 | trn = t1.copy(deep=True) 8 | for i, j in testTimes.iteritems(): 9 | df0 = trn[(i <= trn.index) & (trn.index <= j)].index # Train starts within test 10 | df1 = trn[(i <= trn) & (trn <= j)].index # Train ends within test 11 | df2 = trn[(trn.index <= i) & (j <= trn)].index # Train envelops test 12 | trn = trn.drop(df0.union(df1).union(df2)) 13 | return trn 14 | 15 | 16 | def getEmbargoTimes(times, pctEmbargo): 17 | step = int(times.shape[0] * pctEmbargo) 18 | if step == 0: 19 | mbrg = pd.Series(times, index=times) 20 | else: 21 | mbrg = pd.Series(times[step:], index=times[:-step]) 22 | mbrg = mbrg.append(pd.Series(times[-1], index=times[-step:])) 23 | return mbrg 24 | 25 | def cvScore(clf, X, y, sample_weight=None, scoring='neg_log_loss', t1=None, cv=None, cvGen=None, pctEmbargo=None): 26 | if scoring not in ['neg_log_loss', 'accuracy']: 27 | raise Exception('Wrong scoring method') 28 | if cvGen is None: 29 | cvGen = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo) 30 | if sample_weight is None: 31 | sample_weight = np.ones(len(X)) 32 | 33 | score = [] 34 | for train, test in cvGen.split(X=X): 35 | fit = clf.fit( 36 | X=X.iloc[train, :], y=y.iloc[train], 37 | sample_weight=sample_weight[train] 38 | ) 39 | if scoring == 'neg_log_loss': 40 | prob = fit.predict_proba(X.iloc[test, :]) 41 | score_ = -log_loss(y.iloc[test], prob, sample_weight=sample_weight[test], labels=clf.classes_) 42 | else: 43 | pred = fit.predict(X.iloc[test, :]) 44 | score_ = accuracy_score(y.iloc[test], pred, sample_weight=sample_weight[test]) 45 | score.append(score_) 46 | return np.array(score) 47 | 48 | 49 | class PurgedKFold(_BaseKFold): 50 | ''' 51 | Extend KFold to work with labels that span intervals 52 | The train is is purged of observations overlapping test-label intervals 53 | Test set is assumed contiguous (shuffle=False), w/o training examples in between 54 | ''' 55 | def __init__(self, n_splits=3, t1=None, pctEmbargo=0.0): 56 | if not isinstance(t1, pd.Series): 57 | raise ValueError('Label through Dates must be a pandas series') 58 | super(PurgedKFold, self).__init__(n_splits, shuffle=False, random_state=None) 59 | self.t1 = t1 60 | self.pctEmbargo = pctEmbargo 61 | 62 | def split(self, X, y=None, groups=None): 63 | if X.shape[0] != self.t1.shape[0]: 64 | raise ValueError('X and ThruDateValues must have the same index length') 65 | indices = np.arange(X.shape[0]) 66 | mbrg = int(X.shape[0] * self.pctEmbargo) 67 | test_starts = [(i[0], i[-1] + 1) for i in np.array_split(np.arange(X.shape[0]), self.n_splits)] 68 | for i, j in test_starts: 69 | t0 = self.t1.index[i] 70 | test_indices = indices[i:j] 71 | maxT1Idx = self.t1.index.searchsorted(self.t1[test_indices].max()) 72 | train_indices = self.t1.index.searchsorted(self.t1[self.t1<=t0].index) 73 | train_indices = np.concatenate((train_indices, indices[maxT1Idx + mbrg:])) 74 | yield train_indices, test_indices 75 | 
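For context, a minimal usage sketch of the purged cross-validation utilities defined in cv.py above (`PurgedKFold` and `cvScore`) on synthetic data. The five-bar label horizon, the 1% embargo and the classifier choice are illustrative assumptions, the import assumes the script runs from the repository root, and the snippet targets the older pandas API this repository was written against:

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from cv import PurgedKFold, cvScore

# Synthetic events: 1,000 hourly observations whose labels end five bars later
idx = pd.date_range('2020-01-01', periods=1000, freq='H')
X = pd.DataFrame(np.random.randn(1000, 4), index=idx)
y = pd.Series(np.random.randint(0, 2, size=1000), index=idx)
t1 = pd.Series(idx, index=idx).shift(-5).fillna(idx[-1])  # label end times

clf = RandomForestClassifier(n_estimators=50)
cv_gen = PurgedKFold(n_splits=5, t1=t1, pctEmbargo=0.01)
scores = cvScore(clf, X, y, scoring='accuracy', cvGen=cv_gen)
print(scores.mean(), scores.std())
```

Purging drops training observations whose label interval overlaps a test interval, and the embargo removes a further buffer of observations immediately after each test block, which is what keeps the out-of-sample score honest when labels overlap in time.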
-------------------------------------------------------------------------------- /feature_imp.py: -------------------------------------------------------------------------------- 1 | from cv import PurgedKFold, cvScore 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.datasets import make_classification 5 | 6 | def featImpMDI(fit, featNames): 7 | # feat importance based on IS mean impurity reduction 8 | df0 = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)} 9 | df0 = pd.DataFrame.from_dict(df0, orient='index') 10 | df0.columns = featNames 11 | df0 = df0.replace(0, np.nan) # because max_features = 1 12 | imp = pd.concat({'mean': df0.mean(), 'std': df0.std() * df0.shape[0] ** -0.5}, axis=1) 13 | imp /= imp['mean'].sum() 14 | return imp 15 | 16 | def featImpMDA(clf, X, y, cv, sample_weight, t1, pctEmbargo, scoring='neg_log_loss'): 17 | # feat importance based on OOS score reduction 18 | if scoring not in ['neg_log_loss', 'accuracy']: 19 | raise ValueError('wrong scoring method') 20 | from sklearn.metrics import log_loss, accuracy_score 21 | cvGen = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo) 22 | scr0, scr1 = pd.Series(), pd.DataFrame(columns=X.columns) 23 | for i, (train, test) in enumerate(cvGen.split(X=X)): 24 | X0, y0, w0 = X.iloc[train, :], y.iloc[train], sample_weight.iloc[train] 25 | X1, y1, w1 = X.iloc[test, :], y.iloc[test], sample_weight.iloc[test] 26 | fit = clf.fit(X=X0, y=y0, sample_weight=w0.values) 27 | if scoring == 'neg_log_loss': 28 | prob = fit.predict_proba(X1) 29 | scr0.loc[i] = -log_loss(y1, prob, sample_weight=w1.values, labels=clf.classes_) 30 | else: 31 | pred = fit.predict(X1) 32 | scr0.loc[i] = accuracy_score(y1, pred, sample_weight=w1.values) 33 | 34 | for j in X.columns: 35 | X1_ = X1.copy(deep=True) 36 | np.random.shuffle(X1_[j].values) # permutation of a single column 37 | if scoring == 'neg_log_loss': 38 | prob = fit.predict_proba(X1_) 39 | scr1.loc[i, j] = -log_loss(y1, prob, sample_weight=w1.values, labels=clf.classes_) 40 | else: 41 | pred = fit.predict(X1_) 42 | scr1.loc[i, j] = accuracy_score(y1, pred, sample_weight=w1.values) 43 | 44 | imp = (-scr1).add(scr0, axis=0) 45 | if scoring == 'neg_log_loss': 46 | imp = imp / -scr1 47 | else: 48 | imp = imp / (1.0 - scr1) 49 | 50 | imp = pd.concat({'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** -0.5}, axis=1) 51 | return imp, scr0.mean() 52 | 53 | def auxFeatImpSFI(featNames, clf, trnsX, cont, scoring, cvGen): 54 | imp = pd.DataFrame(columns=['mean', 'std']) 55 | for featName in featNames: 56 | df0 = cvScore(clf, X=trnsX[[featName]], y=cont['bin'], sample_weight=cont['w'], scoring=scoring, cvGen=cvGen) 57 | imp.loc[featName, 'mean'] = df0.mean() 58 | imp.loc[featName, 'std'] = df0.std() * df0.shape[0] ** -0.5 59 | return imp 60 | 61 | 62 | def getTestData(n_features=40, n_informative=10, n_redundant=10, n_samples=10000): 63 | # generate a random dataset for a classification problem 64 | trnsX, cont = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative, n_redundant=n_redundant, random_state=0, shuffle=False) 65 | df0 = pd.DatetimeIndex(periods=n_samples, freq=pd.tseries.offsets.Minute(), end=pd.datetime.today()) 66 | trnsX = pd.DataFrame(trnsX, index=df0) 67 | cont = pd.Series(cont, index=df0).to_frame('bin') 68 | df0 = ['I_%s' % i for i in range(n_informative)] + ['R_%s' % i for i in range(n_redundant)] 69 | df0 += ['N_%s' % i for i in range(n_features - len(df0))] 70 | trnsX.columns = df0 71 | cont['w'] = 1.0 / 
cont.shape[0] 72 | cont['t1'] = pd.Series(cont.index, index=cont.index) 73 | return trnsX, cont -------------------------------------------------------------------------------- /feature_importances_mp.py: -------------------------------------------------------------------------------- 1 | from sklearn.tree import DecisionTreeClassifier 2 | from sklearn.ensemble import BaggingClassifier 3 | 4 | from sklearn.metrics import accuracy_score 5 | 6 | from mlfinlab.feature_importance import ( 7 | feature_importance_mean_decrease_impurity, 8 | feature_importance_mean_decrease_accuracy, 9 | feature_importance_sfi, 10 | ) 11 | 12 | from mlfinlab.cross_validation import ml_cross_val_score, PurgedKFold 13 | from mlfinlab.util.multiprocess import process_jobs 14 | from sklearn.model_selection import KFold 15 | 16 | 17 | def feature_importances(X, cont, method, allow_masking_effects=False, n_splits=10): 18 | max_features = None if allow_masking_effects else 1 19 | clf = DecisionTreeClassifier( 20 | criterion='entropy', max_features=max_features, class_weight='balanced', min_weight_fraction_leaf=0.0 21 | ) 22 | clf = BaggingClassifier( 23 | base_estimator=clf, n_estimators=1000, max_features=1.0, max_samples=1.0, oob_score=True, n_jobs=-1 24 | ) 25 | fit = clf.fit(X, cont['bin']) 26 | oob_score = fit.oob_score_ 27 | 28 | cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=cont['t1']) 29 | oos_score = ml_cross_val_score(clf, X, cont['bin'], cv_gen=cv_gen, scoring=accuracy_score).mean() 30 | 31 | if method == 'MDI': 32 | imp = feature_importance_mean_decrease_impurity(fit, X.columns) 33 | elif method == 'MDA': 34 | imp = feature_importance_mean_decrease_accuracy(clf, X, cont['bin'], cv_gen, scoring=accuracy_score) 35 | elif method == 'SFI': 36 | imp = feature_importance_sfi(clf, X, cont['bin'], cv_gen, scoring=accuracy_score) 37 | 38 | return imp, oob_score, oos_score 39 | -------------------------------------------------------------------------------- /filters.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def cusum(gRaw, h): 5 | tEvents, sPos, sNeg = [], 0, 0 6 | diff = gRaw.diff() 7 | for i in diff.index[1:]: 8 | sPos, sNeg = max(0, sPos + diff.loc[i]), min(0, sNeg + diff.loc[i]) 9 | if sNeg < -h: 10 | sNeg = 0 11 | tEvents.append(i) 12 | elif sNeg > h: 13 | sPos = 0 14 | tEvents.append(i) 15 | return pd.DatetimeIndex(tEvents) -------------------------------------------------------------------------------- /hrp_mlf.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/hudson-and-thames/mlfinlab 2 | # license: https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt 3 | 4 | ''' 5 | This module implements the HRP algorithm mentioned in the following paper: 6 | `López de Prado, Marcos, Building Diversified Portfolios that Outperform Out-of-Sample (May 23, 2016). 7 | Journal of Portfolio Management, 2016 `_; 8 | The code is reproduced with modification from his book: Advances in Financial Machine Learning, Chp-16 9 | ''' 10 | 11 | import numpy as np 12 | import pandas as pd 13 | from scipy.cluster.hierarchy import dendrogram, linkage 14 | from scipy.spatial.distance import squareform 15 | from sklearn.covariance import OAS 16 | # import matplotlib 17 | 18 | # matplotlib.use('Agg') 19 | 20 | 21 | class HierarchicalRiskParity: 22 | ''' 23 | The HRP algorithm is a robust algorithm which tries to overcome the limitations of the CLA algorithm. 
It has three 24 | important steps - hierarchical tree clustering, quasi diagnalisation and recursive bisection. Non-inversion of 25 | covariance matrix makes HRP a very stable algorithm and insensitive to small changes in covariances. 26 | ''' 27 | 28 | def __init__(self): 29 | self.weights = list() 30 | self.seriated_correlations = None 31 | self.seriated_distances = None 32 | self.ordered_indices = None 33 | self.clusters = None 34 | 35 | @staticmethod 36 | def _tree_clustering(correlation, method='single'): 37 | ''' 38 | Perform the traditional heirarchical tree clustering 39 | 40 | :param correlation: (np.array) correlation matrix of the assets 41 | :param method: (str) the type of clustering to be done 42 | :return: distance matrix and clusters 43 | ''' 44 | 45 | distances = np.sqrt((1 - correlation).round(5) / 2) 46 | clusters = linkage(squareform(distances.values), method=method) 47 | return distances, clusters 48 | 49 | def _quasi_diagnalization(self, num_assets, curr_index): 50 | ''' 51 | Rearrange the assets to reorder them according to hierarchical tree clustering order. 52 | 53 | :param num_assets: (int) the total number of assets 54 | :param curr_index: (int) current index 55 | :return: (list) the assets rearranged according to hierarchical clustering 56 | ''' 57 | 58 | if curr_index < num_assets: 59 | return [curr_index] 60 | 61 | left = int(self.clusters[curr_index - num_assets, 0]) 62 | right = int(self.clusters[curr_index - num_assets, 1]) 63 | 64 | return (self._quasi_diagnalization(num_assets, left) + self._quasi_diagnalization(num_assets, right)) 65 | 66 | def _get_seriated_matrix(self, assets, distances, correlations): 67 | ''' 68 | Based on the quasi-diagnalization, reorder the original distance matrix, so that assets within 69 | the same cluster are grouped together. 
70 | 71 | :param assets: (list) list of asset names in the portfolio 72 | :param distances: (pd.Dataframe) distance values between asset returns 73 | :param correlations: (pd.Dataframe) correlations between asset returns 74 | :return: (np.array) re-arranged distance matrix based on tree clusters 75 | ''' 76 | 77 | ordering = assets[self.ordered_indices] 78 | seriated_distances = distances.loc[ordering, ordering] 79 | seriated_correlations = correlations.loc[ordering, ordering] 80 | return seriated_distances, seriated_correlations 81 | 82 | def _recursive_bisection(self, covariances, assets): 83 | ''' 84 | Recursively assign weights to the clusters - ultimately assigning weights to the inidividual assets 85 | 86 | :param covariances: (np.array) the covariance matrix 87 | :param assets: (list) list of asset names in the portfolio 88 | ''' 89 | 90 | self.weights = pd.Series(1, index=self.ordered_indices) 91 | clustered_alphas = [self.ordered_indices] 92 | 93 | while clustered_alphas: 94 | clustered_alphas = [cluster[start:end] 95 | for cluster in clustered_alphas 96 | for start, end in ((0, len(cluster) // 2), (len(cluster) // 2, len(cluster))) 97 | if len(cluster) > 1] 98 | 99 | for subcluster in range(0, len(clustered_alphas), 2): 100 | left_cluster = clustered_alphas[subcluster] 101 | right_cluster = clustered_alphas[subcluster + 1] 102 | 103 | # Get left cluster variance 104 | left_subcovar = covariances.iloc[left_cluster, left_cluster] 105 | inv_diag = 1 / np.diag(left_subcovar.values) 106 | parity_w = inv_diag * (1 / np.sum(inv_diag)) 107 | left_cluster_var = np.dot(parity_w, np.dot(left_subcovar, parity_w)) 108 | 109 | # Get right cluster variance 110 | right_subcovar = covariances.iloc[right_cluster, right_cluster] 111 | inv_diag = 1 / np.diag(right_subcovar.values) 112 | parity_w = inv_diag * (1 / np.sum(inv_diag)) 113 | right_cluster_var = np.dot(parity_w, np.dot(right_subcovar, parity_w)) 114 | 115 | # Calculate allocation factor and weights 116 | alloc_factor = 1 - left_cluster_var / (left_cluster_var + right_cluster_var) 117 | self.weights[left_cluster] *= alloc_factor 118 | self.weights[right_cluster] *= 1 - alloc_factor 119 | 120 | # Assign actual asset values to weight index 121 | self.weights.index = assets[self.ordered_indices] 122 | self.weights = pd.DataFrame(self.weights) 123 | self.weights = self.weights.T 124 | 125 | def plot_clusters(self, assets): 126 | ''' 127 | Plot a dendrogram of the hierarchical clusters 128 | 129 | :param assets: (list) list of asset names in the portfolio 130 | ''' 131 | 132 | dendrogram_plot = dendrogram(self.clusters, labels=assets) 133 | return dendrogram_plot 134 | 135 | @staticmethod 136 | def _calculate_returns(asset_prices, resample_by): 137 | ''' 138 | Calculate the annualised mean historical returns from asset price data 139 | 140 | :param asset_prices: (pd.Dataframe) a dataframe of historical asset prices (daily close) 141 | :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. 
Defaults to 142 | 'B' meaning daily business days which is equivalent to no resampling 143 | :return: (pd.Dataframe) stock returns 144 | ''' 145 | 146 | asset_prices = asset_prices.resample(resample_by).last() 147 | asset_returns = asset_prices.pct_change() 148 | asset_returns = asset_returns.dropna(how='all') 149 | return asset_returns 150 | 151 | @staticmethod 152 | def _shrink_covariance(covariance): 153 | ''' 154 | Regularise/Shrink the asset covariances 155 | 156 | :param covariance: (pd.Dataframe) asset returns covariances 157 | :return: (pd.Dataframe) shrinked asset returns covariances 158 | ''' 159 | 160 | oas = OAS() 161 | oas.fit(covariance) 162 | shrinked_covariance = oas.covariance_ 163 | return pd.DataFrame(shrinked_covariance, index=covariance.columns, columns=covariance.columns) 164 | 165 | @staticmethod 166 | def _cov2corr(covariance): 167 | ''' 168 | Calculate the correlations from asset returns covariance matrix 169 | 170 | :param covariance: (pd.Dataframe) asset returns covariances 171 | :return: (pd.Dataframe) correlations between asset returns 172 | ''' 173 | 174 | d_matrix = np.zeros_like(covariance) 175 | diagnoal_sqrt = np.sqrt(np.diag(covariance)) 176 | np.fill_diagonal(d_matrix, diagnoal_sqrt) 177 | d_inv = np.linalg.inv(d_matrix) 178 | corr = np.dot(np.dot(d_inv, covariance), d_inv) 179 | corr = pd.DataFrame(corr, index=covariance.columns, columns=covariance.columns) 180 | return corr 181 | 182 | def allocate(self, asset_prices, resample_by='B', use_shrinkage=False): 183 | ''' 184 | Calculate asset allocations using HRP algorithm 185 | 186 | :param asset_prices: (pd.Dataframe) a dataframe of historical asset prices (daily close) 187 | indexed by date 188 | :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. 
Defaults to 189 | 'B' meaning daily business days which is equivalent to no resampling 190 | :param use_shrinkage: (Boolean) specifies whether to shrink the covariances 191 | ''' 192 | 193 | if not isinstance(asset_prices, pd.DataFrame): 194 | raise ValueError("Asset prices matrix must be a dataframe") 195 | if not isinstance(asset_prices.index, pd.DatetimeIndex): 196 | raise ValueError("Asset prices dataframe must be indexed by date.") 197 | 198 | # Calculate the returns 199 | asset_returns = self._calculate_returns(asset_prices, resample_by=resample_by) 200 | 201 | num_assets = asset_returns.shape[1] 202 | assets = asset_returns.columns 203 | 204 | # Covariance and correlation 205 | cov = asset_returns.cov() 206 | if use_shrinkage: 207 | cov = self._shrink_covariance(covariance=cov) 208 | corr = self._cov2corr(covariance=cov) 209 | 210 | # Step-1: Tree Clustering 211 | distances, self.clusters = self._tree_clustering(correlation=corr) 212 | 213 | # Step-2: Quasi Diagnalization 214 | self.ordered_indices = self._quasi_diagnalization(num_assets, 2 * num_assets - 2) 215 | self.seriated_distances, self.seriated_correlations = self._get_seriated_matrix(assets=assets, 216 | distances=distances, 217 | correlations=corr) 218 | 219 | # Step-3: Recursive Bisection 220 | self._recursive_bisection(covariances=cov, assets=assets) 221 | -------------------------------------------------------------------------------- /images/mda_feat_imp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.1.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.1c.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.2a.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.3b.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.4c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.4c.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.4c2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.4c2.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.4c_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.4c_10chunks.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.4c_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.4c_1chunk.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.5_1.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.5_2.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.5_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.5_3.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.5_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.5_4.png -------------------------------------------------------------------------------- /images/mda_feat_imp_8.5_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mda_feat_imp_8.5_5.png -------------------------------------------------------------------------------- /images/mdi_feat_imp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.1.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.1c.png 
-------------------------------------------------------------------------------- /images/mdi_feat_imp_8.2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.2a.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.3b.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.4c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.4c.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.4c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.4c2.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.4c_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.4c_10chunks.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.4c_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.4c_1chunk.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.5_1.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.5_2.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.5_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.5_3.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.5_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.5_4.png -------------------------------------------------------------------------------- /images/mdi_feat_imp_8.5_5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/mdi_feat_imp_8.5_5.png -------------------------------------------------------------------------------- /images/sfi_feat_imp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.1.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.1c.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.2a.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.3b.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.4c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.4c.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.4c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.4c2.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.4c_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.4c_10chunks.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.4c_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.4c_1chunk.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.5_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.5_1.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.5_2.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.5_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.5_3.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.5_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.5_4.png -------------------------------------------------------------------------------- /images/sfi_feat_imp_8.5_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/images/sfi_feat_imp_8.5_5.png -------------------------------------------------------------------------------- /img/MDA_feat_imp_8.1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDA_feat_imp_8.1c.png -------------------------------------------------------------------------------- /img/MDA_feat_imp_8.1c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDA_feat_imp_8.1c2.png -------------------------------------------------------------------------------- /img/MDA_feat_imp_8.2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDA_feat_imp_8.2a.png -------------------------------------------------------------------------------- /img/MDA_feat_imp_8.3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDA_feat_imp_8.3b.png -------------------------------------------------------------------------------- /img/MDA_feat_imp_8.4c_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDA_feat_imp_8.4c_10chunks.png -------------------------------------------------------------------------------- /img/MDA_feat_imp_8.4c_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDA_feat_imp_8.4c_1chunk.png 
-------------------------------------------------------------------------------- /img/MDI_feat_imp_8.1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.1c.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.1c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.1c2.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.2a.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.3b.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.4c_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.4c_10chunks.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.4c_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.4c_1chunk.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.5_1.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.5_1_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.5_1_2.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.5_2.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.5_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.5_3.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.5_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.5_4.png -------------------------------------------------------------------------------- /img/MDI_feat_imp_8.5_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/MDI_feat_imp_8.5_5.png -------------------------------------------------------------------------------- /img/SFI_feat_imp_8.1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/SFI_feat_imp_8.1c.png -------------------------------------------------------------------------------- /img/SFI_feat_imp_8.1c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/SFI_feat_imp_8.1c2.png -------------------------------------------------------------------------------- /img/SFI_feat_imp_8.2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/SFI_feat_imp_8.2a.png -------------------------------------------------------------------------------- /img/SFI_feat_imp_8.3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/SFI_feat_imp_8.3b.png -------------------------------------------------------------------------------- /img/SFI_feat_imp_8.4c_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/SFI_feat_imp_8.4c_10chunks.png -------------------------------------------------------------------------------- /img/SFI_feat_imp_8.4c_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/img/SFI_feat_imp_8.4c_1chunk.png -------------------------------------------------------------------------------- /labeling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from multiprocess import mpPandasObj 3 | 4 | # we ignore the first version in the text, and immediately grab the one with meta-labeling 5 | def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads=1, t1=False, side=None): 6 | # 1) get target 7 | trgt = trgt.loc[tEvents] 8 | trgt = trgt[trgt > minRet] 9 | # 2) get t1 (max holding period) 10 | if t1 is False: 11 | t1 = pd.Series(pd.NaT, index=tEvents) 12 | # 3) form events object, apply stop loss on t1 13 | if side is None: 14 | side_, ptSl_ = pd.Series(1.0, index=trgt.index), [ptSl[0], ptSl[0]] 15 | else: 16 | side_, ptSl_ = side.loc[trgt.index], ptSl[:2] 17 | events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt']) 18 | df0 = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index), numThreads=numThreads, close=close, events=events, ptSl=ptSl_) 19 | events['t1'] = 
df0.dropna(how='all').min(axis=1) # pd.min ignores NaN 20 | if side is None: 21 | events = events.drop('side', axis=1) 22 | 23 | # store for later 24 | events['pt'] = ptSl[0] 25 | events['sl'] = ptSl[1] 26 | 27 | return events 28 | 29 | def applyPtSlOnT1(close, events, ptSl, molecule): 30 | # apply stop loss/profit taking, if it takes place before t1 (end of event) 31 | events_ = events.loc[molecule] 32 | out = events_[['t1']].copy(deep=True) 33 | 34 | if ptSl[0] > 0: 35 | pt = ptSl[0] * events_['trgt'] 36 | else: 37 | pt = pd.Series(index=events.index) # NaNs 38 | 39 | if ptSl[1] > 0: 40 | sl = - ptSl[1] * events_['trgt'] 41 | else: 42 | sl = pd.Series(index=events.index) # NaNs 43 | 44 | for loc, t1 in events_['t1'].fillna(close.index[-1]).iteritems(): 45 | df0 = close[loc:t1] # path prices 46 | df0 = (df0 / close[loc] - 1) * events_.at[loc, 'side'] # path returns 47 | out.loc[loc, 'sl'] = df0[df0 < sl[loc]].index.min() # earliest stop loss 48 | out.loc[loc, 'pt'] = df0[df0 > pt[loc]].index.min() # earliest profit take 49 | return out 50 | 51 | def getVerticalBarriers(close, tEvents, numDays): 52 | t1 = close.index.searchsorted(tEvents+pd.Timedelta(days=numDays)) 53 | t1 = t1[t1 < close.shape[0]] 54 | t1 = pd.Series(close.index[t1], index=tEvents[:t1.shape[0]]) # NaNs at the end 55 | return t1 56 | 57 | def barrierTouched(out_df, events): 58 | store = [] 59 | for date_time, values in out_df.iterrows(): 60 | ret = values['ret'] 61 | target = values['trgt'] 62 | 63 | pt_level_reached = ret > target * events.loc[date_time, 'pt'] 64 | sl_level_reached = ret < -target * events.loc[date_time, 'sl'] 65 | 66 | if ret > 0.0 and pt_level_reached: 67 | # Top barrier reached 68 | store.append(1) 69 | elif ret < 0.0 and sl_level_reached: 70 | # Bottom barrier reached 71 | store.append(-1) 72 | else: 73 | # Vertical barrier reached 74 | store.append(0) 75 | 76 | # Save to 'bin' column and return 77 | out_df['bin'] = store 78 | return out_df 79 | 80 | 81 | def getBins(events, close): 82 | ''' 83 | Compute event's outcome (including side information, if provided).
84 | events is a DataFrame where: 85 | -events.index is event's starttime 86 | -events['t1'] is event's endtime 87 | -events['trgt'] is event's target 88 | -events['side'] (optional) implies the algo's position side 89 | Case 1: ('side' not in events): bin in (-1, 1) <- label by price action 90 | Case 2: ('side' in events): bin in (0, 1) <- label by pnl (meta-labeling) 91 | ''' 92 | # 1) prices aligned with events 93 | events_ = events.dropna(subset=['t1']) 94 | px = events_.index.union(events_['t1'].values).drop_duplicates() 95 | px = close.reindex(px, method='bfill') 96 | # 2) create out object 97 | out = pd.DataFrame(index=events_.index) 98 | 99 | out['ret'] = px.loc[events_['t1'].values].values / px.loc[events_.index] - 1 100 | if 'side' in events_: 101 | out['ret'] *= events_['side'] # meta-labeling 102 | 103 | out['trgt'] = events_['trgt'] 104 | out = barrierTouched(out, events) 105 | 106 | if 'side' in events_: 107 | out.loc[out['ret'] <= 0, 'bin'] = 0 108 | 109 | if 'side' in events_: 110 | out['side'] = events['side'] 111 | return out 112 | 113 | # TODO: Rewrite "barrier_touched" 114 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from path import Path 3 | 4 | DATA_DIR = Path('../data/') 5 | DAILY_DATA_DIR = DATA_DIR / 'daily' 6 | 7 | def load_contract(contract_name, directory): 8 | series = pd.read_csv(DATA_DIR / directory / '{}.csv'.format(contract_name), index_col=0) 9 | series = series[::-1] 10 | if directory == 'minutely': 11 | series['Time'] = series['date'] + ' ' + series['time'] 12 | series = series.set_index(pd.to_datetime(series['Time'], format='%Y-%m-%d 0 days %H:%M:00.000000000')) 13 | else: 14 | series['Time'] = series['date'] 15 | series = series.set_index(pd.to_datetime(series['Time'], format='%Y-%m-%d')) 16 | 17 | series = series[['open_p', 'close_p', 'prd_vlm', 'Time']] 18 | series = series.rename(columns={'close_p':'Close', 'open_p':'Open', 'prd_vlm':'Volume'}) 19 | series['Instrument'] = contract_name 20 | return series 21 | 22 | def load_contracts(symbol, directory='minutely'): 23 | contract_names = [x.basename().namebase for x in (DATA_DIR / directory).files('*{}*'.format(symbol))] 24 | loaded = [load_contract(x, directory) for x in contract_names] 25 | loaded = list(sorted(loaded, key=lambda x:x.index[-1])) 26 | first = loaded[0] 27 | # cut out from later contracts what former contracts already have 28 | zipped = zip(loaded, loaded[1:]) 29 | cut_contracts = [latter.truncate(before=former.index[-1] + pd.Timedelta(minutes=1)) for former, latter in zipped] 30 | 31 | return pd.concat([first] + cut_contracts) 32 | 33 | def load_all_cont_contracts(): 34 | all_continuous_contracts = DAILY_DATA_DIR.files('*#C*') 35 | all_continuous_contracts = [x.basename().namebase for x in all_continuous_contracts] 36 | return {name: load_contract(name, 'daily') for name in all_continuous_contracts} -------------------------------------------------------------------------------- /load_data_orig.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from path import Path 3 | DATA_DIR = Path('../data/') 4 | BROKEN = { 5 | '@SM#C':'2010', 6 | '@RP#C':'2002', 7 | '@LE#C':'2005', 8 | '@SP#C':'2010', 9 | '@MME#C':'2010', 10 | '@S#C':'2005', 11 | 'LG#C':'2010', 12 | } # bugged data 13 | DONTLIKE = [ 14 | # 'GAS#C', # NG trades weird 15 | # 'QNG#C', # NG trades weird 16 | # 
'@QG#C', # NG trades weird 17 | '@ED#C', #we want to stay further out the ED curve 18 | 'EZ#C', 19 | '@TU#C', 20 | ] 21 | 22 | START_DATE = '2000-1-1' 23 | 24 | def fix_fut(fut): 25 | for column,date in BROKEN.items(): 26 | if column in fut.columns:\ 27 | fut[column] = fut[column][date:] 28 | 29 | return fut 30 | 31 | def load_symbols_and_prices(sectors): 32 | symbols = pd.read_csv(DATA_DIR / 'symbols.csv', index_col='iqsymbol') 33 | 34 | symbols_list = [ 35 | (x, Path(DATA_DIR / 'daily' / '%s.csv' % x)) for x in symbols.index 36 | if (symbols.loc[x]['Sector'] in sectors) 37 | ] 38 | symbols_list = [(symbol,pp) for symbol, pp in symbols_list if pp.exists() and pp.size > 10000 and not symbol in DONTLIKE and not symbol in BROKEN] 39 | symbols_list, fns = zip(*symbols_list) 40 | dfs = [pd.read_csv(ff, index_col='date', parse_dates=True)[['close_p']] for ff in fns] 41 | fut = pd.concat([x['close_p'] for x in dfs], axis=1).ffill().truncate(before=pd.Timestamp(START_DATE)) 42 | fut.columns = symbols_list 43 | 44 | fut = fix_fut(fut) 45 | # print(len(fut.columns)) 46 | # print(fut.columns) 47 | return symbols, fut 48 | -------------------------------------------------------------------------------- /mean_variance_mlf.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/hudson-and-thames/mlfinlab 2 | # license: https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt 3 | 4 | ''' 5 | This module implements the classic mean-variance optimisation techniques for calculating the efficient frontier. 6 | It uses typical quadratic optimisers to generate optimal portfolios for different objective functions. 7 | ''' 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | 13 | class MeanVarianceOptimisation: 14 | ''' 15 | This class contains a variety of methods dealing with different solutions to the mean variance optimisation 16 | problem. 17 | ''' 18 | 19 | def __init__(self): 20 | self.weights = list() 21 | 22 | def allocate(self, asset_prices, solution='inverse_variance', resample_by='B'): 23 | ''' 24 | Calculate the portfolio asset allocations using the method specified. 25 | 26 | :param asset_prices: (pd.Dataframe) a dataframe of historical asset prices (daily close) 27 | :param solution: (str) the type of solution/algorithm to use to calculate the weights 28 | :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. Defaults to 29 | 'B' meaning daily business days which is equivalent to no resampling 30 | ''' 31 | 32 | if not isinstance(asset_prices, pd.DataFrame): 33 | raise ValueError("Asset prices matrix must be a dataframe") 34 | if not isinstance(asset_prices.index, pd.DatetimeIndex): 35 | raise ValueError("Asset prices dataframe must be indexed by date.") 36 | 37 | # Calculate returns 38 | asset_returns = self._calculate_returns(asset_prices, resample_by=resample_by) 39 | assets = asset_prices.columns 40 | 41 | if solution == 'inverse_variance': 42 | cov = asset_returns.cov() 43 | self.weights = self._inverse_variance(covariance=cov) 44 | else: 45 | raise ValueError("Unknown solution string specified. 
Supported solutions - inverse_variance.") 46 | self.weights = pd.DataFrame(self.weights) 47 | self.weights.index = assets 48 | self.weights = self.weights.T 49 | 50 | @staticmethod 51 | def _calculate_returns(asset_prices, resample_by): 52 | ''' 53 | Calculate the annualised mean historical returns from asset price data 54 | 55 | :param asset_prices: (pd.Dataframe) a dataframe of historical asset prices (daily close) 56 | :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. Defaults to 57 | 'B' meaning daily business days which is equivalent to no resampling 58 | :return: (pd.Dataframe) stock returns 59 | ''' 60 | 61 | asset_prices = asset_prices.resample(resample_by).last() 62 | asset_returns = asset_prices.pct_change() 63 | asset_returns = asset_returns.dropna(how='all') 64 | return asset_returns 65 | 66 | @staticmethod 67 | def _inverse_variance(covariance): 68 | ''' 69 | Calculate weights using inverse-variance allocation 70 | 71 | :param covariance: (pd.Dataframe) covariance dataframe of asset returns 72 | :return: (np.array) array of portfolio weights 73 | ''' 74 | 75 | ivp = 1. / np.diag(covariance) 76 | ivp /= ivp.sum() 77 | return ivp 78 | -------------------------------------------------------------------------------- /multiprocess.py: -------------------------------------------------------------------------------- 1 | # Linear Partitions [20.4.1] 2 | import pandas as pd 3 | import numpy as np 4 | import time 5 | import sys 6 | 7 | def linParts(numAtoms,numThreads): 8 | # partition of atoms with a single loop 9 | parts=np.linspace(0,numAtoms,min(numThreads,numAtoms)+1) 10 | parts=np.ceil(parts).astype(int) 11 | return parts 12 | 13 | def nestedParts(numAtoms,numThreads,upperTriang=False): 14 | # partition of atoms with an inner loop 15 | parts,numThreads_=[0],min(numThreads,numAtoms) 16 | for num in range(numThreads_): 17 | part=1+4*(parts[-1]**2+parts[-1]+numAtoms*(numAtoms+1.)/numThreads_) 18 | part=(-1+part**.5)/2. 19 | parts.append(part) 20 | parts=np.round(parts).astype(int) 21 | if upperTriang: # the first rows are heaviest 22 | parts=np.cumsum(np.diff(parts)[::-1]) 23 | parts=np.append(np.array([0]),parts) 24 | return parts 25 | 26 | def mpPandasObj(func,pdObj,numThreads=24,mpBatches=1,linMols=True,**kargs): 27 | ''' 28 | Parallelize jobs, return a dataframe or series 29 | + func: function to be parallelized. 
Returns a DataFrame 30 | + pdObj[0]: Name of argument used to pass the molecule 31 | + pdObj[1]: List of atoms that will be grouped into molecules 32 | + kwds: any other argument needed by func 33 | Example: df1=mpPandasObj(func,('molecule',df0.index),24,**kwds) 34 | ''' 35 | import pandas as pd 36 | #if linMols:parts=linParts(len(argList[1]),numThreads*mpBatches) 37 | #else:parts=nestedParts(len(argList[1]),numThreads*mpBatches) 38 | if linMols:parts=linParts(len(pdObj[1]),numThreads*mpBatches) 39 | else:parts=nestedParts(len(pdObj[1]),numThreads*mpBatches) 40 | 41 | jobs=[] 42 | for i in range(1,len(parts)): 43 | job={pdObj[0]:pdObj[1][parts[i-1]:parts[i]],'func':func} 44 | job.update(kargs) 45 | jobs.append(job) 46 | if numThreads==1:out=processJobs_(jobs) 47 | else: out=processJobs(jobs,numThreads=numThreads) 48 | if isinstance(out[0],pd.DataFrame):df0=pd.DataFrame() 49 | elif isinstance(out[0],pd.Series):df0=pd.Series() 50 | else:return out 51 | for i in out:df0=df0.append(i) 52 | df0=df0.sort_index() 53 | return df0 54 | # ======================================================= 55 | # single-thread execution for debugging [20.8] 56 | def processJobs_(jobs): 57 | # Run jobs sequentially, for debugging 58 | out=[] 59 | for job in jobs: 60 | out_=expandCall(job) 61 | out.append(out_) 62 | return out 63 | # ======================================================= 64 | # Example of async call to multiprocessing lib [20.9] 65 | import multiprocessing as mp 66 | import datetime as dt 67 | 68 | #________________________________ 69 | def reportProgress(jobNum,numJobs,time0,task): 70 | # Report progress as asynch jobs are completed 71 | msg=[float(jobNum)/numJobs, (time.time()-time0)/60.] 72 | msg.append(msg[1]*(1/msg[0]-1)) 73 | timeStamp=str(dt.datetime.fromtimestamp(time.time())) 74 | msg=timeStamp+' '+str(round(msg[0]*100,2))+'% '+task+' done after '+ \ 75 | str(round(msg[1],2))+' minutes. Remaining '+str(round(msg[2],2))+' minutes.' 76 | if jobNum> 1 not satisfied.") 39 | return (1 - np.euler_gamma) * ss.norm.ppf( 40 | 1 - 1.0 / N 41 | ) + np.euler_gamma * ss.norm.ppf(1 - np.exp(-1) / N) 42 | 43 | 44 | def dsr(test_sharpe, sharpe_std, N, T, skew, kurtosis): 45 | """ 46 | Deflated Sharpe Ratio statistic. DSR = PSR(SR_0). 47 | See paper for definition of SR_0. http://ssrn.com/abstract=2460551 48 | Parameters: 49 | test_sharpe : 50 | reported sharpe, to be tested. 
51 | sharpe_std : 52 | standard deviation of sharpe ratios from N trials / configurations 53 | N : 54 | number of backtest configurations 55 | T : 56 | number of observations 57 | skew : 58 | skew of returns 59 | kurtosis : 60 | kurtosis of returns 61 | Returns: 62 | DSR statistic 63 | """ 64 | # sharpe_std = np.std(sharpe_n, ddof=1) 65 | target_sharpe = sharpe_std * expected_max(N) 66 | 67 | dsr_stat = psr(test_sharpe, T, skew, kurtosis, target_sharpe) 68 | 69 | return dsr_stat 70 | -------------------------------------------------------------------------------- /synthetic_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from random import gauss 4 | from itertools import product 5 | import seaborn as sns 6 | sns.set() 7 | 8 | def main(): 9 | rPT = rSLm = np.linspace(0, 10, 21) 10 | count = 0 11 | for prod_ in product([10, 5, 0, -5, -10], [5, 10, 25, 50, 100]): 12 | count += 1 13 | coeffs = {'forecast': prod_[0], 'hl': prod_[1], 'sigma': 1} 14 | output = batch(coeffs, nIter=1e5, maxHP=100, rPT=rPT, rSLm=rSLm) 15 | return output 16 | 17 | def batch(coeffs, nIter=1e5, maxHP=100, rPT=np.linspace(0.5, 10, 20), rSLm=np.linspace(0.5, 10, 20), seed=0): 18 | phi = 2 ** (-1.0 / coeffs['hl']) 19 | output1 = [] 20 | for comb_ in product(rPT, rSLm): 21 | output2 = [] 22 | for iter_ in range(int(nIter)): 23 | p, hp, count = seed, 0, 0 24 | while True: 25 | p = (1 - phi) * coeffs['forecast'] + phi * p + coeffs['sigma'] * gauss(0, 1) 26 | cP = p - seed 27 | hp += 1 28 | if cP > comb_[0] or cP < -comb_[1] or hp > maxHP: 29 | output2.append(cP) 30 | break 31 | mean, std = np.mean(output2), np.std(output2) 32 | print(comb_[0], comb_[1], mean, std, mean / std) 33 | output1.append((comb_[0], comb_[1], mean, std, mean / std)) 34 | return output1 35 | 36 | def process_batch(coeffs_list, **kwargs): 37 | out = [] 38 | for coeffs in coeffs_list: 39 | out.append((coeffs, batch(coeffs, **kwargs))) 40 | return out 41 | -------------------------------------------------------------------------------- /testFunc/81c_mda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/81c_mda.png -------------------------------------------------------------------------------- /testFunc/81c_mdi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/81c_mdi.png -------------------------------------------------------------------------------- /testFunc/81c_sfi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/81c_sfi.png -------------------------------------------------------------------------------- /testFunc/82b_mda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/82b_mda.png -------------------------------------------------------------------------------- /testFunc/82b_mdi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/82b_mdi.png -------------------------------------------------------------------------------- /testFunc/82b_sfi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/82b_sfi.png -------------------------------------------------------------------------------- /testFunc/83b_mda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/83b_mda.png -------------------------------------------------------------------------------- /testFunc/83b_mdi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/83b_mdi.png -------------------------------------------------------------------------------- /testFunc/83b_sfi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/83b_sfi.png -------------------------------------------------------------------------------- /testFunc/84d_mdi_10chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/84d_mdi_10chunks.png -------------------------------------------------------------------------------- /testFunc/84d_mdi_1chunk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/84d_mdi_1chunk.png -------------------------------------------------------------------------------- /testFunc/85_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/85_1.png -------------------------------------------------------------------------------- /testFunc/85_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/85_2.png -------------------------------------------------------------------------------- /testFunc/85_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/85_3.png -------------------------------------------------------------------------------- /testFunc/85_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/85_4.png -------------------------------------------------------------------------------- /testFunc/85_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doda/advances-in-financial-ml-notes/8eb3b3148a45a5b591db4d994ed2f2cc8aec8428/testFunc/85_5.png -------------------------------------------------------------------------------- /testFunc/stats.csv: -------------------------------------------------------------------------------- 1 | ,method,scoring,minWLeaf,max_samples,I,R,N,oob,oos 2 | 0,MDI,accuracy,0.0,1.0,0.4496459135778261,0.45270218841638743,0.09765189800578648,0.95297,0.9392700000000002 3 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def getDailyVol(close, span0=100): 5 | # daily vol, reindexed to close 6 | df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1)) 7 | df0 = df0[df0>0] 8 | df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:]) 9 | df0 = close.loc[df0.index] / close.loc[df0.values].values - 1 # daily returns 10 | df0 = df0.ewm(span=span0).std() 11 | return df0 12 | --------------------------------------------------------------------------------
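Usage note: util.py and labeling.py are meant to be chained into a single triple-barrier labelling pass. Below is a minimal sketch of that pipeline, assuming `close` is a pd.Series of prices on a DatetimeIndex (for example the output of load_data.load_contracts); the helper name triple_barrier_labels, the 5-day holding period, the 1x barrier multiples and the 0.5% minimum return are illustrative placeholders, not settings taken from the notebooks.

import pandas as pd

from util import getDailyVol
from labeling import getBins, getEvents, getVerticalBarriers


def triple_barrier_labels(close: pd.Series) -> pd.DataFrame:
    # Volatility target per event: EWM standard deviation of daily returns
    trgt = getDailyVol(close, span0=100)
    # Sample an event wherever a volatility estimate exists; a CUSUM filter
    # (filters.py) could be substituted here
    t_events = trgt.dropna().index
    # Vertical barrier: force every event to close after at most 5 days
    t1 = getVerticalBarriers(close, t_events, numDays=5)
    # Horizontal barriers at 1x the volatility target on both sides; discard
    # events whose target return is below 0.5%
    events = getEvents(close, t_events, ptSl=[1, 1], trgt=trgt,
                       minRet=0.005, numThreads=1, t1=t1)
    # Label each event by the first barrier touched: 1 (profit-take),
    # -1 (stop-loss) or 0 (vertical barrier)
    return getBins(events, close)

Passing a side series to getEvents (and hence to getBins) switches the output to the meta-labelling case documented in getBins, where bin takes values in {0, 1} according to the sign of the realised return on the suggested side.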