├── Notebooks ├── image │ ├── betsize.jpg │ ├── pcacorr.jpg │ ├── pnl_2nd.png │ ├── volume.jpg │ ├── bet_prob.png │ ├── pnl_2nd2.png │ ├── positions.JPG │ ├── predprob.jpg │ ├── close_price.jpg │ ├── closereturn.jpg │ ├── feature_corr.png │ ├── order_flow.jpg │ ├── pcaloadings.jpg │ ├── closecumreturn.jpg │ ├── feature_corr2.png │ ├── True loss points.png │ ├── autogluon_results.JPG │ ├── pnl_first_model.jpg │ ├── cumsum_ret_1st_test.png │ ├── cumsum_ret_2nd_test.png │ ├── returns_of_1stmodel.jpg │ ├── Predicted loss points.png │ ├── Profit(1) vs Loss(-1).png │ └── cumsum_return_1stmodel.jpg └── project │ ├── 0. Get Market Data.ipynb │ ├── 2. Feature Selection.ipynb │ └── 5. Secondary model, Bet confidence.ipynb └── README.md /Notebooks/image/betsize.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/betsize.jpg -------------------------------------------------------------------------------- /Notebooks/image/pcacorr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/pcacorr.jpg -------------------------------------------------------------------------------- /Notebooks/image/pnl_2nd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/pnl_2nd.png -------------------------------------------------------------------------------- /Notebooks/image/volume.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/volume.jpg -------------------------------------------------------------------------------- /Notebooks/image/bet_prob.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/bet_prob.png -------------------------------------------------------------------------------- /Notebooks/image/pnl_2nd2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/pnl_2nd2.png -------------------------------------------------------------------------------- /Notebooks/image/positions.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/positions.JPG -------------------------------------------------------------------------------- /Notebooks/image/predprob.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/predprob.jpg -------------------------------------------------------------------------------- /Notebooks/image/close_price.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/close_price.jpg -------------------------------------------------------------------------------- /Notebooks/image/closereturn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/closereturn.jpg -------------------------------------------------------------------------------- /Notebooks/image/feature_corr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/feature_corr.png -------------------------------------------------------------------------------- /Notebooks/image/order_flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/order_flow.jpg -------------------------------------------------------------------------------- /Notebooks/image/pcaloadings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/pcaloadings.jpg -------------------------------------------------------------------------------- /Notebooks/image/closecumreturn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/closecumreturn.jpg -------------------------------------------------------------------------------- /Notebooks/image/feature_corr2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/feature_corr2.png -------------------------------------------------------------------------------- /Notebooks/image/True loss points.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/True loss points.png -------------------------------------------------------------------------------- /Notebooks/image/autogluon_results.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/autogluon_results.JPG -------------------------------------------------------------------------------- /Notebooks/image/pnl_first_model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/pnl_first_model.jpg -------------------------------------------------------------------------------- /Notebooks/image/cumsum_ret_1st_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/cumsum_ret_1st_test.png -------------------------------------------------------------------------------- /Notebooks/image/cumsum_ret_2nd_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/cumsum_ret_2nd_test.png -------------------------------------------------------------------------------- /Notebooks/image/returns_of_1stmodel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/returns_of_1stmodel.jpg -------------------------------------------------------------------------------- /Notebooks/image/Predicted loss points.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/Predicted loss points.png -------------------------------------------------------------------------------- /Notebooks/image/Profit(1) vs Loss(-1).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/Profit(1) vs Loss(-1).png -------------------------------------------------------------------------------- /Notebooks/image/cumsum_return_1stmodel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jo-cho/trading-rules-using-machine-learning/HEAD/Notebooks/image/cumsum_return_1stmodel.jpg -------------------------------------------------------------------------------- /Notebooks/project/0. Get Market Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# lib\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns;sns.set()\n", 14 | "\n", 15 | "import FinanceDataReader as fdr\n", 16 | "import yfinance as yf\n", 17 | "df_ohlc = fdr.DataReader('005930','2005-7-27','2022-1-17').iloc[:,0:4] #가격 수정되어 있음\n", 18 | "volume_yf = yf.download('005930.KS','2005-7-27','2022-1-17',auto_adjust=True).Volume # 수정거래량\n", 19 | "\n", 20 | "df_ohlcv = df_ohlc.join(volume_yf).dropna()\n", 21 | "df = tautil.ohlcv(df_ohlcv)\n", 22 | "\n", 23 | "quantity_ = pd.read_csv('C:data/순매수량.csv')\n", 24 | "quantity_ = quantity_.iloc[:-1,1:5]\n", 25 | "quantity_.columns = ['Date','individual','foreign','institutional']\n", 26 | "quantity_.index = quantity_['Date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))\n", 27 | "quantity_.drop(columns='Date',inplace=True)\n", 28 | "\n", 29 | "df = df.join(quantity_).dropna()" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | 
"file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.7" 50 | }, 51 | "toc": { 52 | "base_numbering": 1, 53 | "nav_menu": {}, 54 | "number_sections": true, 55 | "sideBar": true, 56 | "skip_h1_title": false, 57 | "title_cell": "Table of Contents", 58 | "title_sidebar": "Contents", 59 | "toc_cell": false, 60 | "toc_position": {}, 61 | "toc_section_display": true, 62 | "toc_window_display": false 63 | } 64 | }, 65 | "nbformat": 4, 66 | "nbformat_minor": 4 67 | } 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trading rules using machine learning 2 | This is my financial trading using ML. 3 | 4 | 5 | - [Example](https://github.com/jo-cho/trading-rules-using-machine-learning/blob/main/Notebooks/ETHUSD%20trading%20ML.ipynb) 6 | - [Project](https://github.com/jo-cho/trading-rules-using-machine-learning/tree/main/Notebooks/project) 7 | 8 | 9 | Momentum prediction and enhancing the strategy with machine learning 10 | 11 | 1. Financial Data and Bars 12 | - Form time/dollar bars with tick data 13 | 14 | 2. Get Buy/Sell Signals 15 | - Momentum strategy (RSI..) 16 | - Additional ML regime detector 17 | 18 | 3. Trading Rules 19 | - Set enter rules with trading signals from classifiers 20 | - Set exit rules with profit-taking, stop-loss rate, and maximum holding period 21 | - (For enhancing the strategy) Label the binary outcome (Profit or Loss) 22 | 23 | 24 | 4. 
Strategy-Enhancing ML Model 25 | 26 | - Get Features (X) 27 | 28 | - Market data & Technical analysis 29 | - Microstructure features 30 | - Macroeconomic variables 31 | - Fundamentals 32 | - *news/public sentiments* (in progress) 33 | 34 | - Feature Engineering 35 | - Feature selection, dimension reduction 36 | 37 | - Machine Learning Model Optimization 38 | - Cross-validation (time-series cv / Purged k-fold) 39 | - Hyperparameter tuning 40 | - AutoML with autogluon (or simply using ensemble methods such as Random forest, LightGBM, or XGBoost) 41 | - Metrics (accuracy, f1 score, roc-auc) 42 | 43 | - Outcome 44 | - Bet confidence (probability to accept a single trading signal) 45 | 46 | 5. Trading Decision 47 | - Decide to bet or pass for each trading signal from the momentum strategy. The ML model above will help you. 48 | - Bet sizing with some advanced models (in progress) 49 | 50 | 6. Backtesting 51 | - Cumulative returns, Sharpe ratio, max drawdown, win ratio 52 | 53 | 54 | # References: 55 | - Advances in Financial Machine Learning, Lopez de Prado (2018) 56 | 57 | # Flowchart 58 | ![ML Trade Networks](https://user-images.githubusercontent.com/52461409/132567663-eeead1ab-d3de-4cf3-a79f-6fea94722999.png) 59 | -------------------------------------------------------------------------------- /Notebooks/project/2. Feature Selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Feature Selection\n", 8 | "- Input\n", 9 | " - label: up-trend vs. 
(down or no trend)\n", 10 | " - periods : 2005-2010\n", 11 | " - features: market data features\n", 12 | "\n", 13 | "- Model: RF\n", 14 | " - 5 Feature Selection Methods: original, mda-kmeans, mda-optics, mda-onc, rfecv, sbfs(too much cost)\n", 15 | "\n", 16 | "- Output\n", 17 | " - best methods & selected features" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 46, 23 | "metadata": { 24 | "ExecuteTime": { 25 | "end_time": "2021-11-06T07:06:20.955568Z", 26 | "start_time": "2021-11-06T07:06:20.781355Z" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "# lib\n", 32 | "import numpy as np\n", 33 | "import pandas as pd\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns;sns.set()\n", 36 | "plt.style.use('tableau-colorblind10')\n", 37 | "\n", 38 | "from sklearn.ensemble import RandomForestClassifier\n", 39 | "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score\n", 40 | "\n", 41 | "from sklearn.cluster import OPTICS, KMeans\n", 42 | "from sklearn.metrics import silhouette_score\n", 43 | "from sklearn.feature_selection import RFECV, SequentialFeatureSelector\n", 44 | "\n", 45 | "# homemade\n", 46 | "from feature_engineering import cluster\n", 47 | "from feature_importance import importance\n", 48 | "from labeling import labeling\n", 49 | "from mlutil.pkfold import PKFold" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 47, 55 | "metadata": { 56 | "ExecuteTime": { 57 | "end_time": "2021-11-06T07:06:21.855360Z", 58 | "start_time": "2021-11-06T07:06:21.844363Z" 59 | }, 60 | "scrolled": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "import warnings\n", 65 | "warnings.filterwarnings(action='ignore')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 48, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "market_df = pd.read_csv('C:data/market_samsung.csv')\n", 75 | "market_df = market_df.rename(columns={market_df.columns[0]:'Date'})\n", 76 | 
"market_df.index = pd.to_datetime(market_df.Date)\n", 77 | "market_df.drop(columns='Date',inplace=True)\n", 78 | "market_df.dropna(inplace=True)\n", 79 | "\n", 80 | "feature_df = pd.read_csv('C:data/features_samsung.csv')\n", 81 | "feature_df = feature_df.rename(columns={feature_df.columns[0]:'Date'})\n", 82 | "feature_df.index = pd.to_datetime(feature_df.Date)\n", 83 | "feature_df.drop(columns='Date',inplace=True)\n", 84 | "feature_df.dropna(inplace=True)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 49, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2021-11-06T07:07:13.931903Z", 93 | "start_time": "2021-11-06T07:07:13.916273Z" 94 | } 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "X = feature_df.dropna()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 50, 104 | "metadata": { 105 | "ExecuteTime": { 106 | "end_time": "2021-11-06T07:07:14.449727Z", 107 | "start_time": "2021-11-06T07:07:14.429352Z" 108 | } 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "\n", 116 | "DatetimeIndex: 3873 entries, 2005-11-04 to 2021-10-15\n", 117 | "Data columns (total 32 columns):\n", 118 | " # Column Non-Null Count Dtype \n", 119 | "--- ------ -------------- ----- \n", 120 | " 0 momentum_rsi_15 3873 non-null float64\n", 121 | " 1 momentum_wr_15 3873 non-null float64\n", 122 | " 2 trend_adx_15 3873 non-null float64\n", 123 | " 3 trend_aroon_ind_20 3873 non-null float64\n", 124 | " 4 trend_dpo_20 3873 non-null float64\n", 125 | " 5 trend_macd_diff_25_10_9 3873 non-null float64\n", 126 | " 6 trend_mass_index_10_25 3873 non-null float64\n", 127 | " 7 trend_trix_15 3873 non-null float64\n", 128 | " 8 volatility_atr_10 3873 non-null float64\n", 129 | " 9 volatility_ui_15 3873 non-null float64\n", 130 | " 10 volume_cmf_20 3873 non-null float64\n", 131 | " 11 volume_fi_15 3873 non-null float64\n", 132 | " 12 volume_mfi_15 3873 non-null float64\n", 133 | " 13 
volume_sma_em_15 3873 non-null float64\n", 134 | " 14 volume_vpt 3873 non-null float64\n", 135 | " 15 ret_10 3873 non-null float64\n", 136 | " 16 ret_20 3873 non-null float64\n", 137 | " 17 ret_5 3873 non-null float64\n", 138 | " 18 std_30 3873 non-null float64\n", 139 | " 19 individual sma_5 3873 non-null float64\n", 140 | " 20 individual sma_20 3873 non-null float64\n", 141 | " 21 foreign sma_5 3873 non-null float64\n", 142 | " 22 foreign sma_20 3873 non-null float64\n", 143 | " 23 institutional sma_5 3873 non-null float64\n", 144 | " 24 institutional sma_20 3873 non-null float64\n", 145 | " 25 trend_back_scan_20 3873 non-null float64\n", 146 | " 26 trend_back_scan_60 3873 non-null float64\n", 147 | " 27 kyle_lambda 3873 non-null float64\n", 148 | " 28 amihud_lambda 3873 non-null float64\n", 149 | " 29 hasbrouck_lambda 3873 non-null float64\n", 150 | " 30 bekker_parkinson_vol 3873 non-null float64\n", 151 | " 31 corwin_schultz_estimator 3873 non-null float64\n", 152 | "dtypes: float64(32)\n", 153 | "memory usage: 998.5 KB\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "X.info()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Feature selection" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "## Clustering Methods" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### clustering" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 51, 185 | "metadata": { 186 | "ExecuteTime": { 187 | "end_time": "2021-11-06T07:07:18.248269Z", 188 | "start_time": "2021-11-06T07:07:18.236277Z" 189 | } 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.preprocessing import StandardScaler\n", 194 | "sc = StandardScaler()\n", 195 | "X_sc = sc.fit_transform(X)\n", 196 | "X_sc = pd.DataFrame(X_sc, index=X.index, columns=X.columns)" 197 | ] 198 | }, 199 | { 200 | "cell_type": 
"code", 201 | "execution_count": 69, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "X_sc=X_sc[:'2020']" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 70, 211 | "metadata": { 212 | "ExecuteTime": { 213 | "end_time": "2021-11-06T07:07:19.452157Z", 214 | "start_time": "2021-11-06T07:07:18.837293Z" 215 | } 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "silhouette_coefficients = []\n", 220 | "kmeans_kwargs = {\n", 221 | " \"init\": \"random\",\n", 222 | " \"n_init\": 10,\n", 223 | " \"max_iter\": 300,\n", 224 | " \"random_state\": 42,\n", 225 | "}\n", 226 | "\n", 227 | "for k in range(2, 30):\n", 228 | " kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n", 229 | " kmeans.fit(X.T)\n", 230 | " score = silhouette_score(X.T, kmeans.labels_)\n", 231 | " silhouette_coefficients.append(score)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 71, 237 | "metadata": { 238 | "ExecuteTime": { 239 | "end_time": "2021-11-06T07:07:28.075037Z", 240 | "start_time": "2021-11-06T07:07:27.989415Z" 241 | } 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "n_clusters=np.argmin(silhouette_coefficients)+2\n", 246 | "kmeans = KMeans(\n", 247 | " init=\"random\",\n", 248 | " n_clusters=n_clusters,\n", 249 | " n_init=10,\n", 250 | " max_iter=300,\n", 251 | " random_state=42)\n", 252 | "kmeans.fit(X_sc.T)\n", 253 | "clusters_kmeans = {i: X_sc.columns[np.where(kmeans.labels_ == i)[0]].tolist() for i in np.unique(kmeans.labels_)}" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 72, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "optics = OPTICS(min_cluster_size=2)\n", 263 | "optics.fit(X.T)\n", 264 | "clusters_optics = {i: X_sc.columns[np.where(optics.labels_ == i)[0]].tolist() for i in np.unique(optics.labels_)}" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 73, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": 
"stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "No feature/s found with low silhouette score. All features belongs to its respective clusters\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "# 오래 걸림.\n", 282 | "clusters_onc_dist = cluster.get_feature_clusters(X_sc, dependence_metric= 'distance_correlation')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "### mda - selection" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 74, 295 | "metadata": { 296 | "ExecuteTime": { 297 | "end_time": "2021-11-06T07:20:03.162545Z", 298 | "start_time": "2021-11-06T07:19:04.236938Z" 299 | } 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "#labeling\n", 304 | "trend_scanning_window = 60\n", 305 | "trend_scanning_q = 3\n", 306 | "ts_out = labeling.trend_scanning_label(market_df.close, window = trend_scanning_window, q = trend_scanning_q)\n", 307 | "mom_label = ts_out[0]\n", 308 | "y = np.sign(mom_label-1)+1 # up-trend vs others" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 75, 314 | "metadata": { 315 | "ExecuteTime": { 316 | "end_time": "2021-11-06T07:20:03.178375Z", 317 | "start_time": "2021-11-06T07:20:03.162545Z" 318 | } 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "raw_X = X_sc.copy()\n", 323 | "\n", 324 | "tmp = raw_X.join(y).dropna()\n", 325 | "X=tmp.iloc[:,:-1]\n", 326 | "y=tmp.iloc[:,-1]\n", 327 | "\n", 328 | "# train & test split\n", 329 | "# use previous data for feature selection\n", 330 | "X = X.loc['2005':'2010']\n", 331 | "y = y.loc['2005':'2010']" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 76, 337 | "metadata": { 338 | "ExecuteTime": { 339 | "end_time": "2021-11-06T07:20:03.210287Z", 340 | "start_time": "2021-11-06T07:20:03.181366Z" 341 | } 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "# CV\n", 346 | "n_cv=4\n", 347 | "t1 = ts_out[1].loc[X.index]\n", 348 | "cv = 
PKFold(n_cv,t1,0.01)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 77, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "clusters = [clusters_kmeans[i] for i in range(n_clusters)]\n", 358 | "clusters2 = [clusters_optics[i] for i in clusters_optics.keys()]\n", 359 | "clusters3 = clusters_onc_dist" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 78, 365 | "metadata": { 366 | "ExecuteTime": { 367 | "end_time": "2021-11-06T07:21:38.781197Z", 368 | "start_time": "2021-11-06T07:20:59.059244Z" 369 | } 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "clf = RandomForestClassifier(n_estimators=1000,class_weight='balanced')\n", 374 | "mda_cluster = importance.mean_decrease_accuracy(clf,X,y,cv,clustered_subsets=clusters)\n", 375 | "mda_cluster2 = importance.mean_decrease_accuracy(clf,X,y,cv,clustered_subsets=clusters2)\n", 376 | "mda_cluster3 = importance.mean_decrease_accuracy(clf,X,y,cv,clustered_subsets=clusters3)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 79, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "features_mda_kmeans = mda_cluster.loc[mda_cluster['mean'] == mda_cluster['mean'].max()].index\n", 386 | "features_mda_optics = mda_cluster2.loc[mda_cluster2['mean'] == mda_cluster2['mean'].max()].index\n", 387 | "features_mda_onc_dist = mda_cluster3.loc[mda_cluster3['mean'] == mda_cluster3['mean'].max()].index\n", 388 | "\n", 389 | "# 0에서 min 값으로 변경함." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 80, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "new_X1 = X[features_mda_kmeans]\n", 399 | "new_X2 = X[features_mda_optics]\n", 400 | "new_X3 = X[features_mda_onc_dist]" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "## Non-clustering methods\n", 408 | "\n", 409 | "- rfecv/ sbfs" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 81, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# 오래걸림\n", 419 | "\n", 420 | "rf = RandomForestClassifier(class_weight='balanced')\n", 421 | "\n", 422 | "min_features_to_select = 2 # Minimum number of features to consider\n", 423 | "rfecv = RFECV(\n", 424 | " estimator=rf,\n", 425 | " step=1,\n", 426 | " cv=cv,\n", 427 | " scoring=\"accuracy\",\n", 428 | " min_features_to_select=min_features_to_select,\n", 429 | ")\n", 430 | "new_X5_ = rfecv.fit_transform(X,y)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 82, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "new_X5 = pd.DataFrame(new_X5_, index=X.index, columns=rfecv.get_feature_names_out())" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 83, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "X_list = [X,new_X1,new_X2,new_X3,new_X5]" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "# results" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 84, 461 | "metadata": { 462 | "ExecuteTime": { 463 | "end_time": "2021-11-06T07:22:09.651645Z", 464 | "start_time": "2021-11-06T07:21:39.268311Z" 465 | } 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "clf = RandomForestClassifier(class_weight='balanced')\n", 470 | "score_list = []\n", 471 | "for X_ in X_list:\n", 472 | " accs = []\n", 473 | " f1 = []\n", 474 | " 
roc_auc = []\n", 475 | "\n", 476 | " for train, test in cv.split(X_, y):\n", 477 | " clf.fit(X_.iloc[train], y.iloc[train])\n", 478 | " y_true = y.iloc[test]\n", 479 | " y_pred = clf.predict(X_.iloc[test])\n", 480 | " y_probs = clf.predict_proba(X_.iloc[test])\n", 481 | " y_probs = y_probs[:, 1]\n", 482 | " accs.append(accuracy_score(y_true, y_pred))\n", 483 | " f1.append(f1_score(y_true, y_pred))\n", 484 | " roc_auc.append(roc_auc_score(y_true, y_probs))\n", 485 | "\n", 486 | "\n", 487 | " accs = np.mean(accs)\n", 488 | " f1 = np.mean(f1)\n", 489 | " roc = np.mean(roc_auc)\n", 490 | " scores = [accs, f1, roc]\n", 491 | " score_list.append(scores)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 85, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "result_df = pd.DataFrame(score_list,\n", 501 | " columns=['accuracy','f1 score','roc auc score'], \n", 502 | " index = ['original','mda_kmeans','mda_optics','mda_onc','rfecv'])" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 86, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/html": [ 513 | "
\n", 514 | "\n", 527 | "\n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | "
accuracyf1 scoreroc auc scoremean_
rfecv0.6417450.2792230.5359760.485648
original0.6300620.2439870.5100870.461379
mda_kmeans0.5965730.2516140.5044740.450887
mda_onc0.5926790.1967220.4643150.417905
mda_optics0.5919000.1560970.4790040.409001
\n", 575 | "
" 576 | ], 577 | "text/plain": [ 578 | " accuracy f1 score roc auc score mean_\n", 579 | "rfecv 0.641745 0.279223 0.535976 0.485648\n", 580 | "original 0.630062 0.243987 0.510087 0.461379\n", 581 | "mda_kmeans 0.596573 0.251614 0.504474 0.450887\n", 582 | "mda_onc 0.592679 0.196722 0.464315 0.417905\n", 583 | "mda_optics 0.591900 0.156097 0.479004 0.409001" 584 | ] 585 | }, 586 | "execution_count": 86, 587 | "metadata": {}, 588 | "output_type": "execute_result" 589 | } 590 | ], 591 | "source": [ 592 | "result_df['mean_'] = result_df.mean(axis=1)\n", 593 | "result_df.sort_values('mean_', ascending=False)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 87, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "Index(['trend_adx_15', 'trend_mass_index_10_25', 'trend_trix_15',\n", 605 | " 'volatility_atr_10', 'volatility_ui_15', 'volume_cmf_20',\n", 606 | " 'volume_mfi_15', 'std_30', 'individual sma_20', 'foreign sma_20',\n", 607 | " 'institutional sma_20', 'trend_back_scan_60', 'kyle_lambda',\n", 608 | " 'amihud_lambda', 'hasbrouck_lambda'],\n", 609 | " dtype='object')" 610 | ] 611 | }, 612 | "execution_count": 87, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "#best features\n", 619 | "selected_features = X_list[result_df['mean_'].argmax()].iloc[0:2]\n", 620 | "selected_features.columns" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 88, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "selected_features.to_csv('C:data/selected_features.csv')" 630 | ] 631 | } 632 | ], 633 | "metadata": { 634 | "kernelspec": { 635 | "display_name": "Python 3", 636 | "language": "python", 637 | "name": "python3" 638 | }, 639 | "language_info": { 640 | "codemirror_mode": { 641 | "name": "ipython", 642 | "version": 3 643 | }, 644 | "file_extension": ".py", 645 | "mimetype": "text/x-python", 646 | "name": "python", 647 
| "nbconvert_exporter": "python", 648 | "pygments_lexer": "ipython3", 649 | "version": "3.7.7" 650 | }, 651 | "toc": { 652 | "base_numbering": 1, 653 | "nav_menu": {}, 654 | "number_sections": true, 655 | "sideBar": true, 656 | "skip_h1_title": false, 657 | "title_cell": "Table of Contents", 658 | "title_sidebar": "Contents", 659 | "toc_cell": false, 660 | "toc_position": {}, 661 | "toc_section_display": true, 662 | "toc_window_display": false 663 | } 664 | }, 665 | "nbformat": 4, 666 | "nbformat_minor": 4 667 | } 668 | -------------------------------------------------------------------------------- /Notebooks/project/5. Secondary model, Bet confidence.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Secondary Model\n", 8 | "- inputs\n", 9 | " - labels: meta-label(outcome of primary model = trading strategy)\n", 10 | " - features: same as momentum classifiers\n", 11 | "\n", 12 | "- models: SVM, Random Forest, Gradient Boosting, LSTM\n", 13 | "\n", 14 | "- outputs\n", 15 | " - bet confidence\n", 16 | "\n", 17 | "- strategy enhancing\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# lib\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns;sns.set()\n", 31 | "plt.style.use('tableau-colorblind10')\n", 32 | "\n", 33 | "# different models\n", 34 | "from sklearn.linear_model import LogisticRegression\n", 35 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier\n", 36 | "from sklearn.svm import SVC\n", 37 | "from sklearn.naive_bayes import GaussianNB\n", 38 | "\n", 39 | "from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize\n", 40 | "from sklearn.model_selection import 
GridSearchCV\n", 41 | "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score\n", 42 | "\n", 43 | "# homemade\n", 44 | "from feature_engineering import dimension_reduction as DR\n", 45 | "from features import tautil\n", 46 | "from labeling import labeling\n", 47 | "from backtest import round_trip\n", 48 | "from triple_barrier import make_rt\n", 49 | "\n", 50 | "from mlutil.pkfold import PKFold" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import warnings\n", 60 | "warnings.filterwarnings(action='ignore')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# get X,y" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "market_df = pd.read_csv('C:data/market_samsung.csv')\n", 77 | "market_df = market_df.rename(columns={market_df.columns[0]:'Date'})\n", 78 | "market_df.index = pd.to_datetime(market_df.Date)\n", 79 | "market_df.drop(columns='Date',inplace=True)\n", 80 | "market_df.dropna(inplace=True)\n", 81 | "close = market_df.close['2010':'2020']\n", 82 | "\n", 83 | "feature_df = pd.read_csv('C:data/features_samsung.csv')\n", 84 | "feature_df = feature_df.rename(columns={feature_df.columns[0]:'Date'})\n", 85 | "feature_df.index = pd.to_datetime(feature_df.Date)\n", 86 | "feature_df.drop(columns='Date',inplace=True)\n", 87 | "feature_df.dropna(inplace=True)\n", 88 | "\n", 89 | "selected_features = pd.read_csv('C:data/selected_features.csv').columns[1:]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "feature = feature_df.dropna()\n", 99 | "feature = feature[selected_features]\n", 100 | "sc = StandardScaler()\n", 101 | "X_sc = sc.fit_transform(feature)\n", 102 | "X_sc = pd.DataFrame(X_sc, index=feature.index, 
columns=feature.columns)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "#benchmark\n", 112 | "barrier_bm = pd.read_csv('C:data/barrier_bm.csv')\n", 113 | "barrier_bm.index = pd.to_datetime(barrier_bm.Date)\n", 114 | "barrier_bm.exit = pd.to_datetime(barrier_bm.exit)\n", 115 | "barrier_bm.drop(columns='Date',inplace=True)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "#labeling\n", 125 | "barrier = pd.read_csv('C:data/barrier.csv')\n", 126 | "barrier.index = pd.to_datetime(barrier.Date)\n", 127 | "barrier.exit = pd.to_datetime(barrier.exit)\n", 128 | "barrier.drop(columns='Date',inplace=True)\n", 129 | "\n", 130 | "rts = make_rt(close,barrier.dropna())\n", 131 | "outcome = rts.rt_returns\n", 132 | "outcome.index = rts.open_dt" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "1.0 608\n", 144 | "0.0 421\n", 145 | "Name: rt_returns, dtype: int64" 146 | ] 147 | }, 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "#meta-label\n", 155 | "wl = np.sign(np.sign(outcome)+1)\n", 156 | "y_ = wl\n", 157 | "y_.value_counts()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 10, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "image/png": "\n", 168 | "text/plain": [ 169 | "
" 170 | ] 171 | }, 172 | "metadata": {}, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "loss = wl.value_counts()[0]\n", 178 | "win = wl.value_counts()[1]\n", 179 | "plt.figure(figsize=(10,3))\n", 180 | "plt.scatter(wl[wl==1].index,close.loc[wl[wl==1].index], alpha=0.5)\n", 181 | "plt.scatter(wl[wl==0].index,close.loc[wl[wl==0].index], marker='x', alpha=0.5)\n", 182 | "plt.legend(['win 1','lose 0'])\n", 183 | "plt.title('y (meta-label): win {}, lose {}'.format(win,loss))\n", 184 | "plt.show()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "raw_X = X_sc.copy()\n", 194 | "tmp = raw_X.join(y_).dropna()\n", 195 | "X=tmp.iloc[:,:-1]\n", 196 | "y=tmp.iloc[:,-1]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# Model Construction" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# Choose model\n", 213 | "\n", 214 | "# Cross Validation (k-fold)\n", 215 | "n_cv=4\n", 216 | "t1 = pd.to_datetime(barrier.exit.loc[X.index])\n", 217 | "cv = PKFold(n_cv,t1,0)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "SVC(C=10, probability=True)" 229 | ] 230 | }, 231 | "execution_count": 11, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "# Choose model (SVM-rbf)\n", 238 | "C = [0.1, 1,10]\n", 239 | "param_grid_rbf = dict(C=C)\n", 240 | "svc_rbf = SVC(kernel='rbf', probability=True)\n", 241 | "gs_svc_rbf = GridSearchCV(estimator=svc_rbf, param_grid= param_grid_rbf, cv=cv, scoring='precision')\n", 242 | "gs_svc_rbf.fit(X,y)\n", 243 | "svc_best = gs_svc_rbf.best_estimator_\n", 244 | "svc_best" 245 | ] 246 | }, 247 | { 248 | 
"cell_type": "code", 249 | "execution_count": 12, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "RandomForestClassifier(n_estimators=200)" 256 | ] 257 | }, 258 | "execution_count": 12, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "n_estimators = [200,1000]\n", 265 | "#max_depth = [3,7]\n", 266 | "param_grid_rfc = dict(n_estimators=n_estimators)\n", 267 | "rfc = RandomForestClassifier()\n", 268 | "gs_rfc = GridSearchCV(estimator=rfc, param_grid= param_grid_rfc, cv=cv, scoring='precision')\n", 269 | "gs_rfc.fit(X,y)\n", 270 | "rfc_best = gs_rfc.best_estimator_\n", 271 | "rfc_best" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 13, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "AdaBoostClassifier(learning_rate=1, n_estimators=100)" 283 | ] 284 | }, 285 | "execution_count": 13, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "n_estimators_ab = [50,100]\n", 292 | "learning_rate = [1,0.1]\n", 293 | "param_grid_abc = dict(n_estimators=n_estimators_ab, learning_rate=learning_rate)\n", 294 | "\n", 295 | "abc=AdaBoostClassifier()\n", 296 | "gs_abc = GridSearchCV(estimator=abc, param_grid= param_grid_abc, cv=cv, scoring='precision')\n", 297 | "gs_abc.fit(X,y)\n", 298 | "ada_best = gs_abc.best_estimator_\n", 299 | "ada_best" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 14, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "GradientBoostingClassifier(learning_rate=0.01, n_estimators=200)" 311 | ] 312 | }, 313 | "execution_count": 14, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "n_estimators_gb = [100,200]\n", 320 | "learning_rate = [0.1,0.01]\n", 321 | "param_grid_gbc = dict(n_estimators=n_estimators_gb, 
learning_rate=learning_rate)\n", 322 | "gbc=GradientBoostingClassifier()\n", 323 | "gs_gbc = GridSearchCV(estimator=gbc, param_grid= param_grid_gbc, cv=cv, scoring='precision')\n", 324 | "gs_gbc.fit(X,y)\n", 325 | "gbc_best = gs_gbc.best_estimator_\n", 326 | "gbc_best" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "# Model" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 15, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "clf_list = [svc_best, rfc_best, ada_best, gbc_best]\n", 343 | "estimators=['SVM_best','RF_best','AdaBoost_best','GradientBoost_best']\n", 344 | "scores_list = []\n", 345 | "y_preds_list = []\n", 346 | "y_probs_list = []\n", 347 | "\n", 348 | "# out-of-fold predictions and class-1 probabilities for each tuned model\n", 349 | "for clf in clf_list:\n", 350 | " y_preds_ = []\n", 351 | " y_probs_ = []\n", 352 | "\n", 353 | " for train, test in cv.split(X, y):\n", 354 | " clf.fit(X.iloc[train], y.iloc[train])\n", 355 | " y_true = y.iloc[test]\n", 356 | " y_pred = clf.predict(X.iloc[test])\n", 357 | " y_probs = clf.predict_proba(X.iloc[test])\n", 358 | " y_probs = y_probs[:, 1]\n", 359 | " y_pred_series = pd.Series(y_pred,index=y.iloc[test].index)\n", 360 | " y_probs_series = pd.Series(y_probs,index=y.iloc[test].index)\n", 361 | " y_preds_.append(y_pred_series)\n", 362 | " y_probs_.append(y_probs_series)\n", 363 | " \n", 364 | " \n", 365 | " y_preds__ = pd.concat([i for i in y_preds_])\n", 366 | " y_probs__ = pd.concat([i for i in y_probs_])\n", 367 | " y_true__ = y.loc[y_preds__.index]\n", 368 | " accs = accuracy_score(y_true__, y_preds__)\n", 369 | " f1=f1_score(y_true__, y_preds__)\n", 370 | " roc=roc_auc_score(y_true__, y_probs__)\n", 371 | " prec=precision_score(y_true__, y_preds__)\n", 372 | " score = [accs, f1, roc, prec]\n", 373 | " scores_list.append(score)\n", 374 | " y_preds_list.append(y_preds__)\n", 375 | " y_probs_list.append(y_probs__)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380
| "execution_count": 16, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "results = pd.DataFrame(scores_list, columns=['accuracy','f1 score','roc auc score','precision score'],index=estimators)\n", 385 | "result_show = results.sort_values('precision score', ascending=False)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 17, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/html": [ 396 | "
\n", 397 | "\n", 410 | "\n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
accuracyf1 scoreroc auc scoreprecision score
AdaBoost_best0.5675410.6319270.5524710.635607
SVM_best0.5442180.5853230.5742280.632887
RF_best0.5490770.6578170.5370730.596257
GradientBoost_best0.5199220.6091770.4903640.586890
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " accuracy f1 score roc auc score precision score\n", 455 | "AdaBoost_best 0.567541 0.631927 0.552471 0.635607\n", 456 | "SVM_best 0.544218 0.585323 0.574228 0.632887\n", 457 | "RF_best 0.549077 0.657817 0.537073 0.596257\n", 458 | "GradientBoost_best 0.519922 0.609177 0.490364 0.586890" 459 | ] 460 | }, 461 | "execution_count": 17, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "result_show" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 18, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "y_probs_df = pd.DataFrame()\n", 477 | "for i in range(len(estimators)):\n", 478 | " y_probs_df[estimators[i]] = y_probs_list[i]" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 19, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "# average of all models' predicted probabilities\n", 488 | "pred_prob = pd.Series(y_probs_df.mean(axis=1),index=y_probs_df.index)\n", 489 | "\n", 490 | "# alternative: use a single model's probabilities\n", 491 | "\n", 492 | "#y_probs_df_2 = y_probs_df[estimators[3]]\n", 493 | "#pred_prob = pd.Series(y_probs_df_2,index=y_probs_df_2.index)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 20, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "pred_prob2=pd.Series(normalize(pred_prob.to_frame().T).reshape(-1,), index=y_probs_df.index).rename('bet_confidence')" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 21, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "bet_confidence=pd.Series(MinMaxScaler().fit_transform(pred_prob2.to_frame()).reshape(-1,), index=y_probs_df.index).rename('bet_confidence')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 41, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "data": { 521 | "text/plain": [ 522 | "Text(0, 0.5, 'counts')" 523 | ] 524 | }, 525 | "execution_count": 41, 526 | "metadata": {}, 527 | 
"output_type": "execute_result" 528 | }, 529 | { 530 | "data": { 531 | "image/png": "\n", 532 | "text/plain": [ 533 | "
" 534 | ] 535 | }, 536 | "metadata": {}, 537 | "output_type": "display_data" 538 | } 539 | ], 540 | "source": [ 541 | "plt.title('Bet confidence distribution')\n", 542 | "plt.hist(bet_confidence, bins=30)[2]\n", 543 | "plt.xlabel('Bet confidence')\n", 544 | "plt.ylabel('counts')" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 45, 550 | "metadata": { 551 | "scrolled": false 552 | }, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "image/png": "\n", 557 | "text/plain": [ 558 | "
" 559 | ] 560 | }, 561 | "metadata": {}, 562 | "output_type": "display_data" 563 | } 564 | ], 565 | "source": [ 566 | "c = close.loc[bet_confidence.index]\n", 567 | "plt.figure(figsize=(10,5))\n", 568 | "plt.title('Bet confidence')\n", 569 | "plt.plot(close, alpha=0.1)\n", 570 | "plt.scatter(c.index,c, c = bet_confidence, s=20,cmap='vlag',vmin=0,vmax=1)\n", 571 | "plt.colorbar()\n", 572 | "plt.show()" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "# Algo Trading Backtest" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 24, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "barrier_bm = barrier_bm.dropna()\n", 589 | "barrier_before = barrier.loc[bet_confidence.index].dropna()\n", 590 | "barrier_enhanced = barrier_before.loc[bet_confidence.loc[bet_confidence>0.5].index]" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 25, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "rts_bm = make_rt(close,barrier_bm)\n", 600 | "rts_before = make_rt(close,barrier_before)\n", 601 | "rts_enhanced = make_rt(close,barrier_enhanced)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 26, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "result1 = pd.concat([round_trip.get_df_ann_sr(rts_bm,'Benchmark',years=11),\n", 611 | " round_trip.get_df_ann_sr(rts_before,'Trading Strategy (Primary)',years=11)],axis=1)\n", 612 | "\n", 613 | "df_sr = round_trip.get_df_ann_sr(rts_enhanced,'Enhanced Trading Strategy (Second)',years=11)\n", 614 | "result1 = result1.join(df_sr)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 27, 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/html": [ 625 | "
\n", 626 | "\n", 639 | "\n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | "
BenchmarkTrading Strategy (Primary)Enhanced Trading Strategy (Second)
avg_n_bets_per_year246.27272793.54545549.636364
win_ratio0.5205060.5904670.612844
annualized_sharpe_ratio0.5382321.5259951.623284
\n", 669 | "
" 670 | ], 671 | "text/plain": [ 672 | " Benchmark Trading Strategy (Primary) \\\n", 673 | "avg_n_bets_per_year 246.272727 93.545455 \n", 674 | "win_ratio 0.520506 0.590467 \n", 675 | "annualized_sharpe_ratio 0.538232 1.525995 \n", 676 | "\n", 677 | " Enhanced Trading Strategy (Second) \n", 678 | "avg_n_bets_per_year 49.636364 \n", 679 | "win_ratio 0.612844 \n", 680 | "annualized_sharpe_ratio 1.623284 " 681 | ] 682 | }, 683 | "execution_count": 27, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "result1" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 33, 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [ 698 | "result2 = pd.concat([round_trip.get_df_ann_sr(rts_bm,'Benchmark',years=11),\n", 699 | " round_trip.get_df_ann_sr(rts_before,'Trading Strategy (Primary)',years=11)],axis=1)\n", 700 | "winr = []\n", 701 | "for i in np.linspace(0.1,0.9,9):\n", 702 | " barrier_enhanced_ = barrier_before.loc[bet_confidence.loc[bet_confidence>=i].index]\n", 703 | " rts_enhanced_ = make_rt(close,barrier_enhanced_)\n", 704 | " df_sr = round_trip.get_df_ann_sr(rts_enhanced_,'b',years=11)\n", 705 | " winr.append(df_sr.T.win_ratio.iloc[0])" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 34, 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "dict_ = dict(zip(np.linspace(0.1,0.9,9).round(2),winr))" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 49, 720 | "metadata": {}, 721 | "outputs": [ 722 | { 723 | "data": { 724 | "image/png": "\n", 725 | "text/plain": [ 726 | "
" 727 | ] 728 | }, 729 | "metadata": {}, 730 | "output_type": "display_data" 731 | } 732 | ], 733 | "source": [ 734 | "df_res = pd.DataFrame.from_dict(dict_,orient='index')\n", 735 | "plt.figure(figsize=(10,5))\n", 736 | "plt.title(\"Hit-ratio of different thresholds strategy\")\n", 737 | "plt.bar(df_res.index, df_res[0], width=0.05)\n", 738 | "plt.plot(df_res)\n", 739 | "plt.ylabel('win ratio')\n", 740 | "plt.xlabel('bet confidence threshold')\n", 741 | "plt.ylim(0.5,0.8)\n", 742 | "plt.show()" 743 | ] 744 | } 745 | ], 746 | "metadata": { 747 | "kernelspec": { 748 | "display_name": "Python 3", 749 | "language": "python", 750 | "name": "python3" 751 | }, 752 | "language_info": { 753 | "codemirror_mode": { 754 | "name": "ipython", 755 | "version": 3 756 | }, 757 | "file_extension": ".py", 758 | "mimetype": "text/x-python", 759 | "name": "python", 760 | "nbconvert_exporter": "python", 761 | "pygments_lexer": "ipython3", 762 | "version": "3.7.7" 763 | }, 764 | "toc": { 765 | "base_numbering": 1, 766 | "nav_menu": {}, 767 | "number_sections": true, 768 | "sideBar": true, 769 | "skip_h1_title": false, 770 | "title_cell": "Table of Contents", 771 | "title_sidebar": "Contents", 772 | "toc_cell": false, 773 | "toc_position": {}, 774 | "toc_section_display": true, 775 | "toc_window_display": false 776 | } 777 | }, 778 | "nbformat": 4, 779 | "nbformat_minor": 4 780 | } 781 | --------------------------------------------------------------------------------