├── .gitignore
├── requirements.txt
├── README.md
├── rule_extractor_robotrader.py
└── rule_extractor_robotrader.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | .ipynb_checkpoints/rule_extractor_robotrader-checkpoint.ipynb
3 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | scipy
4 | scikit-learn
5 | joblib
6 | cupy
7 | tqdm
8 | 
9 | # TA-Lib requires manual installation, follow the guide:
10 | # https://blog.quantinsti.com/install-ta-lib-python/
11 | TA-Lib
12 | 
13 | # PyTables backend, required by pandas DataFrame.to_hdf (used to save results)
14 | tables
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Alanovski
2 | Welcome, algotrader.
3 | 
4 | # rule_extractor
5 | 🚀 Unlocking the Edge for AlgoTraders
6 | Built with Python, rule_extractor aims to bridge the gap between discretionary trading insights and fully automated trading systems.
7 | 
8 | ---
9 | 
10 | ## 📌 Contact
11 | 
12 | 📩 **Telegram:** [@alanovski](https://t.me/alanovski)
13 | 📸 **Instagram:** [@alan_autotrading](https://www.instagram.com/alan_autotrading)
14 | 
15 | If you have any questions or want to talk about algorithmic trading, feel free to contact me!
--------------------------------------------------------------------------------
/rule_extractor_robotrader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | 
7 | # Data handling libraries
8 | import pandas as pd
9 | import numpy as np
10 | 
11 | # Pandas display settings for easier inspection
12 | pd.set_option('display.max_columns', 100)
13 | pd.set_option('display.float_format', lambda x: '%.7f' % x)
14 | 
15 | # Technical analysis and statistics libraries
16 | import talib as ta
17 | from scipy.stats import pointbiserialr
18 | from sklearn.feature_selection import mutual_info_regression
19 | 
20 | # Parallelization and optimization libraries
21 | import joblib
22 | from joblib import Parallel, delayed, parallel_backend
23 | import cupy as cp
24 | 
25 | # Date and time handling
26 | import time
27 | from datetime import datetime
28 | 
29 | # Database and storage libraries
30 | import os
31 | import pickle
32 | import hashlib
33 | 
34 | # Miscellaneous utilities
35 | import uuid
36 | import itertools
37 | import re
38 | import gc
39 | from tqdm import tqdm
40 | from itertools import groupby
41 | 
42 | # In[2]:
43 | 
44 | 
45 | def transform_df(csv_name, exposicion_dias=3, threshold=25, date_column='DateTime', short=False):
46 |     df = pd.read_csv(csv_name+'.csv')
47 | 
48 |     def rsi_function(i):
49 |         rsi = ta.RSI(df['Close'], timeperiod=i)
50 |         return pd.DataFrame(rsi, columns=[f'rsi_{i}'])
51 | 
52 |     rsi_dfs = pd.concat([rsi_function(i) for i in range(2, 51, 2)], axis=1)
53 |     df = pd.concat([df, rsi_dfs], axis=1)
54 | 
55 |     def adx_function(i):
56 |         adx = ta.ADX(df['High'], df['Low'], df['Close'], timeperiod=i)
57 |         return pd.DataFrame(adx, columns=[f'adx_{i}'])
58 | 
59 |     adx_dfs = pd.concat([adx_function(i) for i in range(2, 51, 2)], axis=1)
60 |     df = pd.concat([df, adx_dfs], axis=1)
61 | 
62 |     def plus_di_function(i):
63 |         plus_di = ta.PLUS_DI(df['High'], df['Low'], df['Close'], timeperiod=i)
64 |         return pd.DataFrame(plus_di, columns=[f'plus_di_{i}'])
65 | 
66 |     plus_di_dfs = pd.concat([plus_di_function(i) for i in range(2, 51, 2)], axis=1)
67 |     df = pd.concat([df, plus_di_dfs], axis=1)
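    # The same pattern repeats for each indicator family below: compute the
    # indicator over a grid of lookback periods and concatenate the resulting
    # columns, so that rule generation can later compare any period against
    # any other.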
68 | 
69 |     def minus_di_function(i):
70 |         minus_di = ta.MINUS_DI(df['High'], df['Low'], df['Close'], timeperiod=i)
71 |         return pd.DataFrame(minus_di, columns=[f'minus_di_{i}'])
72 | 
73 |     minus_di_dfs = pd.concat([minus_di_function(i) for i in range(2, 51, 2)], axis=1)
74 |     df = pd.concat([df, minus_di_dfs], axis=1)
75 | 
76 |     def willr_function(i):
77 |         willr = ta.WILLR(df['High'], df['Low'], df['Close'], timeperiod=i)
78 |         return pd.DataFrame(willr, columns=[f'willr_{i}'])
79 | 
80 |     willr_dfs = pd.concat([willr_function(i) for i in range(2, 51, 2)], axis=1)
81 |     df = pd.concat([df, willr_dfs], axis=1)
82 | 
83 |     def ma_function(i):
84 |         ma = ta.MA(df['Close'], timeperiod=i, matype=0)
85 |         return pd.DataFrame(ma, columns=[f'sma_{i}'])
86 | 
87 |     ma_dfs = pd.concat([ma_function(i) for i in range(2, 301, 2)], axis=1)
88 |     df = pd.concat([df, ma_dfs], axis=1)
89 | 
90 |     def ema_function(i):
91 |         ma = ta.EMA(df['Close'], timeperiod=i)
92 |         return pd.DataFrame(ma, columns=[f'mema_{i}'])
93 | 
94 |     ema_dfs = pd.concat([ema_function(i) for i in range(2, 301, 2)], axis=1)
95 |     df = pd.concat([df, ema_dfs], axis=1)
96 | 
97 |     def atr_function(i):
98 |         atr = ta.ATR(df['High'], df['Low'], df['Close'], timeperiod=i)
99 |         return pd.DataFrame(atr, columns=[f'atr_{i}'])
100 | 
101 |     atr_dfs = pd.concat([atr_function(i) for i in range(2, 51, 2)], axis=1)
102 |     df = pd.concat([df, atr_dfs], axis=1)
103 | 
104 |     def calculate_ibs(high, low, close):  # defined but never invoked below; no 'ibs_' columns are actually created
105 |         ibs = (close - low) / (high - low)
106 |         ibs = np.round(ibs, 2)
107 |         return pd.DataFrame(ibs, columns=['ibs_'])
108 | 
109 | 
110 |     def stdev_function(i):
111 |         stdev = ta.STDDEV(df['Close'], timeperiod=i, nbdev=1)
112 |         return pd.DataFrame(stdev, columns=[f'stdev_{i}'])
113 | 
114 |     stdev_dfs = pd.concat([stdev_function(i) for i in range(2, 51, 2)], axis=1)
115 |     df = pd.concat([df, stdev_dfs], axis=1)
116 | 
117 |     # Bollinger bands over several deviation multiples; only the upper and
118 |     # lower bands are generated (no 'bb_middle' columns are created).
121 |     def bband_function(i, dev=2):
122 |         upperband, middleband, lowerband = ta.BBANDS(df['Close'], timeperiod=i, nbdevup=dev, nbdevdn=dev, matype=0)
123 |         return pd.DataFrame({f'bb_upper_{dev}_{i}': upperband, f'bb_lower_{dev}_{i}': lowerband})
124 | 
125 |     for dev in range(2,6):
126 |         bband_dfs = pd.concat([bband_function(i, dev) for i in range(5, 31, 2)], axis=1)
127 |         df = pd.concat([df, bband_dfs], axis=1)
128 | 
129 | 
130 |     def macd_function(fp, slp, sp):  # defined but never invoked; no MACD columns are created
131 |         macd, macdsignal, macdhist = ta.MACD(df['Close'], fastperiod=fp, slowperiod=slp, signalperiod=sp)
132 |         return pd.DataFrame({f'macd_{fp}': macd, f'macdsig_{slp}': macdsignal, f'macdh_{sp}': macdhist})
133 | 
134 |     macd_dfs = []  # the MACD parameter grid below is likewise unused
135 |     fastperiod_values = [7, 12, 26, 52]
136 |     slowperiod_values = [13, 26, 52]
137 |     signalperiod_values = [3, 6, 9]
138 | 
139 | 
140 |     def mom_function(i):
141 |         momentum = ta.MOM(df['Close'], timeperiod=i)
142 |         return pd.DataFrame(momentum, columns=[f'mom_{i}'])
143 | 
144 |     momentum_dfs = pd.concat([mom_function(i) for i in range(2, 31, 2)], axis=1)
145 |     df = pd.concat([df, momentum_dfs], axis=1)
146 | 
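    # Aroon features: ta.AROONOSC returns aroon_up - aroon_down as a single
    # oscillator (stored under 'aaro_'), while ta.AROON returns the raw
    # (aroon_down, aroon_up) pair, captured below as 'aarod_' / 'aarou_'.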
147 |     def aaron_up_function(i):
148 |         aroon_up = ta.AROONOSC(df['High'], df['Low'], timeperiod=i)
149 |         return pd.DataFrame(aroon_up, columns=[f'aaro_{i}'])
150 | 
151 |     aaronu_dfs = pd.concat([aaron_up_function(i) for i in range(2, 51, 2)], axis=1)
152 |     df = pd.concat([df, aaronu_dfs], axis=1)
153 | 
154 |     def aaron_up_function2(i):
155 |         _, aroon_up = ta.AROON(df['High'], df['Low'], timeperiod=i)
156 |         return pd.DataFrame(aroon_up, columns=[f'aarou_{i}'])
157 | 
158 |     aaronu_dfs = pd.concat([aaron_up_function2(i) for i in range(2, 51, 2)], axis=1)
159 |     df = pd.concat([df, aaronu_dfs], axis=1)
160 | 
161 |     def aaron_dw_function2(i):
162 |         aroon_down, _ = ta.AROON(df['High'], df['Low'], timeperiod=i)
163 |         return pd.DataFrame(aroon_down, columns=[f'aarod_{i}'])
164 | 
165 |     aaronu_dfs = pd.concat([aaron_dw_function2(i) for i in range(2, 51, 2)], axis=1)
166 |     df = pd.concat([df, aaronu_dfs], axis=1)
167 | 
168 |     def roc_function(i):  # defined but never invoked; no 'roc_' columns are created
169 |         roc_t = ta.ROC(df['Close'], timeperiod=i)
170 |         return pd.DataFrame(roc_t, columns=[f'roc_{i}'])
171 | 
172 | 
173 | 
174 | 
175 |     duplicates = df.columns[df.columns.duplicated()]
176 |     df = df.loc[:, ~df.columns.duplicated()]  # drop duplicated columns, keeping the first occurrence
177 | 
178 |     def shift_column(column, i):
179 |         shifted = df[column].shift(i)
180 |         if('ibs_' in column):
181 |             column = 'ibs'
182 |         return shifted.rename(f'{column}_sft_{i}')
183 | 
184 |     columns = df.columns
185 |     lista_shift = ['rsi', 'adx', 'plus_di', 'minus_di', 'willr', 'bb', 'atr', 'stdev', 'Close', 'High', 'Low', 'aaro', 'mom']
186 |     indicator_columns = {col for col in columns if any(name in col for name in lista_shift)}
187 | 
188 |     shifted_columns = []
189 |     shift_value = 3
190 | 
191 |     for column in indicator_columns:
192 |         for i in range(1, shift_value + 1):
193 |             shifted_series = shift_column(column, i)
194 |             shifted_columns.append(shifted_series)
195 | 
196 |     df = pd.concat([df] + shifted_columns, axis=1)
197 | 
198 | 
199 |     pips = 0
200 |     if "JPY" in csv_name:
201 |         pips = 100    # JPY pairs: 1 pip = 0.01
202 |     else:
203 |         pips = 10000  # other pairs: 1 pip = 0.0001
204 | 
205 | 
206 |     for i in range (2,31, 2):
207 |         ret = []
208 |         new_cols = []
209 |         if(short == True):
210 |             ret = ((df["Close"].shift(-1 * i) - df["Close"]) * pips) + 2  # the +2 becomes a 2-pip cost once the sign flips below
211 |             new_cols = pd.DataFrame(np.array(ret) * -1, columns=[f"Return_{i}"])
212 |         else:
213 |             ret = ((df["Close"].shift(-1 * i) - df["Close"]) * pips) - 2  # flat 2-pip transaction-cost assumption
214 |             new_cols = pd.DataFrame(np.array(ret), columns=[f"Return_{i}"])
215 | 
216 |         df = pd.concat([df, new_cols], axis=1)
217 | 
218 | 
219 |     if(short == True):
220 |         ret = ((df["Close"].shift(-1 * exposicion_dias) - df["Close"]) * pips) + 2
221 |     else:
222 |         ret = ((df["Close"].shift(-1 * exposicion_dias) - df["Close"]) * pips) - 2
223 | 
224 |     new_cols = pd.DataFrame(np.array(ret), columns=["Return"])
225 | 
226 |     if(short == True):
227 |         new_cols["Return"] = new_cols["Return"] * -1
228 | 
229 |     df = pd.concat([df, new_cols], axis=1)
230 | 
231 |     # Binary target: 1 when the forward return at the chosen exposure clears the threshold.
235 |     target = (df[f'Return_{exposicion_dias}'] >= threshold).astype(int)
236 |     df = pd.concat([df, target.rename("Target")], axis=1)
237 | 
238 |     for i in range (4,31, 2):
239 |         target = (df[f'Return_{i}'] >= threshold).astype(int)
240 |         df = pd.concat([df, target.rename(f'Target_{i}')], axis=1)
241 | 
242 | 
243 | 
244 |     df = df.copy()
245 |     df[date_column] = pd.to_datetime(df[date_column], dayfirst=True)
246 | 
247 |     day_of_month = df[date_column].apply(lambda x: x.day)
248 |     month = df[date_column].apply(lambda x: x.month)
249 |     day_of_week = df[date_column].apply(lambda x: x.weekday())  # 0 = Monday .. 6 = Sunday
250 |     year = df[date_column].apply(lambda x: x.year)
251 | 
252 |     df[date_column] = df[date_column].dt.strftime('%d/%m/%Y %H:%M')
253 | 
254 |     new_columns = pd.concat([day_of_month.rename('day_of_month'), month.rename('month'), day_of_week.rename('day_of_week'), year.rename('year')], axis=1)
255 |     df = pd.concat([df, new_columns], axis=1)
256 | 
257 |     return df
258 | 
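# A minimal usage sketch for transform_df / split_data_validation (mirroring
# the calls made further below; the '2015' validation cut is illustrative):
#
#   df = transform_df('GBPNZ_H12', exposicion_dias=4, threshold=65, short=True)
#   df_train, df_valid = split_data_validation(df, year_max_cut='2022', year_min_cur='2015')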
259 | def split_data_validation(df, year_max_cut = '2022', year_min_cur= '2008'):
260 |     data = df.query('year >= ' + year_max_cut).copy()
261 |     df = df.query(year_min_cur + ' < year < ' + year_max_cut)  # strict upper bound so the validation years do not leak into the training slice
262 |     df = df.reset_index(drop=True)
263 |     return df, data
264 | 
265 | def map_creator(df):
266 |     inicio = time.time()
267 |     columns = df.columns
268 |     no_sft_columns = [col for col in columns if 'sft' not in col]
269 |     column_map = create_column_map(df, columns, no_sft_columns)
270 |     fin = time.time()
271 |     duracion = fin - inicio
272 |     print("Map creation took", duracion, "seconds")
273 |     return column_map
274 | 
275 | def create_column_map(df, columns, no_sft_columns):
276 |     column_map = {}
277 |     rsi_columns = {col for col in columns if 'rsi_' in col}
278 |     adx_columns = {col for col in columns if 'adx' in col}
279 |     plus_di_columns = {col for col in columns if 'plus_di' in col}
280 |     minus_di_columns = {col for col in columns if 'minus_di' in col}
281 |     will_columns = {col for col in columns if 'willr' in col}
282 |     sma_columns = {col for col in columns if 'sma' in col}
283 |     mema_columns = {col for col in columns if 'mema' in col}
284 |     ibs_columns = {col for col in columns if 'ibs_' in col}
285 |     atr_columns = {col for col in columns if 'atr' in col}
286 |     bbup_columns = {col for col in columns if 'bb_upper' in col}
287 |     bbmid_columns = {col for col in columns if 'bb_middle' in col}
288 |     bblow_columns = {col for col in columns if 'bb_lower' in col}
289 |     macd_columns = {col for col in columns if 'macd' in col}
290 |     macdsig_columns = {col for col in columns if 'macdsig' in col}
291 |     macdh_columns = {col for col in columns if 'macdh' in col}
292 |     ibsma_columns = {col for col in columns if 'ibma' in col}
293 |     hh_columns = {col for col in columns if 'hh' in col}
294 |     dayw_columns = {col for col in columns if 'day_of_week' in col}
295 |     daym_columns = {col for col in columns if 'day_of_month' in col}
296 |     ll_columns = {col for col in columns if 'll' in col}
297 |     mom_columns = {col for col in columns if 'mom' in col}
298 |     aaro_columns = {col for col in columns if 'aaro_' in col}
299 |     roc_columns = {col for col in columns if 'roc' in col}
300 |     stoch_columns = {col for col in columns if 'stoch' in col}
301 |     stochk_columns = {col for col in columns if 'stochk' in col}
302 |     stochd_columns = {col for col in columns if 'stochd' in col}
303 |     stdev_columns = {col for col in columns if 'stdev' in col}
304 |     aarod_columns = {col for col in columns if 'aarod_' in col}
305 |     aarou_columns = {col for col in columns if 'aarou_' in col}
306 | 
307 |     for column in no_sft_columns:
308 |         if 'rsi' in column:
309 |             filtered_columns = rsi_columns - {column}
310 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col and 'ibs' not in col}
311 |             column_map[column] = [list(range(0, 101)), list(filtered_columns)]
312 | 
313 |         elif 'adx' in column:
314 |             filtered_columns = adx_columns - {column}
315 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
316 |             column_map[column] = [list(range(0, 101)), list(filtered_columns)]
317 | 
318 |         elif 'plus_di' in column:
319 |             filtered_columns = plus_di_columns - {column}
320 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
321 |             column_map[column] = [list(range(0, 101)), list(filtered_columns)]
322 | 
323 |         elif
'minus_di' in column: 324 | filtered_columns = minus_di_columns - {column} 325 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 326 | column_map[column] = [list(range(0, 101)), list(filtered_columns)] 327 | 328 | elif 'willr' in column: 329 | filtered_columns = will_columns - {column} 330 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 331 | column_map[column] = [list(x for x in range(0, -101, -1)), list(filtered_columns)] 332 | 333 | elif 'sma' in column: 334 | filtered_columns = sma_columns - {column} 335 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 336 | column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)] 337 | 338 | elif 'mema' in column: 339 | filtered_columns = mema_columns - {column} 340 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 341 | column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)] 342 | 343 | elif 'ibs_' in column: 344 | filtered_columns = ibs_columns - {column} 345 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 346 | column_map[column] = [list([i / 100 for i in range(0, 101)]), list(filtered_columns)] 347 | 348 | elif 'atr' in column: 349 | filtered_columns = atr_columns - {column} 350 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 351 | column_map[column] = [list(filtered_columns), list(filtered_columns)] 352 | 353 | elif 'bb_upper' in column: 354 | filtered_columns = bbup_columns - {column} 355 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 356 | column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)] 357 | 358 | elif 'bb_middle' in column: 359 | filtered_columns = bbmid_columns - {column} 360 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 361 | column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)] 362 | 363 | elif 'bb_lower' in column: 364 | filtered_columns = bblow_columns - {column} 365 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 366 | column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)] 367 | 368 | elif 'macd' in column: 369 | filtered_columns = macd_columns - {column} 370 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 371 | column_map[column] = [list(filtered_columns), list(macdsig_columns)] 372 | 373 | elif 'macdsig' in column: 374 | filtered_columns = macdsig_columns - {column} 375 | filtered_columns = {col for col in filtered_columns if 'condition' not in col} 376 | column_map[column] = [list(filtered_columns), list(macd_columns)] 377 | 378 | elif 'macdh' in column: 379 | filtered_columns = macdh_columns - {column} 380 | filtered_columns = {col for 
col in filtered_columns if 'condition' not in col}
381 |             column_map[column] = [list(filtered_columns), list(filtered_columns)]
382 | 
383 |         elif 'ibma' in column:
384 |             filtered_columns = ibsma_columns - {column}
385 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
386 |             column_map[column] = [list(filtered_columns), list(ibs_columns)]
387 | 
388 |         elif 'hh' in column:
389 |             filtered_columns = hh_columns - {column}
390 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
391 |             column_map[column] = [list(filtered_columns), list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3'])]
392 | 
393 |         elif 'day_of_week' in column:
394 |             filtered_columns = dayw_columns - {column}
395 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
396 |             column_map[column] = [list([i for i in range(0, 7)]), list([i for i in range(0, 7)])]
397 | 
403 |         elif 'day_of_month' in column:
404 |             filtered_columns = daym_columns - {column}
405 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
406 |             column_map[column] = [list([i for i in range(1, 32)]), list([i for i in range(1, 32)])]
407 | 
408 | 
409 |         elif 'mom' in column:
410 |             filtered_columns = mom_columns - {column}
411 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
412 |             column_map[column] = [list(filtered_columns), list(filtered_columns)]
413 | 
414 |         elif 'aaro_' in column:
415 |             filtered_columns = aaro_columns - {column}
416 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
417 |             column_map[column] = [list(x for x in range(-100, 101, 1)), list(filtered_columns)]
418 | 
419 |         elif 'aarod_' in column:
420 |             filtered_columns = aarod_columns - {column}
421 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
422 |             column_map[column] = [list(x for x in range(0, 101)), list(filtered_columns)]
423 | 
424 |         elif 'aarou_' in column:
425 |             filtered_columns = aarou_columns - {column}
426 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
427 |             column_map[column] = [list(x for x in range(0, 101)), list(filtered_columns)]
428 | 
429 |         elif 'roc' in column:
430 |             filtered_columns = roc_columns - {column}
431 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
432 |             column_map[column] = [list(filtered_columns), list(filtered_columns)]
433 | 
434 |         elif 'stochd' in column:
435 |             filtered_columns = stochd_columns - {column}
436 |             comun_columns = stoch_columns - {column}
437 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
438 |             filtered_comun_columns = {col for col in comun_columns if 'condition' not in col}
439 |             column_map[column] = [list(filtered_columns), list(filtered_comun_columns)]
440 | 
441 |         elif 'stochk' in column:
442 |             filtered_columns = stochk_columns - {column}
443 |             comun_columns = stoch_columns - {column}
444 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
445 |             filtered_comun_columns = {col for col in comun_columns if 'condition' not in col}
446 |             column_map[column] = [list(filtered_columns), list(filtered_comun_columns)]
447 | 
448 |         elif 'stdev' in column:
449 |             filtered_columns = stdev_columns - {column}
450 |             filtered_columns = {col for col in filtered_columns if 'condition' not in col}
451 |             column_map[column] = [list(filtered_columns), list(filtered_columns)]
452 | 
453 | 
454 |     return column_map
455 | 
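# Illustrative shape of the map built above (values hypothetical; the related-
# column lists are unordered because they come from sets):
#
#   column_map['rsi_14'] == [[0, 1, ..., 100], ['rsi_2', 'rsi_4', ...]]
#
# i.e. for each feature, a grid of literal thresholds to test plus the related
# columns it may be compared against.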
456 | 
457 | # In[3]:
458 | 
459 | 
460 | def crear_directorio(nombre_carpeta):
461 |     try:
462 |         os.makedirs(nombre_carpeta)
463 |         print(f"Directory '{nombre_carpeta}' created successfully.")
464 |     except FileExistsError:
465 |         print(f"Directory '{nombre_carpeta}' already exists.")
466 |     except Exception as e:
467 |         print(f"Error creating directory '{nombre_carpeta}': {e}")
468 | 
469 | 
470 | # In[4]:
471 | 
472 | 
473 | date_column = 'DateTime'
474 | h12 = 'GBPNZ_H12'
475 | 
476 | 
477 | # In[5]:
478 | 
479 | 
480 | exposicion_dias=4
481 | threshold=65
482 | df = transform_df(h12, exposicion_dias, threshold, short=True)
483 | dir_results_name = f'results_{exposicion_dias}_{threshold}'
484 | crear_directorio(dir_results_name)
485 | full_df = df.copy()
486 | df = df.query('2016 < year < 2019')  # in-sample window used for rule mining
487 | 
488 | df = df.reset_index(drop=True)
489 | column_map = map_creator(df)
490 | 
491 | 
492 | 
493 | # In[6]:
494 | 
495 | 
496 | df.head()
497 | 
498 | 
499 | # In[7]:
500 | 
501 | 
502 | len(df.columns)
503 | 
504 | 
505 | # In[8]:
506 | 
507 | 
508 | return_columns = [col for col in df.columns if 'Return_' in col]
509 | df[return_columns+['Return']].tail(25)
510 | 
511 | 
512 | # In[9]:
513 | 
514 | 
515 | print(len(df[df['day_of_week'] == 6]))  # Sunday bars (weekday() runs 0=Mon .. 6=Sun)
516 | print(len(df[df['day_of_week'] == 7]))  # always 0: weekday() never returns 7
517 | 
518 | 
519 | # In[10]:
520 | 
521 | 
522 | def generate_all_rules(column_map, df):  # builds condition strings such as 'rsi_14 >= 30' or 'rsi_14 <= rsi_28'
523 |     all_rules = []
524 | 
525 |     for column, (possible_values, related_columns) in column_map.items():
526 |         if 'day' in column:
527 |             operators = ['>', '<', '==', '>=', '<=']
528 |         else:
529 |             operators = ['>=', '<=']
530 | 
531 |         for value in possible_values:
532 |             for operator in operators:
533 |                 condition = f"{column} {operator} {value}"
534 |                 all_rules.append(condition)
535 | 
536 |         for value in related_columns:
537 |             for operator in operators:
538 |                 related_condition = f"{column} {operator} {value}"
539 |                 all_rules.append(related_condition)
540 | 
541 |     return all_rules
542 | 
543 | 
544 | 
545 | 
546 | # In[11]:
547 | 
548 | 
549 | all_rules = generate_all_rules(column_map, df)
550 | 
551 | 
552 | # In[12]:
553 | 
554 | 
555 | all_rules
556 | 
557 | 
558 | # In[13]:
559 | 
560 | 
561 | len(all_rules)
562 | 
563 | 
564 | # In[14]:
565 | 
566 | 
567 | def generate_hash(s):  # hashes a rule's structural family: indicator prefix + operator + comparand type
568 |     try:
569 |         first_part = re.findall(r'^\D+', s.split(' ')[0])[0]
570 |         comparison_operator = re.findall(r'[<=>=]+', s)[0]  # character class: any run of '<', '=', '>'
571 |         try:
572 |             second_part = re.findall(r'^\D+', s.split(' ')[2].split('_')[0])[0]
573 |         except IndexError:
574 |             second_part = 'num'
575 |         combined = first_part + comparison_operator + second_part
576 |         return hashlib.sha256(combined.encode()).hexdigest()
577 |     except Exception as e:
578 |         return 12345678910  # fallback sentinel when parsing fails (an int, unlike the hex digest above)
579 | 
580 | 
581 | def process_rule_chunk(df, data, df_columns, rule_chunk, target_values, target, returns_columns):
582 |     chunk_results = {}
583 |     chunk_stats = {}
584 |     for rule in rule_chunk:
585 |         try:
586 |             parts = rule.split()
587 |             if len(parts) == 3:
588 |                 column1, operator, column2_or_value = parts
589 |                 idx1 = df_columns.get_loc(column1)
590 |                 idx_return = df_columns.get_loc('Return')
591 | 
592 |                 try:
593 |                     value = float(column2_or_value)
594 |                     idx2 = None
595 |                 except ValueError:
596 |                     idx2 =
df_columns.get_loc(column2_or_value) 597 | 598 | condition_eval_df = None 599 | if operator == '>=': 600 | if idx2 is None: 601 | condition_eval_df = (data[:, idx1] >= value) 602 | else: 603 | condition_eval_df = (data[:, idx1] >= data[:, idx2]) 604 | elif operator == '<=': 605 | if idx2 is None: 606 | condition_eval_df = (data[:, idx1] <= value) 607 | else: 608 | condition_eval_df = (data[:, idx1] <= data[:, idx2]) 609 | elif operator == '==': 610 | if idx2 is None: 611 | condition_eval_df = (data[:, idx1] == value) 612 | else: 613 | condition_eval_df = (data[:, idx1] == data[:, idx2]) 614 | 615 | condition_eval = condition_eval_df.astype(np.int8) 616 | ones_count = np.sum(condition_eval) 617 | if ones_count < 100: 618 | continue 619 | 620 | correlation, _ = pointbiserialr(condition_eval, target) 621 | if np.isnan(correlation): 622 | continue 623 | 624 | length = len(condition_eval) 625 | zeros_count = length - ones_count 626 | win_rate = np.sum(condition_eval & target_values) / ones_count if ones_count > 0 else 0 627 | 628 | 629 | sum_returns = np.sum(data[condition_eval_df, idx_return]) 630 | 631 | sum_positive_returns = np.sum(data[condition_eval_df & (data[:, idx_return] > 0), idx_return]) 632 | sum_negative_returns = np.sum(data[condition_eval_df & (data[:, idx_return] < 0), idx_return]) 633 | 634 | profit_factor = 0 635 | if sum_negative_returns != 0: 636 | profit_factor = sum_positive_returns / -sum_negative_returns 637 | else: 638 | profit_factor = float('inf') 639 | 640 | optimal_pf = 0 641 | optimal_exposition = str(4) 642 | profit_factors = {} 643 | for ret_col in returns_columns: 644 | idx_return = df_columns.get_loc(ret_col) 645 | sum_positive_returns = np.sum(data[condition_eval_df & (data[:, idx_return] > 0), idx_return]) 646 | sum_negative_returns = -np.sum(data[condition_eval_df & (data[:, idx_return] < 0), idx_return]) 647 | profit_factor = sum_positive_returns / sum_negative_returns if sum_negative_returns != 0 else float('inf') 648 | match = re.search(r'Return_(\d+)', ret_col) 649 | number = 4 650 | if match: 651 | number = match.group(1) 652 | chunk_stats[f'pf_{str(number)}'] = profit_factor 653 | if(profit_factor != np.inf and profit_factor > optimal_pf): 654 | optimal_pf = profit_factor 655 | optimal_exposition = str(number) 656 | 657 | target_optimal = df[f'Target_{optimal_exposition}'].values 658 | target_values_optimal = df[f'Target_{optimal_exposition}'].apply(lambda x: 1 if x > 0 else 0).values 659 | correlation_optimal, _ = pointbiserialr(condition_eval, target_optimal) 660 | win_rate_optimal = np.sum(condition_eval & target_values_optimal) / ones_count if ones_count > 0 else 0 661 | 662 | 663 | chunk_results[rule] = condition_eval 664 | chunk_stats[rule] = { 665 | 'correlation': correlation, 666 | 'length': length, 667 | 'ones_count': ones_count, 668 | 'zeros_count': zeros_count, 669 | 'win_rate': round(win_rate * 100, 2), 670 | 'return': sum_returns, 671 | 'hash': generate_hash(rule), 672 | 'optimal_exposition': optimal_exposition, 673 | 'correlation_optimal': correlation_optimal, 674 | 'win_rate_optimal': round(win_rate_optimal * 100, 2), 675 | } 676 | chunk_stats[rule].update({f'pf_{i}': chunk_stats.pop(f'pf_{i}') for i in range(4, 31, 2)}) 677 | 678 | else: 679 | print(f"Rule '{rule}' could not be parsed.") 680 | except Exception as e: 681 | print(f"Error processing rule '{rule}': {e}") 682 | 683 | return chunk_results, chunk_stats 684 | 685 | 686 | def evaluate_rules_numpy(df, all_rules, target_column='Target', output_file='results2_h5.h5'): 687 | 
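    # Evaluates every candidate rule in parallel: the rule list is split into
    # one chunk per CPU core, each chunk goes through process_rule_chunk, the
    # per-rule boolean signal columns are saved to HDF5, and per-rule stats
    # (correlation, win rate, profit factor per horizon) come back as a
    # DataFrame deduplicated on identical correlation values.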
print('Starting process..', datetime.now().strftime('%d-%m-%Y %H:%M:%S'))
688 |     data = df.to_numpy()
689 |     target = df[target_column].values
690 |     target_values = df[target_column].apply(lambda x: 1 if x > 0 else 0).values
691 | 
692 | 
693 |     num_cores = joblib.cpu_count()
694 | 
695 |     # Split the rules into chunks, one per core
696 |     chunk_size = max(1, len(all_rules) // num_cores)  # guard against fewer rules than cores
697 |     rule_chunks = [all_rules[i:i + chunk_size] for i in range(0, len(all_rules), chunk_size)]
698 | 
699 |     returns_columns = [f'Return_{i}' for i in range(4, 31, 2)]
700 | 
701 |     # Use joblib to evaluate the rule chunks in parallel
702 |     results = Parallel(n_jobs=-1)(
703 |         delayed(process_rule_chunk)(df, data, df.columns, rule_chunk, target_values, target, returns_columns) for rule_chunk in rule_chunks
704 |     )
705 | 
706 |     all_results = {}
707 |     all_stats = {}
708 |     for chunk_results, chunk_stats in results:
709 |         all_results.update(chunk_results)
710 |         all_stats.update(chunk_stats)
711 | 
712 |     result_df = pd.DataFrame(all_results)
713 |     result_df['Target'] = df['Target'].copy()
714 |     result_df['Return'] = df['Return'].copy()
715 | 
716 |     returns_columns = []
717 |     for i in range(4, 31, 2):
718 |         returns_columns.append(f'Return_{i}')
719 | 
720 | 
721 |     for column_ in returns_columns:
722 |         result_df[column_] = df[column_].values
723 | 
724 | 
725 |     print('Ending process.. saving', datetime.now().strftime('%d-%m-%Y %H:%M:%S'), len(result_df))
726 |     result_df.to_hdf(output_file, key='result_df', mode='w')
727 |     sorted_stats = sorted(all_stats.items(), key=lambda item: item[1]['correlation'], reverse=True)
728 | 
729 |     groups = groupby(sorted_stats, key=lambda item: item[1]['correlation'])
730 |     unique_stats = [next(g) for _, g in groups]  # keep one representative per run of identical correlations (treated as duplicates)
731 | 
732 |     df = pd.DataFrame([item[1] for item in unique_stats])
733 |     df['condition'] = [item[0] for item in unique_stats]
734 |     df = df[['condition'] + [col for col in df.columns if col != 'condition']]
735 | 
736 |     print('Ended process',datetime.now().strftime('%d-%m-%Y %H:%M:%S'))
737 |     return df, unique_stats
738 | 
739 | 
740 | 
741 | # In[15]:
742 | 
743 | 
744 | start_time = time.time()
745 | df_rules, sorted_stats = evaluate_rules_numpy(df, all_rules)
746 | 
747 | print(f"The process took {time.time() - start_time} seconds.")
748 | 
749 | 
750 | # In[19]:
751 | 
752 | 
753 | len(df_rules)
754 | 
755 | 
756 | # In[16]:
757 | 
758 | 
759 | df_rules
760 | 
761 | 
762 | # In[17]:
763 | 
764 | 
765 | sorted_stats
766 | 
767 | 
768 | # In[18]:
769 | 
770 | 
771 | print(f'Total backtests executed: {len(df_rules) * 14}')  # 14 exposure horizons: Return_4 .. Return_30
772 | 
773 | 
774 | # In[20]:
775 | 
776 | 
777 | df_sorted = df_rules.sort_values(by='pf_10', ascending=False)
778 | 
779 | 
780 | # In[21]:
781 | 
782 | 
783 | df_sorted
784 | 
785 | 
786 | # In[ ]:
787 | 
788 | 
789 | 
790 | 
791 | 
--------------------------------------------------------------------------------
/rule_extractor_robotrader.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "id": "7fe2d110",
7 |    "metadata": {},
8 |    "outputs": [],
9 |    "source": [
10 |     "# Librerías de manejo de datos\n",
11 |     "import pandas as pd\n",
12 |     "import numpy as np\n",
13 |     "\n",
14 |     "# Configuración de pandas para mejor visualización\n",
15 |     "pd.set_option('display.max_columns', 100)\n",
16 |     "pd.set_option('display.float_format', lambda x: '%.7f' % x)\n",
17 |     "\n",
18 |     "# Librerías de análisis técnico y estadística\n",
19 |     "import talib as ta\n",
20 |     "from scipy.stats import
pointbiserialr\n", 21 | "from sklearn.feature_selection import mutual_info_regression\n", 22 | "\n", 23 | "# Librerías de paralelización y optimización\n", 24 | "import joblib\n", 25 | "from joblib import Parallel, delayed, parallel_backend\n", 26 | "import cupy as cp\n", 27 | "\n", 28 | "# Manejo de fechas y tiempo\n", 29 | "import time\n", 30 | "from datetime import datetime\n", 31 | "\n", 32 | "# Librerías de bases de datos y almacenamiento\n", 33 | "import os\n", 34 | "import pickle\n", 35 | "import hashlib\n", 36 | "\n", 37 | "# Utilidades varias\n", 38 | "import uuid\n", 39 | "import itertools\n", 40 | "import re\n", 41 | "import gc\n", 42 | "from tqdm import tqdm\n", 43 | "from itertools import groupby" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "e35f3672", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "\n", 54 | "\n", 55 | "def transform_df(csv_name, exposicion_dias=3, threshold=25, date_column='DateTime', short=False):\n", 56 | " df = pd.read_csv(csv_name+'.csv')\n", 57 | "\n", 58 | " def rsi_function(i):\n", 59 | " rsi = ta.RSI(df['Close'], timeperiod=i)\n", 60 | " return pd.DataFrame(rsi, columns=[f'rsi_{i}'])\n", 61 | "\n", 62 | " rsi_dfs = pd.concat([rsi_function(i) for i in range(2, 51, 2)], axis=1)\n", 63 | " df = pd.concat([df, rsi_dfs], axis=1)\n", 64 | "\n", 65 | " def adx_function(i):\n", 66 | " adx = ta.ADX(df['High'], df['Low'], df['Close'], timeperiod=i)\n", 67 | " return pd.DataFrame(adx, columns=[f'adx_{i}'])\n", 68 | "\n", 69 | " adx_dfs = pd.concat([adx_function(i) for i in range(2, 51, 2)], axis=1)\n", 70 | " df = pd.concat([df, adx_dfs], axis=1)\n", 71 | "\n", 72 | " def plus_di_function(i):\n", 73 | " plus_di = ta.PLUS_DI(df['High'], df['Low'], df['Close'], timeperiod=i)\n", 74 | " return pd.DataFrame(plus_di, columns=[f'plus_di_{i}'])\n", 75 | "\n", 76 | " plus_di_dfs = pd.concat([plus_di_function(i) for i in range(2, 51, 2)], axis=1)\n", 77 | " df = pd.concat([df, plus_di_dfs], axis=1)\n", 78 | "\n", 79 | " def minus_di_function(i):\n", 80 | " minus_di = ta.MINUS_DI(df['High'], df['Low'], df['Close'], timeperiod=i)\n", 81 | " return pd.DataFrame(minus_di, columns=[f'minus_di_{i}'])\n", 82 | "\n", 83 | " minus_di_dfs = pd.concat([minus_di_function(i) for i in range(2, 51, 2)], axis=1)\n", 84 | " df = pd.concat([df, minus_di_dfs], axis=1)\n", 85 | "\n", 86 | " def willr_function(i):\n", 87 | " willr = ta.WILLR(df['High'], df['Low'], df['Close'], timeperiod=i)\n", 88 | " return pd.DataFrame(willr, columns=[f'willr_{i}'])\n", 89 | "\n", 90 | " willr_dfs = pd.concat([willr_function(i) for i in range(2, 51, 2)], axis=1)\n", 91 | " df = pd.concat([df, willr_dfs], axis=1)\n", 92 | "\n", 93 | " def ma_function(i):\n", 94 | " ma = ta.MA(df['Close'], timeperiod=i, matype=0)\n", 95 | " return pd.DataFrame(ma, columns=[f'sma_{i}'])\n", 96 | "\n", 97 | " ma_dfs = pd.concat([ma_function(i) for i in range(2, 301, 2)], axis=1)\n", 98 | " df = pd.concat([df, ma_dfs], axis=1)\n", 99 | "\n", 100 | " def ema_function(i):\n", 101 | " ma = ta.EMA(df['Close'], timeperiod=i)\n", 102 | " return pd.DataFrame(ma, columns=[f'mema_{i}'])\n", 103 | "\n", 104 | " ema_dfs = pd.concat([ema_function(i) for i in range(2, 301, 2)], axis=1)\n", 105 | " df = pd.concat([df, ema_dfs], axis=1)\n", 106 | "\n", 107 | " def atr_function(i):\n", 108 | " atr = ta.ATR(df['High'], df['Low'], df['Close'], timeperiod=i)\n", 109 | " return pd.DataFrame(atr, columns=[f'atr_{i}'])\n", 110 | "\n", 111 | " atr_dfs = pd.concat([atr_function(i) for i in 
range(2, 51, 2)], axis=1)\n", 112 | " df = pd.concat([df, atr_dfs], axis=1)\n", 113 | "\n", 114 | " def calculate_ibs(high, low, close):\n", 115 | " ibs = (close - low) / (high - low)\n", 116 | " ibs = np.round(ibs, 2)\n", 117 | " return pd.DataFrame(ibs, columns=['ibs_'])\n", 118 | "\n", 119 | "\n", 120 | " def stdev_function(i):\n", 121 | " stdev = ta.STDDEV(df['Close'], timeperiod=i, nbdev=1)\n", 122 | " return pd.DataFrame(stdev, columns=[f'stdev_{i}'])\n", 123 | " \n", 124 | " stdev_dfs = pd.concat([stdev_function(i) for i in range(2, 51, 2)], axis=1)\n", 125 | " df = pd.concat([df, stdev_dfs], axis=1)\n", 126 | "\n", 127 | " def bband_function(i, dev=2):\n", 128 | " upperband, middleband, lowerband = ta.BBANDS(df['Close'], timeperiod=i, nbdevup=dev, nbdevdn=dev, matype=0)\n", 129 | " return pd.DataFrame({f'bb_upper_{dev}_{i}': upperband, f'bb_middle_{dev}_{i}': middleband, f'bb_lower_{dev}_{i}': lowerband})\n", 130 | " \n", 131 | " def bband_function(i, dev=2):\n", 132 | " upperband, middleband, lowerband = ta.BBANDS(df['Close'], timeperiod=i, nbdevup=dev, nbdevdn=dev, matype=0)\n", 133 | " return pd.DataFrame({f'bb_upper_{dev}_{i}': upperband, f'bb_lower_{dev}_{i}': lowerband})\n", 134 | "\t\n", 135 | " for dev in range(2,6):\n", 136 | " bband_dfs = pd.concat([bband_function(i, dev) for i in range(5, 31, 2)], axis=1)\n", 137 | " df = pd.concat([df, bband_dfs], axis=1)\n", 138 | " \n", 139 | "\n", 140 | " def macd_function(fp, slp, sp):\n", 141 | " macd, macdsignal, macdhist = ta.MACD(df['Close'], fastperiod=fp, slowperiod=slp, signalperiod=sp)\n", 142 | " return pd.DataFrame({f'macd_{fp}': macd, f'macdsig_{slp}': macdsignal, f'macdh_{sp}': macdhist})\n", 143 | "\n", 144 | " macd_dfs = []\n", 145 | " fastperiod_values = [7, 12, 26, 52]\n", 146 | " slowperiod_values = [13, 26, 52]\n", 147 | " signalperiod_values = [3, 6, 9]\n", 148 | "\n", 149 | "\n", 150 | " def mom_function(i):\n", 151 | " momentum = ta.MOM(df['Close'], timeperiod=i)\n", 152 | " return pd.DataFrame(momentum, columns=[f'mom_{i}'])\n", 153 | "\n", 154 | " momentum_dfs = pd.concat([mom_function(i) for i in range(2, 31, 2)], axis=1)\n", 155 | " df = pd.concat([df, momentum_dfs], axis=1)\n", 156 | "\n", 157 | " def aaron_up_function(i):\n", 158 | " aroon_up = ta.AROONOSC(df['High'], df['Low'], timeperiod=i)\n", 159 | " return pd.DataFrame(aroon_up, columns=[f'aaro_{i}'])\n", 160 | "\n", 161 | " aaronu_dfs = pd.concat([aaron_up_function(i) for i in range(2, 51, 2)], axis=1)\n", 162 | " df = pd.concat([df, aaronu_dfs], axis=1)\n", 163 | "\t\n", 164 | " def aaron_up_function2(i):\n", 165 | " _, aroon_up = ta.AROON(df['High'], df['Low'], timeperiod=i)\n", 166 | " return pd.DataFrame(aroon_up, columns=[f'aarou_{i}'])\n", 167 | "\t\n", 168 | " aaronu_dfs = pd.concat([aaron_up_function2(i) for i in range(2, 51, 2)], axis=1)\n", 169 | " df = pd.concat([df, aaronu_dfs], axis=1)\n", 170 | "\t\n", 171 | " def aaron_dw_function2(i):\n", 172 | " aroon_down, _ = ta.AROON(df['High'], df['Low'], timeperiod=i)\n", 173 | " return pd.DataFrame(aroon_down, columns=[f'aarod_{i}'])\n", 174 | "\t\n", 175 | " aaronu_dfs = pd.concat([aaron_dw_function2(i) for i in range(2, 51, 2)], axis=1)\n", 176 | " df = pd.concat([df, aaronu_dfs], axis=1)\n", 177 | "\n", 178 | " def roc_function(i):\n", 179 | " roc_t = ta.ROC(df['Close'], timeperiod=i)\n", 180 | " return pd.DataFrame(roc_t, columns=[f'roc_{i}'])\n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | " duplicates = df.columns[df.columns.duplicated()]\n", 186 | " df = df.loc[:, 
~df.columns.duplicated()]\n", 187 | "\n", 188 | " def shift_column(column, i):\n", 189 | " shifted = df[column].shift(i)\n", 190 | " if('ibs_' in column):\n", 191 | " column = 'ibs'\n", 192 | " return shifted.rename(f'{column}_sft_{i}')\n", 193 | "\n", 194 | " columns = df.columns\n", 195 | " lista_shift = ['rsi', 'adx', 'plus_di', 'minus_di', 'willr', 'bb', 'atr', 'stdev', 'Close', 'High', 'Low', 'aaro', 'mom']\n", 196 | " indicator_columns = {col for col in columns if any(name in col for name in lista_shift)}\n", 197 | "\n", 198 | " shifted_columns = []\n", 199 | " shift_value = 3\n", 200 | "\n", 201 | " for column in indicator_columns:\n", 202 | " for i in range(1, shift_value + 1):\n", 203 | " shifted_series = shift_column(column, i)\n", 204 | " shifted_columns.append(shifted_series)\n", 205 | "\n", 206 | " df = pd.concat([df] + shifted_columns, axis=1)\n", 207 | "\n", 208 | "\n", 209 | " pips = 0\n", 210 | " if \"JPY\" in csv_name:\n", 211 | " pips = 100\n", 212 | " else:\n", 213 | " pips = 10000\n", 214 | "\n", 215 | "\n", 216 | " for i in range (2,31, 2):\n", 217 | " ret = []\n", 218 | " new_cols = []\n", 219 | " if(short == True):\n", 220 | " ret = ((df[\"Close\"].shift(-1 * i) - df[\"Close\"]) * pips) + 2\n", 221 | " new_cols = pd.DataFrame(np.array(ret) * -1, columns=[f\"Return_{i}\"])\n", 222 | " else:\n", 223 | " ret = ((df[\"Close\"].shift(-1 * i) - df[\"Close\"]) * pips) - 2\n", 224 | " new_cols = pd.DataFrame(np.array(ret), columns=[f\"Return_{i}\"])\n", 225 | "\t\t\t\n", 226 | " df = pd.concat([df, new_cols], axis=1)\n", 227 | "\n", 228 | "\n", 229 | " if(short == True):\n", 230 | " ret = ((df[\"Close\"].shift(-1 * exposicion_dias) - df[\"Close\"]) * pips) + 2\n", 231 | " else:\n", 232 | " ret = ((df[\"Close\"].shift(-1 * exposicion_dias) - df[\"Close\"]) * pips) - 2\n", 233 | "\t\t\n", 234 | " new_cols = pd.DataFrame(np.array(ret), columns=[\"Return\"])\n", 235 | "\t\n", 236 | " if(short == True):\n", 237 | " new_cols[\"Return\"] = new_cols[\"Return\"] * -1\n", 238 | "\t\t\n", 239 | " df = pd.concat([df, new_cols], axis=1)\n", 240 | " \n", 241 | " target = (df[\"Return\"] >= threshold).astype(int)\n", 242 | "\t\n", 243 | "\n", 244 | " target = df[f'Return_{exposicion_dias}'].copy()\n", 245 | " target = (df[f'Return_{exposicion_dias}'] >= threshold).astype(int)\n", 246 | " df = pd.concat([df, target.rename(\"Target\")], axis=1) \n", 247 | "\n", 248 | " for i in range (4,31, 2):\n", 249 | " target = (df[f'Return_{i}'] >= threshold).astype(int)\n", 250 | " df = pd.concat([df, target.rename(f'Target_{i}')], axis=1) \n", 251 | "\t\n", 252 | " \n", 253 | "\n", 254 | " df = df.copy()\n", 255 | " df[date_column] = pd.to_datetime(df[date_column], dayfirst=True)\n", 256 | "\n", 257 | " day_of_month = df[date_column].apply(lambda x: x.day)\n", 258 | " month = df[date_column].apply(lambda x: x.month)\n", 259 | " day_of_week = df[date_column].apply(lambda x: x.weekday())\n", 260 | " year = df[date_column].apply(lambda x: x.year)\n", 261 | "\n", 262 | " df[date_column] = df[date_column].dt.strftime('%d/%m/%Y %H:%M')\n", 263 | "\n", 264 | " new_columns = pd.concat([day_of_month.rename('day_of_month'), month.rename('month'), day_of_week.rename('day_of_week'), year.rename('year')], axis=1)\n", 265 | " df = pd.concat([df, new_columns], axis=1)\n", 266 | "\n", 267 | " return df\n", 268 | "\n", 269 | "def split_data_validation(df, year_max_cut = '2022', year_min_cur= '2008'):\n", 270 | " data = df.query('year >= ' + year_max_cut).copy()\n", 271 | " df = df.query(year_min_cur+' < year <= 
2022')\n", 272 | " df = df.reset_index(drop=True)\n", 273 | " return df, data\n", 274 | "\n", 275 | "def map_creator(df):\n", 276 | " inicio = time.time()\n", 277 | " columns = df.columns\n", 278 | " no_sft_columns = [col for col in columns if 'sft' not in col]\n", 279 | " column_map = create_column_map(df, columns, no_sft_columns)\n", 280 | " fin = time.time()\n", 281 | " duracion = fin - inicio\n", 282 | " print(\"La duración del proceso fue de\", duracion, \"segundos\")\n", 283 | " return column_map\n", 284 | "\n", 285 | "def create_column_map(df, columns, no_sft_columns):\n", 286 | " column_map = {}\n", 287 | " rsi_columns = {col for col in columns if 'rsi_' in col}\n", 288 | " adx_columns = {col for col in columns if 'adx' in col}\n", 289 | " plus_di_columns = {col for col in columns if 'plus_di' in col}\n", 290 | " minus_di_columns = {col for col in columns if 'minus_di' in col}\n", 291 | " will_columns = {col for col in columns if 'willr' in col}\n", 292 | " sma_columns = {col for col in columns if 'sma' in col}\n", 293 | " mema_columns = {col for col in columns if 'mema' in col}\n", 294 | " ibs_columns = {col for col in columns if 'ibs_' in col}\n", 295 | " atr_columns = {col for col in columns if 'atr' in col}\n", 296 | " bbup_columns = {col for col in columns if 'bb_upper' in col}\n", 297 | " bbmid_columns = {col for col in columns if 'bb_middle' in col}\n", 298 | " bblow_columns = {col for col in columns if 'bb_lower' in col}\n", 299 | " macd_columns = {col for col in columns if 'macd' in col}\n", 300 | " macdsig_columns = {col for col in columns if 'macdsig' in col}\n", 301 | " macdh_columns = {col for col in columns if 'macdh' in col}\n", 302 | " ibsma_columns = {col for col in columns if 'ibma' in col}\n", 303 | " hh_columns = {col for col in columns if 'hh' in col}\n", 304 | " dayw_columns = {col for col in columns if 'day_of_week' in col}\n", 305 | " daym_columns = {col for col in columns if 'day_of_month' in col}\n", 306 | " ll_columns = {col for col in columns if 'll' in col}\n", 307 | " mom_columns = {col for col in columns if 'mom' in col}\n", 308 | " aaro_columns = {col for col in columns if 'aaro_' in col}\n", 309 | " roc_columns = {col for col in columns if 'roc' in col}\n", 310 | " stoch_columns = {col for col in columns if 'stoch' in col}\n", 311 | " stochk_columns = {col for col in columns if 'stochk' in col}\n", 312 | " stochd_columns = {col for col in columns if 'stochd' in col}\n", 313 | " stdev_columns = {col for col in columns if 'stdev' in col}\n", 314 | " aarod_columns = {col for col in columns if 'aarod_' in col}\n", 315 | " aarou_columns = {col for col in columns if 'aarou_' in col}\n", 316 | " \n", 317 | " for column in no_sft_columns:\n", 318 | " if 'rsi' in column:\n", 319 | " filtered_columns = rsi_columns - {column}\n", 320 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col and 'ibs' not in col}\n", 321 | " column_map[column] = [list(range(0, 101)), list(filtered_columns)]\n", 322 | "\n", 323 | " elif 'adx' in column:\n", 324 | " filtered_columns = adx_columns - {column}\n", 325 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 326 | " column_map[column] = [list(range(0, 101)), list(filtered_columns)]\n", 327 | " \n", 328 | " elif 'plus_di' in column:\n", 329 | " filtered_columns = plus_di_columns - {column}\n", 330 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 331 | " column_map[column] = [list(range(0, 101)), list(filtered_columns)]\n", 332 
| " \n", 333 | " elif 'minus_di' in column:\n", 334 | " filtered_columns = minus_di_columns - {column}\n", 335 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 336 | " column_map[column] = [list(range(0, 101)), list(filtered_columns)]\n", 337 | " \n", 338 | " elif 'willr' in column:\n", 339 | " filtered_columns = will_columns - {column}\n", 340 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 341 | " column_map[column] = [list(x for x in range(0, -101, -1)), list(filtered_columns)]\n", 342 | " \n", 343 | " elif 'sma' in column:\n", 344 | " filtered_columns = sma_columns - {column}\n", 345 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 346 | " column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)] \n", 347 | "\n", 348 | " elif 'mema' in column:\n", 349 | " filtered_columns = mema_columns - {column}\n", 350 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 351 | " column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)]\t\t\t\n", 352 | " \n", 353 | " elif 'ibs_' in column:\n", 354 | " filtered_columns = ibs_columns - {column}\n", 355 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 356 | " column_map[column] = [list([i / 100 for i in range(0, 101)]), list(filtered_columns)]\n", 357 | " \n", 358 | " elif 'atr' in column:\n", 359 | " filtered_columns = atr_columns - {column}\n", 360 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 361 | " column_map[column] = [list(filtered_columns), list(filtered_columns)]\n", 362 | " \n", 363 | " elif 'bb_upper' in column:\n", 364 | " filtered_columns = bbup_columns - {column}\n", 365 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 366 | " column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)]\n", 367 | " \n", 368 | " elif 'bb_middle' in column:\n", 369 | " filtered_columns = bbmid_columns - {column}\n", 370 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 371 | " column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)]\n", 372 | " \n", 373 | " elif 'bb_lower' in column:\n", 374 | " filtered_columns = bblow_columns - {column}\n", 375 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 376 | " column_map[column] = [list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3']), list(filtered_columns)]\n", 377 | " \n", 378 | " elif 'macd' in column:\n", 379 | " filtered_columns = macd_columns - {column}\n", 380 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 381 | " column_map[column] = [list(filtered_columns), list(macdsig_columns)]\n", 382 | " \n", 383 | " elif 'macdsig' in 
column:\n", 384 | " filtered_columns = macdsig_columns - {column}\n", 385 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 386 | " column_map[column] = [list(filtered_columns), list(macd_columns)]\n", 387 | " \n", 388 | " elif 'macdh' in column:\n", 389 | " filtered_columns = macdh_columns - {column}\n", 390 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 391 | " column_map[column] = [list(filtered_columns), list(filtered_columns)]\n", 392 | " \n", 393 | " elif 'ibma' in column:\n", 394 | " filtered_columns = ibsma_columns - {column}\n", 395 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 396 | " column_map[column] = [list(filtered_columns), list(ibs_columns)]\n", 397 | " \n", 398 | " elif 'hh' in column:\n", 399 | " filtered_columns = hh_columns - {column}\n", 400 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 401 | " column_map[column] = [list(filtered_columns), list(['Open', 'High', 'Low', 'Close', 'Close_sft_1', 'Close_sft_2', 'Close_sft_3', 'Low_sft_1', 'Low_sft_2', 'Low_sft_3', 'High_sft_1', 'High_sft_2', 'High_sft_3'])]\n", 402 | " \n", 403 | " elif 'day_of_week' in column:\n", 404 | " filtered_columns = dayw_columns - {column}\n", 405 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 406 | " column_map[column] = [list([i for i in range(0, 7)]), list([i for i in range(0, 7)])]\n", 407 | " \n", 408 | " elif 'day_of_week' in column:\n", 409 | " filtered_columns = dayw_columns - {column}\n", 410 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 411 | " column_map[column] = [list([i for i in range(0, 7)]), list([i for i in range(0, 7)])]\n", 412 | " \n", 413 | " elif 'day_of_month' in column:\n", 414 | " filtered_columns = daym_columns - {column}\n", 415 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 416 | " column_map[column] = [list([i for i in range(1, 32)]), list([i for i in range(1, 32)])]\n", 417 | " \n", 418 | " \n", 419 | " elif 'mom' in column:\n", 420 | " filtered_columns = mom_columns - {column}\n", 421 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 422 | " column_map[column] = [list(filtered_columns), list(filtered_columns)]\n", 423 | " \n", 424 | " elif 'aaro_' in column:\n", 425 | " filtered_columns = aaro_columns - {column}\n", 426 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 427 | " column_map[column] = [list(x for x in range(-100, 101, 1)), list(filtered_columns)]\n", 428 | "\t\t\t\n", 429 | " elif 'aarod_' in column:\n", 430 | " filtered_columns = aarod_columns - {column}\n", 431 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 432 | " column_map[column] = [list(x for x in range(0, 101)), list(filtered_columns)]\n", 433 | "\t\t\t\n", 434 | " elif 'aarou_' in column:\n", 435 | " filtered_columns = aarou_columns - {column}\n", 436 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 437 | " column_map[column] = [list(x for x in range(0, 101)), list(filtered_columns)]\n", 438 | " \n", 439 | " elif 'roc' in column:\n", 440 | " filtered_columns = roc_columns - {column}\n", 441 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 442 | " column_map[column] = [list(filtered_columns), list(filtered_columns)]\n", 443 | " \n", 
444 | " elif 'stochd' in column:\n", 445 | " filtered_columns = stochd_columns - {column}\n", 446 | " comun_columns = stoch_columns - {column}\n", 447 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 448 | " filtered_comun_columns = {col for col in comun_columns if 'condition' not in col}\n", 449 | " column_map[column] = [list(filtered_columns), list(filtered_comun_columns)]\n", 450 | " \n", 451 | " elif 'stochk' in column:\n", 452 | " filtered_columns = stochk_columns - {column}\n", 453 | " comun_columns = stoch_columns - {column}\n", 454 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 455 | " filtered_comun_columns = {col for col in comun_columns if 'condition' not in col}\n", 456 | " column_map[column] = [list(filtered_columns), list(filtered_comun_columns)]\n", 457 | " \n", 458 | " elif 'stdev' in column:\n", 459 | " filtered_columns = stdev_columns - {column}\n", 460 | " filtered_columns = {col for col in filtered_columns if 'condition' not in col}\n", 461 | " column_map[column] = [list(filtered_columns), list(filtered_columns)]\n", 462 | " \n", 463 | "\n", 464 | " return column_map" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "id": "d3ac9b4d", 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "def crear_directorio(nombre_carpeta):\n", 475 | " try:\n", 476 | " os.makedirs(nombre_carpeta)\n", 477 | " print(f\"Directorio '{nombre_carpeta}' creado con éxito.\")\n", 478 | " except FileExistsError:\n", 479 | " print(f\"El directorio '{nombre_carpeta}' ya existe.\")\n", 480 | " except Exception as e:\n", 481 | " print(f\"Error al crear el directorio '{nombre_carpeta}': {e}\")" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "1ad4de9c", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "\n", 492 | "date_column = 'DateTime'\n", 493 | "h12 = 'GBPNZ_H12'\n" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "id": "8fd8460a", 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "exposicion_dias=4\n", 504 | "threshold=65\n", 505 | "df = transform_df(h12, exposicion_dias, threshold, short=True)\n", 506 | "dir_results_name = f'results_{exposicion_dias}_{threshold}'\n", 507 | "crear_directorio(dir_results_name)\n", 508 | "full_df = df.copy()\n", 509 | "df = df.query('2016 < year < 2019')\n", 510 | "\n", 511 | "df = df.reset_index(drop=True)\n", 512 | "column_map = map_creator(df)\n", 513 | "\n" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "id": "79e25842", 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "df.head()" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "id": "50a7b989", 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "len(df.columns)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "id": "5a50e9aa", 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "return_columns = [col for col in df.columns if 'Return_' in col]\n", 544 | "df[return_columns+['Return']].tail(25)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "id": "1e4a15e9", 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "print(len(df[df['day_of_week'] == 6]))\n", 555 | "print(len(df[df['day_of_week'] == 7]))" 556 | ] 557 | }, 558 | { 559 | 
"cell_type": "code", 560 | "execution_count": null, 561 | "id": "e9044ec0", 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "def generate_all_rules(column_map, df):\n", 566 | " all_rules = [] \n", 567 | " \n", 568 | " for column, (possible_values, related_columns) in column_map.items():\n", 569 | " if 'day' in column:\n", 570 | " operators = ['>', '<', '==', '>=', '<=']\n", 571 | " else:\n", 572 | " operators = ['>=', '<=']\n", 573 | "\n", 574 | " for value in possible_values:\n", 575 | " for operator in operators:\n", 576 | " condition = f\"{column} {operator} {value}\"\n", 577 | " all_rules.append(condition)\n", 578 | "\n", 579 | " for value in related_columns:\n", 580 | " for operator in operators:\n", 581 | " related_condition = f\"{column} {operator} {value}\"\n", 582 | " all_rules.append(related_condition)\n", 583 | " \n", 584 | " return all_rules\n", 585 | "\n", 586 | "\n" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "id": "2f2c3368", 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "all_rules = generate_all_rules(column_map, df)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "id": "920e466c", 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "all_rules" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "id": "182c80a1", 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "len(all_rules)" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "id": "4809032e", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "\n", 627 | "\n", 628 | "def generate_hash(s):\n", 629 | " try:\n", 630 | " first_part = re.findall(r'^\\D+', s.split(' ')[0])[0]\n", 631 | " comparison_operator = re.findall(r'[<=>=]+', s)[0]\n", 632 | " try:\n", 633 | " second_part = re.findall(r'^\\D+', s.split(' ')[2].split('_')[0])[0]\n", 634 | " except IndexError:\n", 635 | " second_part = 'num'\n", 636 | " combined = first_part + comparison_operator + second_part\n", 637 | " return hashlib.sha256(combined.encode()).hexdigest()\n", 638 | " except Exception as e:\n", 639 | " return 12345678910\n", 640 | "\n", 641 | "\n", 642 | "def process_rule_chunk(df, data, df_columns, rule_chunk, target_values, target, returns_columns):\n", 643 | " chunk_results = {}\n", 644 | " chunk_stats = {}\n", 645 | " for rule in rule_chunk:\n", 646 | " try:\n", 647 | " parts = rule.split()\n", 648 | " if len(parts) == 3:\n", 649 | " column1, operator, column2_or_value = parts\n", 650 | " idx1 = df_columns.get_loc(column1)\n", 651 | " idx_return = df_columns.get_loc('Return')\n", 652 | " \n", 653 | " try:\n", 654 | " value = float(column2_or_value)\n", 655 | " idx2 = None\n", 656 | " except ValueError:\n", 657 | " idx2 = df_columns.get_loc(column2_or_value)\n", 658 | " \n", 659 | " condition_eval_df = None\n", 660 | " if operator == '>=':\n", 661 | " if idx2 is None:\n", 662 | " condition_eval_df = (data[:, idx1] >= value)\n", 663 | " else:\n", 664 | " condition_eval_df = (data[:, idx1] >= data[:, idx2])\n", 665 | " elif operator == '<=':\n", 666 | " if idx2 is None:\n", 667 | " condition_eval_df = (data[:, idx1] <= value)\n", 668 | " else:\n", 669 | " condition_eval_df = (data[:, idx1] <= data[:, idx2])\n", 670 | " elif operator == '==':\n", 671 | " if idx2 is None:\n", 672 | " condition_eval_df = (data[:, idx1] == value)\n", 673 | " else:\n", 674 | " condition_eval_df = (data[:, idx1] == data[:, 
idx2])\n", 675 | " \n", 676 | " condition_eval = condition_eval_df.astype(np.int8)\n", 677 | " ones_count = np.sum(condition_eval)\n", 678 | " if ones_count < 100:\n", 679 | " continue\n", 680 | " \n", 681 | " correlation, _ = pointbiserialr(condition_eval, target)\n", 682 | " if np.isnan(correlation):\n", 683 | " continue\n", 684 | "\n", 685 | " length = len(condition_eval)\n", 686 | " zeros_count = length - ones_count\n", 687 | " win_rate = np.sum(condition_eval & target_values) / ones_count if ones_count > 0 else 0\n", 688 | "\n", 689 | " \n", 690 | " sum_returns = np.sum(data[condition_eval_df, idx_return])\n", 691 | " \n", 692 | " sum_positive_returns = np.sum(data[condition_eval_df & (data[:, idx_return] > 0), idx_return])\n", 693 | " sum_negative_returns = np.sum(data[condition_eval_df & (data[:, idx_return] < 0), idx_return])\n", 694 | " \n", 695 | " profit_factor = 0\n", 696 | " if sum_negative_returns != 0:\n", 697 | " profit_factor = sum_positive_returns / -sum_negative_returns\n", 698 | " else:\n", 699 | " profit_factor = float('inf')\n", 700 | " \n", 701 | " optimal_pf = 0\n", 702 | " optimal_exposition = str(4)\n", 703 | " profit_factors = {}\n", 704 | " for ret_col in returns_columns:\n", 705 | " idx_return = df_columns.get_loc(ret_col)\n", 706 | " sum_positive_returns = np.sum(data[condition_eval_df & (data[:, idx_return] > 0), idx_return])\n", 707 | " sum_negative_returns = -np.sum(data[condition_eval_df & (data[:, idx_return] < 0), idx_return])\n", 708 | " profit_factor = sum_positive_returns / sum_negative_returns if sum_negative_returns != 0 else float('inf')\n", 709 | " match = re.search(r'Return_(\\d+)', ret_col)\n", 710 | " number = 4\n", 711 | " if match:\n", 712 | " number = match.group(1)\n", 713 | " chunk_stats[f'pf_{str(number)}'] = profit_factor\n", 714 | " if(profit_factor != np.inf and profit_factor > optimal_pf):\n", 715 | " optimal_pf = profit_factor\n", 716 | " optimal_exposition = str(number)\n", 717 | " \n", 718 | " target_optimal = df[f'Target_{optimal_exposition}'].values\n", 719 | " target_values_optimal = df[f'Target_{optimal_exposition}'].apply(lambda x: 1 if x > 0 else 0).values\n", 720 | " correlation_optimal, _ = pointbiserialr(condition_eval, target_optimal)\n", 721 | " win_rate_optimal = np.sum(condition_eval & target_values_optimal) / ones_count if ones_count > 0 else 0\n", 722 | " \n", 723 | " \n", 724 | " chunk_results[rule] = condition_eval\n", 725 | " chunk_stats[rule] = {\n", 726 | " 'correlation': correlation,\n", 727 | " 'length': length,\n", 728 | " 'ones_count': ones_count,\n", 729 | " 'zeros_count': zeros_count,\n", 730 | " 'win_rate': round(win_rate * 100, 2),\n", 731 | " 'return': sum_returns,\n", 732 | " 'hash': generate_hash(rule),\n", 733 | " 'optimal_exposition': optimal_exposition,\n", 734 | " 'correlation_optimal': correlation_optimal,\n", 735 | " 'win_rate_optimal': round(win_rate_optimal * 100, 2),\n", 736 | " }\n", 737 | " chunk_stats[rule].update({f'pf_{i}': chunk_stats.pop(f'pf_{i}') for i in range(4, 31, 2)})\n", 738 | " \n", 739 | " else:\n", 740 | " print(f\"Rule '{rule}' could not be parsed.\")\n", 741 | " except Exception as e:\n", 742 | " print(f\"Error processing rule '{rule}': {e}\")\n", 743 | "\n", 744 | " return chunk_results, chunk_stats\n", 745 | "\n", 746 | "\n", 747 | "def evaluate_rules_numpy(df, all_rules, target_column='Target', output_file='results2_h5.h5'):\n", 748 | " print('Starting process..', datetime.now().strftime('%d-%m-%Y %H:%M:%S'))\n", 749 | " data = df.to_numpy()\n", 750 | " target = 
802 |   {
803 |    "cell_type": "code",
804 |    "execution_count": null,
805 |    "id": "50f863b9",
806 |    "metadata": {},
807 |    "outputs": [],
808 |    "source": [
809 |     "start_time = time.time()\n",
810 |     "df_rules, sorted_stats = evaluate_rules_numpy(df, all_rules)\n",
811 |     "\n",
812 |     "print(f\"The process took {time.time() - start_time} seconds.\")\n"
813 |    ]
814 |   },
815 |   {
816 |    "cell_type": "code",
817 |    "execution_count": null,
818 |    "id": "bb3e4f9f",
819 |    "metadata": {},
820 |    "outputs": [],
821 |    "source": [
822 |     "len(df_rules)"
823 |    ]
824 |   },
825 |   {
826 |    "cell_type": "code",
827 |    "execution_count": null,
828 |    "id": "d627742c",
829 |    "metadata": {},
830 |    "outputs": [],
831 |    "source": [
832 |     "df_rules"
833 |    ]
834 |   },
835 |   {
836 |    "cell_type": "code",
837 |    "execution_count": null,
838 |    "id": "9d01fada",
839 |    "metadata": {},
840 |    "outputs": [],
841 |    "source": [
842 |     "sorted_stats"
843 |    ]
844 |   },
845 |   {
846 |    "cell_type": "code",
847 |    "execution_count": null,
848 |    "id": "89792e7e",
849 |    "metadata": {},
850 |    "outputs": [],
851 |    "source": [
852 |     "print(f'Total backtests executed: {len(df_rules) * 14}')  # 14 holding periods per rule (Return_4 .. Return_30)"
853 |    ]
854 |   },
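  {
   "cell_type": "markdown",
   "id": "a1f0c007",
   "metadata": {},
   "source": [
    "Side note (illustrative sketch): before ranking by a single profit factor, it can help to filter out thin or weak rules using the columns `evaluate_rules_numpy` already produces (`ones_count`, `win_rate`, `pf_10`, ...). The thresholds below are arbitrary examples, not recommendations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1f0c008",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch only: arbitrary example thresholds.\n",
    "candidates = df_rules[(df_rules['ones_count'] >= 200) & (df_rules['win_rate'] >= 55)]\n",
    "candidates.sort_values(by='pf_10', ascending=False).head(10)"
   ]
  },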
855 |   {
856 |    "cell_type": "code",
857 |    "execution_count": null,
858 |    "id": "bd1d4859",
859 |    "metadata": {},
860 |    "outputs": [],
861 |    "source": [
862 |     "df_sorted = df_rules.sort_values(by='pf_10', ascending=False)"
863 |    ]
864 |   },
865 |   {
866 |    "cell_type": "code",
867 |    "execution_count": null,
868 |    "id": "6b346c23",
869 |    "metadata": {},
870 |    "outputs": [],
871 |    "source": [
872 |     "df_sorted"
873 |    ]
874 |   },
875 |   {
876 |    "cell_type": "code",
877 |    "execution_count": null,
878 |    "id": "1f7e7a8e",
879 |    "metadata": {},
880 |    "outputs": [],
881 |    "source": []
882 |   }
883 |  ],
884 |  "metadata": {
885 |   "kernelspec": {
886 |    "display_name": "Python 3 (ipykernel)",
887 |    "language": "python",
888 |    "name": "python3"
889 |   },
890 |   "language_info": {
891 |    "codemirror_mode": {
892 |     "name": "ipython",
893 |     "version": 3
894 |    },
895 |    "file_extension": ".py",
896 |    "mimetype": "text/x-python",
897 |    "name": "python",
898 |    "nbconvert_exporter": "python",
899 |    "pygments_lexer": "ipython3",
900 |    "version": "3.10.9"
901 |   }
902 |  },
903 |  "nbformat": 4,
904 |  "nbformat_minor": 5
905 | }
906 | --------------------------------------------------------------------------------