├── data ├── __init__.py ├── f_mean.npy ├── janestreet │ └── __init__.py ├── data_final_eda.py ├── data_denoise.py ├── data_rolling.py └── data_final.py ├── lgb ├── __init__.py └── v01_explore.ipynb ├── models └── __init__.py ├── mlp ├── __init__.py ├── run_train_final_3.py ├── debug_train_dense.py ├── run_train_embed.py ├── v08_submit.ipynb ├── run_train_final_1.py ├── run_train_base.py ├── run_train_denoise.py ├── debug_ae_tf.py ├── run_train_final_4.py ├── run_train_finetune.py ├── debug_embedding_1.py ├── debug_resnet_tf.py ├── run_train_final_2_overfit.py ├── run_train_final_2.py ├── debug_embedding_tag.py └── debug_train_utility_finetune.py ├── .gitattributes ├── janest.code-workspace ├── __init__.py ├── .gitignore ├── data.py ├── cv_splits.py ├── cv.py ├── iter_cv_torch.py ├── iter_cv.py ├── utils.py └── cv_final.py /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lgb/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import * 2 | from .tf_models import * -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.python linguist-detectable=true 2 | *.ipynb linguist-language=Python -------------------------------------------------------------------------------- /data/f_mean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scaomath/kaggle-jane-street/HEAD/data/f_mean.npy -------------------------------------------------------------------------------- /janest.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | } -------------------------------------------------------------------------------- /data/janestreet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .competition import make_env 3 | 4 | __all__ = ['make_env'] 5 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .utils_lgb import * 2 | from .utils import * 3 | # WORKSPACE_FOLDER=/home/scao/Documents/kaggle-jane-street 4 | # PYTHONPATH=${WORKSPACE_FOLDER}:${WORKSPACE_FOLDER}/nn -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pt 3 | *.png 4 | *.txt 5 | *.zip 6 | *.pkl 7 | *.pth 8 | *.csv 9 | data/janestreet/competition.cpython-37m-x86_64-linux-gnu.so 10 | data/example_sample_submission.csv 11 | *.parquet 12 | *.json 13 | *.index 14 | nn/untitled_project/ 15 | *.hdf5 16 | nn/ae_mlp_1127/ 17 | nn/ae_mlp_1127/untitled_project/ 18 | *.feather 19 | *.npy 20 | *.h5 21 | .ipynb_checkpoints/ 22 | -------------------------------------------------------------------------------- /mlp/run_train_final_3.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, GaussianNoise, Concatenate, Lambda, Activation 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.losses import BinaryCrossentropy 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.callbacks import EarlyStopping, Callback 7 | from tensorflow.keras.layers.experimental.preprocessing import Normalization 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | import kerastuner as kt 11 | from tensorflow.keras import backend as K 12 | 13 | import numpy as np 14 | import pandas as pd 15 | from tqdm.auto import tqdm 16 | from random import choices 17 | import os, sys 18 | current_path = os.path.dirname(os.path.abspath(__file__)) 19 | HOME = os.path.dirname(current_path) 20 | MODEL_DIR = os.path.join(HOME, 'models') 21 | DATA_DIR = os.path.join(HOME, 'data') 22 | sys.path.append(HOME) 23 | 24 | from utils import * 25 | # %% 26 | ''' 27 | Final model 2: 28 | 1. data: including the volatile day but excluding the outlier days (2, 294, 36, 270) 29 | 2. data: the fillna is using the past day mean (after excluding the days above) 30 | 3. data: target is only resp_{0,3,4} 31 | 4. Denoised auto-encoder 32 | 5. 
simple MLP tf model 33 | ''' 34 | # %% 35 | train = pd.read_parquet(os.path.join(DATA_DIR, 'train.parquet')) 36 | train_pdm = pd.read_parquet(os.path.join(DATA_DIR, 'train_pdm.parquet')) 37 | # %% 38 | -------------------------------------------------------------------------------- /mlp/debug_train_dense.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import tensorflow_addons as tfa 3 | import tensorflow as tf 4 | import tensorflow.keras.backend as K 5 | 6 | #%% 7 | resp_cols = ['resp','resp_1', 'resp_2', 'resp_3', 'resp_4'] 8 | target_cols = ['action','action_1', 'action_2', 'action_3', 'action_4'] 9 | 10 | 11 | #%% 12 | def mish(x): 13 | return tf.keras.layers.Lambda(lambda x: x*K.tanh(K.softplus(x)))(x) 14 | 15 | tf.keras.utils.get_custom_objects().update({'mish': tf.keras.layers.Activation(mish)}) 16 | 17 | def create_model(input_shape): 18 | 19 | inp = tf.keras.layers.Input(input_shape) 20 | tmp = tf.keras.layers.BatchNormalization()(inp) 21 | xs = [tmp] 22 | for _ in range(5): 23 | if len(xs) > 1: 24 | tmp = tf.keras.layers.Concatenate(axis=-1)(xs) 25 | else: 26 | tmp = xs[0] 27 | # tmp = tf.keras.layers.Dense(128, activation='mish')(tmp) 28 | tmp = tf.keras.layers.Dense(128, activation='swish')(tmp) 29 | tmp = tf.keras.layers.BatchNormalization()(tmp) 30 | tmp = tf.keras.layers.Dropout(0.2)(tmp) 31 | xs.append(tmp) 32 | 33 | output = tf.keras.layers.Dense(len(resp_cols),activation='sigmoid')(tf.keras.layers.Concatenate()(xs)) 34 | model = tf.keras.models.Model(inp,output) 35 | optimizer = tfa.optimizers.RectifiedAdam(1e-3) 36 | model.compile(optimizer, loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001), 37 | metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')]) 38 | return model 39 | # %% 40 | model = create_model(132) 41 | model.summary() 42 | # %% 43 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import kaggle 3 | from kaggle.api.kaggle_api_extended import KaggleApi 4 | import os, sys 5 | from utils import * 6 | import zipfile 7 | import pandas as pd 8 | import datatable as dt 9 | import numpy as np 10 | 11 | HOME = os.path.dirname(os.path.abspath(__file__)) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | ''' 17 | The API token from https://www.kaggle.com//account needs to put in ~/.kaggle/ folder in MacOS/Linux or to C:\\Users\\.kaggle\\ on Windows 18 | ''' 19 | 20 | train_dtypes = {'date': np.int32, 21 | 'ts_id': np.int64, 22 | 'resp': np.float64, 23 | 'weight': np.float64, 24 | # 'feature_0': np.int8 25 | } 26 | for c in range(1,5): 27 | train_dtypes['resp_'+str(c)] = np.float64 28 | for c in range(130): 29 | train_dtypes['feature_'+str(c)] = np.float32 30 | 31 | #%% 32 | if __name__ == "__main__": 33 | print(f"Current directory : {HOME}") 34 | print(f"Current data directory: {DATA_DIR}") 35 | data_file = find_files('train.csv', DATA_DIR) 36 | data_parquet = find_files('train.parquet', DATA_DIR) 37 | data_feather = find_files('train.feather', DATA_DIR) 38 | if not data_file: 39 | try: 40 | api = KaggleApi() 41 | api.authenticate() 42 | api.competition_download_files('jane-street-market-prediction', 43 | path=DATA_DIR, quiet=False) 44 | data_file = find_files('zip', DATA_DIR) 45 | with zipfile.ZipFile(data_file,"r") as f: 46 | f.extractall(DATA_DIR) 47 | except RuntimeError as err: 48 | 
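# the downloaded kaggle.json token is expected at ~/.kaggle/kaggle.json
# (C:\Users\<username>\.kaggle\ on Windows); restrict its permissions with chmod 600 on Linux/MacOS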
print(f"Needs API token: {err}") 49 | elif data_parquet and data_feather: 50 | print(f"Train parquet at: {data_parquet[0]}.") 51 | with timer("Loading train"): 52 | train = pd.read_parquet(data_parquet[0]) 53 | print(train.dtypes[:10]) 54 | print(train.dtypes[-10:]) 55 | 56 | print(f"Train feather at: {data_feather[0]}.") 57 | with timer("Loading train"): 58 | train = pd.read_feather(data_feather[0]) 59 | print(train.dtypes[:10]) 60 | print(train.dtypes[-10:]) 61 | 62 | elif not data_parquet and data_feather: 63 | with timer("Processing train parquet"): 64 | # train = pd.read_csv(data_file[0]) 65 | # train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) 66 | train = dt.fread(data_file[0], 67 | columns=set(train_dtypes.keys())).to_pandas().astype(train_dtypes) 68 | train.set_index('ts_id') 69 | train.to_parquet(os.path.join(DATA_DIR,'train.parquet')) 70 | else: 71 | with timer("Processing train feather"): 72 | train = dt.fread(data_file[0], 73 | columns=set(train_dtypes.keys())).to_pandas().astype(train_dtypes) 74 | train.set_index('ts_id') 75 | train.to_feather(os.path.join(DATA_DIR,'train.feather')) -------------------------------------------------------------------------------- /data/data_final_eda.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | current_path = os.path.dirname(os.path.abspath(__file__)) 5 | HOME = os.path.dirname(current_path) 6 | sys.path.append(HOME) 7 | 8 | import pandas as pd 9 | pd.set_option('display.max_rows', 100) 10 | pd.set_option('display.max_columns', 100) 11 | 12 | import numpy as np 13 | import datatable as dt 14 | from tqdm.auto import tqdm 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | sns.set(style="darkgrid", context="talk") 19 | from jupyterthemes import jtplot 20 | jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False) 21 | 22 | 23 | MODEL_DIR = HOME+'/models/' 24 | DATA_DIR = HOME+'/data/' 25 | from utils import * 26 | from utils_js import * 27 | #%% 28 | train_parquet = os.path.join(DATA_DIR, 'train_final.parquet') 29 | train_final = pd.read_parquet(train_parquet) 30 | 31 | train_parquet = os.path.join(DATA_DIR, 'train_final_ver1.parquet') 32 | train_final_ver1 = pd.read_parquet(train_parquet) 33 | 34 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 35 | train_orig = pd.read_parquet(train_parquet) 36 | 37 | #%% 38 | features_csv = os.path.join(DATA_DIR, 'features.csv') 39 | features = pd.read_csv(features_csv) 40 | tags = [t for t in list(features.iloc[:,1:])] 41 | tags_dict = {} 42 | for tag in tags: 43 | tags_dict[tag] = features[features[tag] == True]['feature'].to_list() 44 | # print(tag) 45 | feat_num = " ".join([t.split('_')[-1] for t in tags_dict[tag]]) 46 | # print(f"Features: {feat_num}") 47 | 48 | 49 | def plot_features(feats, train, scatter=False, num_days=3, start_day=None): 50 | if not start_day: 51 | start_day = np.random.randint(0, 500-num_days, 1)[0] 52 | days = [start_day+i for i in range(num_days)] 53 | days_str = " ".join([str(d) for d in days]) 54 | 55 | num_feat = len(feats) 56 | _, axes = plt.subplots(num_feat, 1, figsize=(15,num_feat*2), constrained_layout=True) 57 | cmap = get_cmap(num_feat*2, cmap='RdYlGn') 58 | for i in range(num_feat): 59 | feat = feats[i] 60 | feat_vals = train[train['date'].isin(days)][feat].reset_index(drop=True) 61 | if scatter: 62 | axes[i].scatter(pd.Series(feat_vals).index, pd.Series(feat_vals), s=5, 
color=cmap(i)) 63 | else: 64 | axes[i].plot(pd.Series(feat_vals).index, pd.Series(feat_vals), lw=1, color=cmap(i)) 65 | axes[i].set_title (feat+" at "+days_str, fontsize=15); 66 | axes[i].set_xlim(xmin=0) 67 | # %% 68 | plot_features(tags_dict['tag_22'], train_final, scatter=True) 69 | 70 | 71 | # %% 72 | plot_features(tags_dict['tag_2'], train_final) 73 | # %% 74 | # feats = ['feature_74', 'feature_80', 'feature_86', 'feature_92', 'feature_98', 'feature_104'] 75 | # feats = ['feature_106', 'feature_118'] 76 | feats = ['feature_98', 'feature_104'] 77 | plot_features(feats, train_final, start_day=320, num_days=2) 78 | # plt.savefig(DATA_DIR+'feat_98_104_fillna_pdm.png') 79 | plot_features(feats, train_final_ver1, start_day=320, num_days=2) 80 | plot_features(feats, train_orig, start_day=320,num_days=2) 81 | # plt.savefig(DATA_DIR+'feat_98_104.png') 82 | # %% 83 | train_final['feature_92'].value_counts().sort_values(ascending=False) 84 | train_final.query('date in [320]')['feature_92'].value_counts().sort_values(ascending=False) 85 | # %% 86 | feats = ['feature_1', 'feature_69'] 87 | start_day = np.random.randint(0, 500-3, 1)[0] 88 | plot_features(feats, train_final, start_day=start_day) 89 | plot_features(feats, train, start_day=start_day) 90 | 91 | #%% 92 | feat_spike_index = [1, 2, 69, 71, 85, 87, 88, 91, 93, 94, 97, 99, 100, 103, 105, 106] 93 | # feats = ['feature_100', 'feature_106'] 94 | feats = ['feature_1', 'feature_2', 'feature_69'] 95 | start_day = np.random.randint(0, 500-3, 1)[0] 96 | plot_features(feats, train_final, start_day=start_day, scatter=True) 97 | plot_features(feats, train, start_day=start_day, scatter=True) 98 | # %% 99 | train[['feature_85','feature_91']].value_counts() -------------------------------------------------------------------------------- /cv_splits.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import datetime 3 | import gc 4 | import os 5 | import random 6 | import sys 7 | 8 | import datatable as dt 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | from numba import njit 14 | from sklearn.metrics import roc_auc_score 15 | from tqdm import tqdm 16 | 17 | HOME = os.path.dirname(os.path.abspath(__file__)) 18 | MODEL_DIR = HOME+'/models/' 19 | DATA_DIR = HOME+'/data/' 20 | from mlp.mlp import * 21 | from utils import * 22 | from utils_js import * 23 | 24 | get_system() 25 | 26 | import warnings 27 | 28 | from tqdm.auto import tqdm 29 | 30 | warnings.simplefilter(action='ignore', category=FutureWarning) 31 | warnings.simplefilter(action='ignore', category=pd.core.common.SettingWithCopyWarning) 32 | 33 | plt.style.use('bmh') 34 | plt.rcParams['figure.figsize'] = [14, 8] # width, height 35 | 36 | #%% 37 | from matplotlib.colors import ListedColormap 38 | import numpy as np 39 | import matplotlib.pyplot as plt 40 | 41 | # this is code slightly modified from the sklearn docs here: 42 | # https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py 43 | def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): 44 | """Create a sample plot for indices of a cross-validation object.""" 45 | 46 | cmap_cv = plt.cm.coolwarm 47 | 48 | jet = plt.cm.get_cmap('jet', 256) 49 | seq = np.linspace(0, 1, 256) 50 | _ = np.random.shuffle(seq) # inplace 51 | cmap_data = ListedColormap(jet(seq)) 52 | 53 | # Generate the training/testing visualizations for each CV split 54 | for ii, (tr, tt) in 
enumerate(cv.split(X=X, y=y, groups=group)): 55 | # Fill in indices with the training/test groups 56 | indices = np.array([np.nan] * len(X)) 57 | indices[tt] = 1 58 | indices[tr] = 0 59 | 60 | # Visualize the results 61 | ax.scatter(range(len(indices)), [ii + .5] * len(indices), 62 | c=indices, marker='_', lw=lw, cmap=cmap_cv, 63 | vmin=-.2, vmax=1.2) 64 | 65 | # Plot the data classes and groups at the end 66 | ax.scatter(range(len(X)), [ii + 1.5] * len(X), 67 | c=y, marker='_', lw=lw, cmap=plt.cm.Set3) 68 | 69 | ax.scatter(range(len(X)), [ii + 2.5] * len(X), 70 | c=group, marker='_', lw=lw, cmap=cmap_data) 71 | 72 | # Formatting 73 | yticklabels = list(range(n_splits)) + ['target', 'day'] 74 | ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels, 75 | xlabel='Sample index', ylabel="CV iteration", 76 | ylim=[n_splits+2.2, -.2], xlim=[0, len(y)]) 77 | ax.set_title('{}'.format(type(cv).__name__), fontsize=15) 78 | return ax 79 | # %% 80 | n_samples = 20000 81 | n_groups = 500 82 | assert n_samples % n_groups == 0 83 | 84 | idx = np.linspace(0, n_samples-1, num=n_samples) 85 | X_train = np.random.random(size=(n_samples, 5)) 86 | y_train = np.random.choice([0, 1], n_samples) 87 | groups = np.repeat(np.linspace(0, n_groups-1, num=n_groups), n_samples/n_groups) 88 | 89 | fig, ax = plt.subplots() 90 | 91 | cv = PurgedGroupTimeSeriesSplit( 92 | n_splits=5, 93 | max_train_group_size=300, 94 | group_gap=5, 95 | max_test_group_size=40 96 | ) 97 | 98 | plot_cv_indices(cv, X_train, y_train, groups, ax, 5, lw=20); 99 | # %% 100 | train_parquet = os.path.join(DATA_DIR, 'train_final.parquet') 101 | train_final = pd.read_parquet(train_parquet) 102 | # %% 103 | fig, ax = plt.subplots() 104 | 105 | cv = PurgedGroupTimeSeriesSplit( 106 | n_splits=5, 107 | max_train_group_size=15, 108 | group_gap=5, 109 | max_test_group_size=5 110 | ) 111 | 112 | plot_cv_indices( 113 | cv, 114 | train_final.query('date < 50')[ 115 | train_final.columns[train_final.columns.str.contains('feature')] 116 | ].values, 117 | (train_final.query('date < 50')['resp'] > 0).astype(int).values, 118 | train_final.query('date < 50')['date'].values, 119 | ax, 120 | 5, 121 | lw=20 122 | ); -------------------------------------------------------------------------------- /cv.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | import pandas as pd 4 | import numpy as np 5 | import datatable as dt 6 | 7 | HOME = os.path.dirname(os.path.abspath(__file__)) 8 | MODEL_DIR = os.path.join(HOME, 'models') 9 | DATA_DIR = os.path.join(HOME, 'data') 10 | sys.path.append(HOME) 11 | from utils import * 12 | from mlp.mlp import * 13 | # %% 14 | ''' 15 | Current CV uses train.query('date>450') 16 | Model: pt models 17 | ''' 18 | target_cols = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4'] 19 | N_FOLDS = 5 20 | N_MODELS = 5 21 | BATCH_SIZE = 8192 22 | VALID_DATE = 450 23 | model_list = [MODEL_DIR+f'/resmlp_{i}.pth' for i in range(N_FOLDS)] # baseline 24 | 25 | feat_cols = [f'feature_{i}' for i in range(130)] 26 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 27 | # f = median_avg 28 | f = np.median 29 | 30 | #%% 31 | 32 | def get_valid_df(date, fillna = 'mean'): 33 | data_file = find_files('train.csv', DATA_DIR) 34 | train = dt.fread(data_file[0]).to_pandas() 35 | _feat_cols = [f'feature_{i}' for i in range(130)] 36 | if fillna == 'mean': 37 | f_mean = np.mean(train[_feat_cols[1:]].values, axis=0) # for inference 38 | train.fillna(train.mean(),inplace=True) 39 | elif 
fillna == 'ffill': 40 | train[_feat_cols[1:]] = train[_feat_cols[1:]].fillna(method = 'ffill').fillna(0) 41 | else: # TO_DO: customized fillna_func 42 | pass 43 | 44 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 45 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5) 46 | train['action_0'] = (train['resp'] > 0).astype(int) 47 | for c in range(1,5): 48 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 49 | print(f'action based on resp_{c} mean: ', train['action_'+str(c)].mean()) 50 | valid = train.query(f'date > {date}').reset_index(drop = True) 51 | valid.to_parquet(os.path.join(DATA_DIR,'valid.parquet')) 52 | 53 | def load_models(pt_model_files): 54 | ''' 55 | baseline mlp models in the mlp.mlp submodule 56 | ''' 57 | assert len(pt_model_files) == N_FOLDS 58 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 59 | models = [] 60 | for model_file in pt_model_files: 61 | model = ResidualMLP(output_size=len(target_cols)) 62 | model.to(device) 63 | try: 64 | model.load_state_dict(torch.load(model_file)) 65 | except: 66 | model.load_state_dict(torch.load(model_file, map_location='cpu')) 67 | model.eval() 68 | models.append(model) 69 | return models 70 | 71 | 72 | def cv_score(valid_df, models, f=np.mean, thresh=0.5, device=None): 73 | print(f"Using {f.__qualname__} as ensembler.") 74 | if device is None: 75 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 76 | valid_pred = np.zeros((len(valid_df), len(target_cols))) 77 | valid_set = MarketDataset(valid_df, features=feat_cols, targets=target_cols) 78 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 79 | 80 | for _fold in range(len(models)): 81 | torch.cuda.empty_cache() 82 | model = models[_fold] 83 | valid_pred_fold = valid_epoch(model, valid_loader, device).reshape(-1, len(target_cols)) 84 | valid_pred += valid_pred_fold / len(models) 85 | valid_auc = roc_auc_score(valid_df[target_cols].values.astype(float), valid_pred) 86 | logloss_score = log_loss(valid_df[target_cols].values.astype(float), valid_pred) 87 | 88 | # valid_pred = f(valid_pred[...,:len(target_cols)], axis=-1) # only first 5 89 | valid_pred = f(valid_pred, axis=-1) # all 90 | valid_pred = np.where(valid_pred >= thresh, 1, 0).astype(int) 91 | valid_score = utility_score_bincount(date=valid_df.date.values, 92 | weight=valid_df.weight.values, 93 | resp=valid_df.resp.values, 94 | action=valid_pred) 95 | valid_score_max = utility_score_bincount(date=valid_df.date.values, 96 | weight=valid_df.weight.values, 97 | resp=valid_df.resp.values, 98 | action=(valid_df.resp.values>0)) 99 | print(f'Max utils score: {valid_score_max:.2f}') 100 | print(f'{len(models)} models valid score: {valid_score:.2f} \t auc: {valid_auc:.4f}') 101 | 102 | 103 | # %% 104 | if __name__ == '__main__': 105 | 106 | print(f"Current valid set is date after {VALID_DATE}.\n") 107 | valid_parquet = find_files('valid.parquet', DATA_DIR) 108 | if not valid_parquet: 109 | with timer("Generating validation df"): 110 | get_valid_df(VALID_DATE) 111 | valid_parquet = find_files('valid.parquet', DATA_DIR) 112 | with timer("Generating valid loader"): 113 | valid = pd.read_parquet(valid_parquet[0]) 114 | valid_set = MarketDataset(valid, features=feat_cols, targets=target_cols) 115 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 116 | models = load_models(model_list) 117 | cv_score(valid, models, f=f) 118 | 119 | 120 | ''' 121 | Lindada's model scores on date > 450: 122 | model 0: 4948 123 | 
model 1: 5641 124 | model 2: 5282 125 | model 3: 5825 126 | model 4: 5849 127 | all five: 6165 128 | ''' 129 | 130 | # %% 131 | -------------------------------------------------------------------------------- /mlp/run_train_embed.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | import pandas as pd 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchsummary import summary 8 | 9 | current_path = os.path.dirname(os.path.abspath(__file__)) 10 | HOME = os.path.dirname(current_path) 11 | MODEL_DIR = os.path.join(HOME, 'models') 12 | DATA_DIR = os.path.join(HOME, 'data') 13 | sys.path.append(HOME) 14 | 15 | from mlp import * 16 | from utils import * 17 | from utils_js import * 18 | 19 | #%% 20 | ''' 21 | Training script of the embedding model 22 | ''' 23 | 24 | 25 | HIDDEN_LAYERS = [400, 400, 400] # hidden layer size for the embedding model 26 | N_FEATURES = 130 27 | N_FEAT_TAGS = 29 28 | N_TARGETS = 6 29 | N_DENOISED_TARGET = 1 30 | 31 | BATCH_SIZE = 8196 32 | 33 | FINETUNE_BATCH_SIZE = 204_800 34 | 35 | EPOCHS = 50 36 | EARLYSTOP_NUM = 6 37 | 38 | LEARNING_RATE = 1e-3 39 | WEIGHT_DECAY = 1e-4 40 | 41 | 42 | 43 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 44 | 45 | feat_tag_file = os.path.join(DATA_DIR, 'features.csv') 46 | feat_cols = [f'feature_{i}' for i in range(130)] 47 | resp_cols = ['resp', 'resp_dn_0', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 48 | target_cols = ['action', 'action_dn_0', 'action_1', 'action_2', 'action_3', 'action_4'] 49 | 50 | # %% 51 | with timer("Preprocessing train"): 52 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 53 | train = pd.read_parquet(train_parquet) 54 | train = train.query ('date > 85').reset_index (drop = True) 55 | # df = df[df['weight'] != 0].reset_index (drop = True) 56 | 57 | train.fillna(train.mean(),inplace=True) 58 | train = add_denoised_target(train, num_dn_target=N_DENOISED_TARGET) 59 | 60 | train['action'] = (train['resp'] > 0).astype('int') 61 | 62 | print(f'action based on resp mean: ', train['action'].astype(int).mean()) 63 | print(f'action based on resp_dn_{0} mean:', train[f'action_dn_{0}'].astype(int).mean()) 64 | 65 | for c in range(1,5): 66 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 67 | print(f'action based on resp_{c} mean: ', train[f'action_{c}'].astype(int).mean()) 68 | 69 | valid = train.loc[train.date > 450].reset_index(drop=True) 70 | # %% 71 | # %% 72 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 73 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 74 | 75 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 76 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 77 | 78 | #%% 79 | model = EmbedFNN(hidden_layers=HIDDEN_LAYERS, output_dim=len(target_cols)) 80 | model.to(device); 81 | summary(model, input_size=(len(feat_cols), )) 82 | 83 | 84 | util_cols = resp_cols 85 | resp_index = [resp_cols.index(r) for r in util_cols] 86 | 87 | regularizer = UtilityLoss(alpha=1e-1, scaling=12, normalize=None, resp_index=resp_index) 88 | 89 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 90 | 91 | 92 | 93 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 94 | # optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 95 | 96 | # scheduler = None 97 | scheduler = 
torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 98 | steps_per_epoch=len(train_loader), 99 | epochs=EPOCHS) 100 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 101 | # T_0=10, T_mult=1, 102 | # eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 103 | 104 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 105 | 106 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-2) 107 | finetune_scheduler = None 108 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=5900) 109 | # %% 110 | _fold = 7 111 | SEED = 802 112 | get_seed(SEED+SEED*_fold) 113 | lr = [] 114 | 115 | for epoch in range(EPOCHS): 116 | 117 | train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 118 | lr.append(optimizer.param_groups[0]['lr']) 119 | if (epoch+1) % 10 == 0: 120 | _ = train_epoch_finetune(model, finetune_optimizer, finetune_scheduler, 121 | regularizer, finetune_loader, device, 122 | loss_fn=loss_fn) 123 | 124 | valid_pred = valid_epoch(model, valid_loader, device) 125 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 126 | f=median_avg, threshold=0.5, target_cols=target_cols) 127 | model_file = MODEL_DIR + \ 128 | f"/emb_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 129 | early_stop(valid_auc, model, model_path=model_file, 130 | epoch_utility_score=valid_score) 131 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 132 | tqdm.write( 133 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 134 | tqdm.write( 135 | f"Best util: {early_stop.best_utility_score:.2f} \t {early_stop.message} ") 136 | tqdm.write( 137 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 138 | if early_stop.early_stop: 139 | print("\nEarly stopping") 140 | break 141 | # %% 142 | CV_START_DAY = 100 143 | CV_DAYS = 50 144 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 145 | batch_size = 8192, f=median_avg, threshold=0.5, 146 | target_cols=target_cols, feat_cols=feat_cols, resp_cols=resp_cols) 147 | # %% 148 | -------------------------------------------------------------------------------- /iter_cv_torch.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.metrics import roc_auc_score 8 | import torch 9 | from numba import njit 10 | import random 11 | import datetime 12 | 13 | HOME = os.path.dirname(os.path.abspath(__file__)) 14 | MODEL_DIR = HOME+'/models/' 15 | DATA_DIR = HOME+'/data/' 16 | from utils import * 17 | from utils_js import * 18 | from mlp.mlp import * 19 | get_system() 20 | # %% 21 | DEBUG = False 22 | SEED = 1127 23 | START_SIMU_TEST = 490 # this day to 499 as simulated test days 24 | END_SIMU_TEST = 499 25 | TQDM_INT = 20 26 | batch_size = 4096 27 | N_FOLDS = 5 28 | N_MODELS = 3 29 | 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | #%% 33 | with timer("Loading train parquet"): 34 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 35 | train = pd.read_parquet(train_parquet) 36 | 37 | train['action'] = (train['resp'] > 0).astype(int) 38 | for c in range(1,5): 39 | train['action'] = train['action'] & ((train['resp_'+str(c)] > 0)) 40 | features = [c for c in train.columns if 'feature' in c] 41 | 42 | f_mean = 
np.mean(train[features[1:]].values, axis=0) 43 | 44 | simu_test = train.query(f'date > {START_SIMU_TEST} & date <= {END_SIMU_TEST}').reset_index(drop = True) 45 | print(f"Simulated public test file length: {len(simu_test)}") 46 | 47 | 48 | # %% 49 | class Iter_Valid(object): 50 | 51 | global predicted 52 | predicted = [] 53 | 54 | def __init__(self, df, features, batch_size = 1): 55 | df = df.reset_index(drop=True) 56 | self.columns = ['weight'] + features + ['date'] 57 | self.df = df[self.columns] 58 | self.weight = df['weight'].astype(float).values 59 | self.action = df['action'].astype(int).values 60 | self.pred_df = df[['action']] 61 | # self.pred_df[['action']] = 0 62 | self.len = len(df) 63 | self.current = 0 64 | self.batch_size = batch_size 65 | 66 | def __iter__(self): 67 | return self 68 | 69 | def __next__(self): 70 | pre_start = self.current 71 | self.current += self.batch_size 72 | if self.current <= self.len: 73 | df = self.df[pre_start:self.current].copy() 74 | pred_df = self.pred_df[pre_start:self.current].copy() 75 | return df, pred_df 76 | elif self.current > self.len and (self.current - self.len < self.batch_size): 77 | df = self.df[pre_start:self.len].copy() 78 | pred_df = self.pred_df[pre_start:self.len].copy() 79 | return df, pred_df 80 | else: 81 | raise StopIteration() 82 | 83 | def predict(self, pred_df): 84 | predicted.append(pred_df) 85 | # %% 86 | model_list = [] 87 | for _fold in range(N_FOLDS): 88 | torch.cuda.empty_cache() 89 | model = ResidualMLP() 90 | model.to(device) 91 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 92 | try: 93 | model.load_state_dict(torch.load(model_weights)) 94 | except: 95 | model.load_state_dict(torch.load(model_weights, map_location=torch.device('cpu'))) 96 | model.eval() 97 | n_params = get_num_params(model) 98 | print(f"Fold {_fold} model has {n_params} params.") 99 | model_list.append(model) 100 | 101 | model_list = model_list[-N_MODELS:] 102 | 103 | # %% 104 | if __name__ == '__main__': 105 | ''' 106 | inference simulation 107 | Using a customized class 108 | 109 | 110 | For the pytorch res+mlp model for day 490-499: 111 | 112 | 5 models, np.median: 1082.92 113 | 5 models, np.mean: 1030.73 114 | 5 models, median avg: 1067.43 115 | 3 models, np.median, 0.498 thresh: 1096.30 116 | 3 models, np.median, 0.497 thresh: 1116.35 117 | 3 models, np.median, 0.496 thresh: 1104.17 118 | 3 models, np.mean, 0.497 thresh: 1082 119 | 3 models, np.median, 0.502 thresh: 1088.58 120 | ''' 121 | date = simu_test['date'].values 122 | weight = simu_test['weight'].values 123 | resp = simu_test['resp'].values 124 | action = simu_test['action'].values 125 | 126 | # f = np.mean # 127 | # f = np.median 128 | f = median_avg 129 | 130 | thresh = 0.502 131 | print(f"\n\nPredicting the action using {thresh:.3f} threshold with {N_MODELS} models.") 132 | iter_test = Iter_Valid(simu_test, features) 133 | start = time() 134 | 135 | pbar = tqdm(total=len(simu_test)) 136 | for idx, (test_df, pred_df) in enumerate(iter_test): 137 | 138 | if test_df['weight'].item() > 0: 139 | x_tt = test_df.loc[:, features].values 140 | if np.isnan(x_tt[:, 1:].sum()): 141 | x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean 142 | 143 | cross_41_42_43 = x_tt[:, 41] + x_tt[:, 42] + x_tt[:, 43] 144 | cross_1_2 = x_tt[:, 1] / (x_tt[:, 2] + 1e-5) 145 | feature_inp = np.concatenate((x_tt, 146 | np.array(cross_41_42_43).reshape(x_tt.shape[0], 1), 147 | np.array(cross_1_2).reshape(x_tt.shape[0], 1)), axis=1)
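# ensemble sketch: each fold model outputs one sigmoid probability per target; the
# probabilities are averaged over the N_MODELS folds, collapsed across targets by f
# (median_avg here, assumed to come from utils_js), and the action is 1 when the
# collapsed probability clears `thresh` (0.502 above, picked from the day 490-499 runs)
148 | pred = np.zeros((1, 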
len(target_cols))) 149 | for model in model_list: 150 | pred += model(torch.tensor(feature_inp, dtype=torch.float).to(device))\ 151 | .sigmoid().detach().cpu().numpy() / N_MODELS 152 | pred = f(pred.squeeze()) 153 | pred_df.action = np.where(pred >= thresh, 1, 0).astype(int) 154 | else: 155 | pred_df.action = 0 156 | 157 | iter_test.predict(pred_df) 158 | 159 | time_taken = time() - start 160 | total_time_est = time_taken / (idx+1) * 1000000 / 60 161 | pbar.set_description(f"Current speed = {total_time_est:.1f} minutes to complete inference") 162 | pbar.update() 163 | 164 | y_true = simu_test['action'] 165 | y_pred = pd.concat(predicted)['action'] 166 | print('\nValidation auc:', roc_auc_score(y_true, y_pred)) 167 | score = utility_score_bincount(date, weight, resp, y_true) 168 | score_pred = utility_score_bincount(date, weight, resp, y_pred) 169 | print('\nMax possible utility score:', score) 170 | print('\nModel utility score: ', score_pred) -------------------------------------------------------------------------------- /data/data_denoise.py: -------------------------------------------------------------------------------- 1 | #%% denoising target 2 | import os, sys 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from scipy.optimize import minimize 7 | from sklearn.neighbors import KernelDensity 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | sys.path.append(HOME) 13 | for f in ['/home/scao/anaconda3/lib/python3.8/lib-dynload', 14 | '/home/scao/anaconda3/lib/python3.8/site-packages']: 15 | sys.path.append(f) 16 | MODEL_DIR = HOME+'/models/' 17 | DATA_DIR = HOME+'/data/' 18 | from utils import * 19 | from utils_js import * 20 | # %% 21 | 22 | ''' 23 | By Lucas Morin 24 | https://www.kaggle.com/lucasmorin/target-engineering-patterns-denoising 25 | ''' 26 | 27 | def mpPDF(var,q,pts): 28 | # Marcenko-Pastur pdf 29 | # q=T/N 30 | eMin, eMax = var*(1-(1./q)**.5)**2, var*(1+(1./q)**.5)**2 31 | eVal = np.linspace(eMin,eMax,pts) 32 | pdf = q/(2*np.pi*var*eVal)*((eMax-eVal)*(eVal-eMin))**.5 33 | pdf = pd.Series(pdf.reshape(-1,), index=eVal.reshape(-1,)) 34 | return pdf 35 | 36 | 37 | def getPCA(matrix): 38 | # Get eVal,eVec from a Hermitian matrix 39 | eVal,eVec = np.linalg.eigh(matrix) 40 | indices=eVal.argsort()[::-1] # arguments for sorting eVal desc 41 | eVal,eVec=eVal[indices],eVec[:,indices] 42 | eVal=np.diagflat(eVal) 43 | return eVal,eVec 44 | 45 | def fitKDE(obs,bWidth=.25,kernel='gaussian',x=None): 46 | # Fit kernel to a series of obs, and derive the prob of obs 47 | # x is the array of values on which the fit KDE will be evaluated 48 | if len(obs.shape)==1: 49 | obs=obs.reshape(-1,1) 50 | kde=KernelDensity(kernel=kernel,bandwidth=bWidth).fit(obs) 51 | if x is None: 52 | x=np.unique(obs).reshape(-1,) 53 | if len(x.shape)==1: 54 | x=x.reshape(-1,1) 55 | logProb=kde.score_samples(x) # log(density) 56 | pdf=pd.Series(np.exp(logProb),index=x.flatten()) 57 | return pdf 58 | 59 | def cov2corr(cov): 60 | # Derive the correlation matrix from a covariance matrix 61 | std=np.sqrt(np.diag(cov)) 62 | corr=cov/np.outer(std,std) 63 | corr[corr<-1],corr[corr>1]=-1,1 # numerical error 64 | return corr 65 | 66 | def errPDFs(var,eVal,q,bWidth,pts=1000): 67 | # Fit error 68 | pdf0=mpPDF(var,q,pts) # theoretical pdf 69 | pdf1=fitKDE(eVal,bWidth,x=pdf0.index.values) # empirical pdf 70 | sse=np.sum((pdf1-pdf0)**2) 71 | return sse 72 | 73 | def findMaxEval(eVal,q,bWidth): 74 | # 
Find max random eVal by fitting Marcenko’s dist 75 | out=minimize(lambda *x:errPDFs(*x),.5,args=(eVal,q,bWidth), 76 | bounds=((1E-5,1-1E-5),)) 77 | if out['success']: 78 | var=out['x'][0] 79 | else: 80 | var=1 81 | eMax=var*(1+(1./q)**.5)**2 82 | return eMax,var 83 | 84 | def denoisedCorr(eVal,eVec,nFacts): 85 | # Remove noise from corr by fixing random eigenvalues 86 | eVal_=np.diag(eVal).copy() 87 | eVal_[nFacts:]=eVal_[nFacts:].sum()/float(eVal_.shape[0] - nFacts) 88 | eVal_=np.diag(eVal_) 89 | corr1=np.dot(eVec,eVal_).dot(eVec.T) 90 | corr1=cov2corr(corr1) 91 | return corr1 92 | 93 | def denoisedCorr2(eVal,eVec,nFacts,alpha=0): 94 | # Remove noise from corr through targeted shrinkage 95 | eValL,eVecL=eVal[:nFacts,:nFacts],eVec[:,:nFacts] 96 | eValR,eVecR=eVal[nFacts:,nFacts:],eVec[:,nFacts:] 97 | corr0=np.dot(eVecL,eValL).dot(eVecL.T) 98 | corr1=np.dot(eVecR,eValR).dot(eVecR.T) 99 | corr2=corr0+alpha*corr1+(1-alpha)*np.diag(np.diag(corr1)) 100 | return corr2 101 | 102 | 103 | class RMTDenoising(BaseEstimator, TransformerMixin): 104 | 105 | def __init__(self, bWidth=.01, alpha=.5, feature_0=True, sample=0.3, seed=2021): 106 | self.bWidth = bWidth 107 | self.alpha = alpha 108 | self.feature_0 = feature_0 109 | self.sample = sample 110 | self.seed = seed 111 | 112 | def denoise(self, X): 113 | sample = X.sample(frac=self.sample, random_state=self.seed) 114 | q = X.shape[0] / X.shape[1] 115 | cov = sample.cov().values 116 | corr0 = cov2corr(cov) 117 | 118 | eVal0, eVec0 = getPCA(corr0) 119 | eMax0, var0 = findMaxEval(np.diag(eVal0), q, bWidth=self.bWidth) 120 | nFacts0 = eVal0.shape[0] - np.diag(eVal0)[::-1].searchsorted(eMax0) 121 | corr1 = denoisedCorr2(eVal0, eVec0, nFacts0, alpha=self.alpha) 122 | eVal1, eVec1 = getPCA(corr1) 123 | #result = np.hstack((np.diag(eVal1), var0)) 124 | #name = [f'eigen_{i+1}' for i in range(len(eVal1))] + ['var_explained'] 125 | return eVec1[:, :nFacts0] 126 | 127 | def fit(self, X, y=None): 128 | if self.feature_0: 129 | self.cols_ = [c for c in X.columns if c != 'feature_0'] 130 | else: 131 | self.cols_ = list(X.columns) 132 | X_ = X[self.cols_] 133 | self.W_ = self.denoise(X_) 134 | self.dim_W_ = self.W_.shape[1] 135 | return self 136 | 137 | def transform(self, X, y=None): 138 | X_ = X.copy() 139 | names = [f'proj_{i}' for i in range(self.dim_W_)] 140 | projection = pd.DataFrame(fast_fillna(X_[self.cols_].values, 0).dot(self.W_), columns=names) 141 | if self.feature_0: 142 | projection['feature_0'] = X['feature_0'] 143 | return projection 144 | # %% 145 | if __name__ == '__main__': 146 | with timer("Preprocessing train"): 147 | train_file = os.path.join(DATA_DIR, 'train.parquet') 148 | train = pd.read_parquet(train_file) 149 | 150 | # train = train.loc[train.date > 85].reset_index(drop=True) 151 | drop_days = [2, 36, 270, 294] 152 | train = train.query(f'date not in {drop_days}').reset_index(drop=True) 153 | 154 | ''' 155 | 0: all resps 156 | 1: resp, 3, 4 157 | 2: resp, 1, 2 158 | ''' 159 | _f = 0 160 | targets = ['resp','resp_1','resp_2','resp_3','resp_4'] 161 | # targets = ['resp','resp_3','resp_4'] 162 | # targets = ['resp','resp_1','resp_2'] 163 | # targets = ['resp','resp_2','resp_4'] 164 | targets_f0 = targets + ['feature_0'] 165 | 166 | target_tf = RMTDenoising(sample=0.8, seed=1127802+_f) 167 | 168 | target_tf.fit(train[targets_f0]) 169 | 170 | targets_denoised = target_tf.transform(train[targets_f0]) 171 | targets_denoised = targets_denoised.rename(columns={'proj_0': f'resp_dn_{_f}'}) 172 | targets_denoised[[f'resp_dn_{_f}']] = 
-targets_denoised[f'resp_dn_{_f}'].values 173 | print(targets_denoised.head(10)) 174 | print(train[targets_f0].head(10)) 175 | targets_denoised[[f'resp_dn_{_f}']].to_csv(os.path.join(DATA_DIR,f'target_dn_{_f}.csv'), index=False) 176 | -------------------------------------------------------------------------------- /mlp/v08_submit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import time\n", 11 | "import pickle\n", 12 | "import random\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "from tqdm import tqdm\n", 16 | "from sklearn.metrics import log_loss, roc_auc_score\n", 17 | "\n", 18 | "import torch\n", 19 | "import torch.nn as nn\n", 20 | "from torch.autograd import Variable\n", 21 | "from torch.utils.data import DataLoader\n", 22 | "from torch.nn import CrossEntropyLoss, MSELoss\n", 23 | "from torch.nn.modules.loss import _WeightedLoss\n", 24 | "import torch.nn.functional as F\n", 25 | "\n", 26 | "import sys\n", 27 | "sys.path.insert(0, '../data/')\n", 28 | "import janestreet\n", 29 | "\n", 30 | "pd.set_option('display.max_columns', 100)\n", 31 | "pd.set_option('display.max_rows', 100)\n", 32 | "\n", 33 | "CACHE_PATH = './v08_pytorch_benchmark/'\n", 34 | "NFOLDS = 5" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "feat_cols = [f'feature_{i}' for i in range(130)]\n", 44 | "target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4']\n", 45 | "all_feat_cols = feat_cols.copy()\n", 46 | "all_feat_cols.extend(['cross_41_42_43', 'cross_1_2'])\n", 47 | "f_mean = np.load(f'{CACHE_PATH}/f_mean_online.npy')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Prediction" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "class MLPModel(nn.Module):\n", 64 | " \n", 65 | " # training parameters\n", 66 | " epochs = 200\n", 67 | " label_smoothing = 1e-2\n", 68 | " learning_rate = 1e-3\n", 69 | " \n", 70 | " # model parameters\n", 71 | " hidden_units = [160, 160, 160]\n", 72 | " dropout_rates = [0.2, 0.2, 0.2, 0.2]\n", 73 | " num_columns = len(all_feat_cols)\n", 74 | " num_labels = len(target_cols)\n", 75 | " units = [num_columns] + hidden_units + [num_labels]\n", 76 | " \n", 77 | " def __init__(self):\n", 78 | " super(MLPModel, self).__init__()\n", 79 | " self.batch_norm = nn.ModuleList()\n", 80 | " self.dropout = nn.ModuleList()\n", 81 | " self.dense = nn.ModuleList()\n", 82 | " \n", 83 | " for i in range(len(self.units) - 1):\n", 84 | " self.batch_norm.append(nn.BatchNorm1d(self.units[i]))\n", 85 | " self.dropout.append(nn.Dropout(self.dropout_rates[i]))\n", 86 | " self.dense.append(nn.Linear(self.units[i], self.units[i + 1]))\n", 87 | " \n", 88 | " self.activation = nn.SiLU()\n", 89 | " \n", 90 | " def forward(self, x):\n", 91 | " for i in range(len(self.units) - 1):\n", 92 | " x = self.batch_norm[i](x)\n", 93 | " if i != 0:\n", 94 | " x = self.activation(x)\n", 95 | " x = self.dropout[i](x)\n", 96 | " x = self.dense[i](x)\n", 97 | " # no sigmoid\n", 98 | " \n", 99 | " return x" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "model_list = []\n", 109 | 
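"# note: weights saved from a CUDA run may need torch.load(model_weights, map_location='cpu') on a CPU-only kernel\n",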
"for _fold in range(NFOLDS):\n", 110 | " model = MLPModel()\n", 111 | " model_weights = f\"{CACHE_PATH}/online_model{_fold}.pth\"\n", 112 | " model.load_state_dict(torch.load(model_weights))\n", 113 | " model.eval()\n", 114 | " model_list.append(model)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "15219it [01:58, 128.15it/s]\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "env = janestreet.make_env()\n", 132 | "env_iter = env.iter_test()\n", 133 | "\n", 134 | "device = torch.device(\"cpu\")\n", 135 | "\n", 136 | "for (test_df, pred_df) in tqdm(env_iter):\n", 137 | " if test_df['weight'].item() > 0:\n", 138 | " x_tt = test_df.loc[:, feat_cols].values\n", 139 | " if np.isnan(x_tt.sum()):\n", 140 | " x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt) * f_mean\n", 141 | "\n", 142 | " cross_41_42_43 = x_tt[:, 41] + x_tt[:, 42] + x_tt[:, 43]\n", 143 | " cross_1_2 = x_tt[:, 1] / (x_tt[:, 2] + 1e-5)\n", 144 | " feature_inp = np.concatenate((\n", 145 | " x_tt,\n", 146 | " np.array(cross_41_42_43).reshape(x_tt.shape[0], 1),\n", 147 | " np.array(cross_1_2).reshape(x_tt.shape[0], 1),\n", 148 | " ), axis=1)\n", 149 | "\n", 150 | " # torch_pred\n", 151 | " torch_pred = np.zeros((1, len(target_cols)))\n", 152 | " for model in model_list:\n", 153 | " torch_pred += model(torch.tensor(feature_inp, dtype=torch.float).to(device)).sigmoid().detach().cpu().numpy() / NFOLDS\n", 154 | " torch_pred = np.median(torch_pred)\n", 155 | "\n", 156 | " # tf_pred\n", 157 | " #tf_pred = np.median(np.mean([model(x_tt, training = False).numpy() for model in tf_models],axis=0))\n", 158 | "\n", 159 | " # avg\n", 160 | " #pred = torch_pred * 0.5 + tf_pred * 0.5\n", 161 | " pred = torch_pred\n", 162 | "\n", 163 | " pred_df.action = np.where(pred >= 0.5, 1, 0).astype(int)\n", 164 | " else:\n", 165 | " pred_df.action = 0\n", 166 | " env.predict(pred_df)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.7.9" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 4 198 | } 199 | -------------------------------------------------------------------------------- /mlp/run_train_final_1.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | ''' 20 | Training script (including volatile days): 21 | 1. data: including the volatile day but excluding the outlier days (2, 294, 36, 270) 22 | 2. 
data: the fillna is using the past day mean (after excluding the days above) 23 | 3. training: finetuning using resp colums as regularizer 24 | ''' 25 | 26 | DEBUG = False 27 | TRAINING_START = 0 28 | FINETUNE_BATCH_SIZE = 2048_00 29 | BATCH_SIZE = 8196 30 | EPOCHS = 120 31 | LEARNING_RATE = 1e-4 32 | WEIGHT_DECAY = 1e-5 33 | EARLYSTOP_NUM = 20 34 | NFOLDS = 1 35 | SCALING = 12 36 | THRESHOLD = 0.5 37 | CV_THRESH = 6000 38 | DAYS_TO_DROP = [2, 36, 270, 294] 39 | # VOLATILE_DAYS = [1, 3, 4, 5, 8, 9, 12, 16, 17, 18, 23, 24, 26, 27, 30, 31, 32, 37, 38, 40 | # 41, 43, 44, 45, 46, 47, 59, 63, 69, 80, 85, 161, 168, 185, 196, 223, 231, 235, 41 | # 262, 274, 276, 283, 324, 346, 353, 354, 356, 379, 380, 382, 393, 394, 427, 438, 42 | # 452, 454, 459, 462, 468, 475, 488, 489, 491, 492, 495] 43 | 44 | SEED = 1127802 45 | get_seed(SEED) 46 | 47 | f = median_avg 48 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 49 | 50 | # %% 51 | with timer("Preprocessing train"): 52 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 53 | train, valid = preprocess_final(train_parquet, day_start=TRAINING_START, 54 | training_days=range(0,475), valid_days=range(475, 500), 55 | drop_days=DAYS_TO_DROP, 56 | drop_zero_weight=True, denoised_resp=False) 57 | 58 | resp_cols = ['resp_3','resp', 'resp_4'] 59 | resp_cols_all = resp_cols 60 | target_cols = ['action_3', 'action', 'action_4'] 61 | feat_cols = [f'feature_{i}' for i in range(130)] 62 | 63 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 64 | 65 | ###### adding weight to the features ####### 66 | # feat_cols.extend(['weight']) 67 | # %% 68 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 69 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 70 | 71 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 72 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 73 | 74 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 75 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 76 | model.to(device) 77 | summary(model, input_size=(len(feat_cols), )) 78 | # %% 79 | util_cols = resp_cols 80 | resp_index = [resp_cols_all.index(r) for r in util_cols] 81 | 82 | regularizer = UtilityLoss(alpha=1e-1, scaling=SCALING, normalize=None, resp_index=resp_index) 83 | 84 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 85 | 86 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 87 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 88 | T_0=10, T_mult=1, 89 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 90 | 91 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=10) 92 | 93 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-4) 94 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=CV_THRESH) 95 | # %% 96 | _fold = 0 97 | SEED = 1127802 98 | get_seed(SEED+SEED*_fold) 99 | 100 | for epoch in range(EPOCHS): 101 | 102 | # train_loss = train_epoch(model, optimizer, None, loss_fn, train_loader, device) 103 | train_loss = train_epoch_weighted(model, optimizer, None, loss_fn, train_loader, device) 104 | scheduler.step() 105 | lr = optimizer.param_groups[0]['lr'] 106 | if (epoch+1) % 2 == 0: 107 | _ = train_epoch_finetune(model, 
finetune_optimizer, scheduler, 108 | regularizer, finetune_loader, device, loss_fn=loss_fn) 109 | 110 | valid_pred = valid_epoch(model, valid_loader, device) 111 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 112 | f=median_avg, threshold=0.5, target_cols=target_cols) 113 | 114 | model_file = MODEL_DIR + f"/final_volatile_{_fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 115 | early_stop(epoch, valid_auc, model, 116 | model_path=model_file, 117 | epoch_utility_score=valid_score) 118 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 119 | tqdm.write(f"Train loss: {train_loss:.4e} \t Current learning rate: {lr:.4e}") 120 | tqdm.write(f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 121 | tqdm.write(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 122 | if early_stop.early_stop: 123 | print("\nEarly stopping") 124 | break 125 | 126 | if DEBUG: 127 | torch.save(model.state_dict(), MODEL_DIR + f"/model_{_fold}.pth") 128 | # %% 129 | _fold = 4 130 | model_file = f"resw_interleave_1_util_6455_auc_0.6237.pth" 131 | print(f"Loading {model_file} for cv check.\n") 132 | model_weights = os.path.join(MODEL_DIR, model_file) 133 | 134 | model.to(device) 135 | feat_cols = [f'feature_{i}' for i in range(130)] 136 | feat_cols.extend(['weight']) 137 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 138 | 139 | 140 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, 141 | output_size=len(target_cols)) 142 | model.to(device) 143 | try: 144 | model.load_state_dict(torch.load(model_weights)) 145 | except: 146 | model.load_state_dict(torch.load( 147 | model_weights, map_location=torch.device('cpu'))) 148 | model.eval(); 149 | 150 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 151 | train = preprocess_pt(train_parquet, day_start=0, day_split=None, drop_zero_weight=False) 152 | 153 | CV_START_DAY = 100 154 | CV_DAYS = 25 155 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 156 | batch_size =2*8192, f=median_avg, threshold=0.5, 157 | target_cols=target_cols, 158 | feat_cols=feat_cols, 159 | resp_cols=resp_cols) 160 | # %% 161 | -------------------------------------------------------------------------------- /mlp/run_train_base.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | from torchsummary import summary 4 | 5 | current_path = os.path.dirname(os.path.abspath(__file__)) 6 | HOME = os.path.dirname(current_path) 7 | MODEL_DIR = os.path.join(HOME, 'models') 8 | DATA_DIR = os.path.join(HOME, 'data') 9 | sys.path.append(HOME) 10 | from utils import * 11 | from mlp import * 12 | # %% 13 | BATCH_SIZE = 4096 14 | EPOCHS = 200 15 | LEARNING_RATE = 1e-4 16 | WEIGHT_DECAY = 1e-5 17 | EARLYSTOP_NUM = 5 18 | NFOLDS = 1 19 | SCALING = 1000 20 | THRESHOLD = 0.5 21 | SEED = 802 22 | get_seed(SEED) 23 | # f = np.median 24 | # f = np.mean 25 | f = median_avg 26 | 27 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 28 | #%% 29 | with timer("Loading train parquet"): 30 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 31 | train = pd.read_parquet(train_parquet) 32 | 33 | train = train.loc[train.date > 85].reset_index(drop=True) 34 | weight_mean = train.loc[train.weight > 0].mean() 35 | #%% 36 | # vanilla actions based on resp 37 | train['action_0'] = (train['resp'] > 0).astype('int') 38 | for c in range(1,5): 39 | train['action_'+str(c)] = (train['resp_'+str(c)] > 
0).astype('int') 40 | print(f'action based on resp_{c} mean: ' ,' '*10, train['action_'+str(c)].astype(int).mean()) 41 | 42 | # sum 43 | train['resp_all'] = train['resp'].copy() 44 | for c in range(1,5): 45 | train['resp_all'] += train['resp_'+str(c)] 46 | train['action'] = (train['resp_all'] > 0).astype('int') 47 | print('All actions mean: ', ' '*10, train['action'].astype(int).mean()) 48 | 49 | for c in range(1,5): 50 | train['action_0'+str(c)] = (train['resp'] + train['resp_'+str(c)] > 0) 51 | print(f'action based on resp and resp_{c} mean: ', train['action_0'+str(c)].astype(int).mean()) 52 | 53 | for i in range(1,5): 54 | for j in range(i+1,5): 55 | train['action_'+str(i)+str(j)] = (train['resp_'+str(i)] + train['resp_'+str(j)] > 0) 56 | print(f'action based on resp_{i} and resp_{j} mean: ', train['action_'+str(i)+str(j)].astype(int).mean()) 57 | 58 | #%% 59 | feat_cols = [f'feature_{i}' for i in range(130)] 60 | # feat_cols = [c for c in train.columns if 'feature' in c] 61 | f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 62 | train.fillna(train.mean(),inplace=True) 63 | 64 | valid = train.loc[train.date >= 450].reset_index(drop=True) 65 | train = train.loc[train.date <= 425].reset_index(drop=True) 66 | #%% 67 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 68 | weight_resp_cols = ['resp_w', 'resp_w_1', 'resp_w_2', 'resp_w_3', 'resp_w_4'] 69 | target_cols = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4'] 70 | # target_cols_all = target_cols 71 | target_cols_all = ['action', 72 | 'action_0', 'action_1', 'action_2', 'action_3', 'action_4', 73 | 'action_01', 'action_02', 'action_03', 'action_04', 74 | 'action_12', 'action_13', 'action_14', 'action_23', 'action_24', 'action_34'] 75 | 76 | target_cols_ex = target_cols + resp_cols + weight_resp_cols 77 | 78 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 79 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5) 80 | valid['cross_41_42_43'] = valid['feature_41'] + valid['feature_42'] + valid['feature_43'] 81 | valid['cross_1_2'] = valid['feature_1'] / (valid['feature_2'] + 1e-5) 82 | 83 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 84 | 85 | 86 | # %% 87 | train_set = MarketDataset(train, features=feat_cols, targets=target_cols_all) 88 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 89 | 90 | valid_set = MarketDataset(valid, features=feat_cols, targets=target_cols_all) 91 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 92 | # %% 93 | model = ResidualMLP(output_size=len(target_cols_all)) 94 | model.to(device) 95 | summary(model, input_size=(len(feat_cols), )) 96 | 97 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 98 | optimizer = Lookahead(optimizer=optimizer, k=10, alpha=0.5) 99 | scheduler = None 100 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 101 | # max_lr=1e-2, epochs=EPOCHS, 102 | # steps_per_epoch=len(train_loader)) 103 | loss_fn = SmoothBCEwLogits(smoothing=0.01) 104 | 105 | es = EarlyStopping(patience=EARLYSTOP_NUM, mode="max") 106 | 107 | # %% 108 | 109 | with tqdm(total=EPOCHS) as pbar: 110 | for epoch in range(EPOCHS): 111 | 112 | start_time = time() 113 | train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 114 | 115 | valid_pred = valid_epoch(model, valid_loader, device) 116 | valid_auc = 
roc_auc_score(valid[target_cols_all].values.astype(float).reshape(-1), valid_pred) 117 | valid_logloss = log_loss(valid[target_cols_all].values.astype(float).reshape(-1), valid_pred) 118 | valid_pred = valid_pred.reshape(-1, len(target_cols_all)) 119 | # valid_pred = f(valid_pred[...,:len(target_cols)], axis=-1) # only do first 5 120 | valid_pred = f(valid_pred, axis=-1) # all 121 | valid_pred = np.where(valid_pred >= THRESHOLD, 1, 0).astype(int) 122 | valid_score = utility_score_bincount(date=valid.date.values, 123 | weight=valid.weight.values, 124 | resp=valid.resp.values, 125 | action=valid_pred) 126 | model_file = MODEL_DIR+f"/resmlp_seed_{SEED}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 127 | es(valid_auc, model, model_path=model_file, epoch_utility_score=valid_score) 128 | 129 | pbar.set_description(f"EPOCH:{epoch:2d} tr_loss:{train_loss:.2f} " 130 | f"val_utitlity:{valid_score:.2f} valid_auc:{valid_auc:.4f} " 131 | f"epoch time: {time() - start_time:.1f}sec " 132 | f"early stop counter: {es.counter}") 133 | 134 | if es.early_stop: 135 | print("\nEarly stopping") 136 | break 137 | pbar.update() 138 | #%% 139 | if True: 140 | valid_pred = np.zeros((len(valid), len(target_cols_all))) 141 | for _fold in range(NFOLDS): 142 | torch.cuda.empty_cache() 143 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 144 | model = ResidualMLP(output_size=len(target_cols_all)) 145 | model.to(device) 146 | model_file = MODEL_DIR + '/resmlp_seed_802_util_2413_auc_0.5475.pth' 147 | # model_file = MODEL_DIR+f"/resmlp_seed_{SEED}_util_2217_auc_0.5526.pth" 148 | # model_file = MODEL_DIR + '/resmlp_seed_802_util_2229.pth' 149 | model.load_state_dict(torch.load(model_file)) 150 | valid_pred_fold = valid_epoch(model, valid_loader, device).reshape(-1, len(target_cols_all)) 151 | valid_pred += valid_pred_fold / NFOLDS 152 | valid_auc = roc_auc_score(valid[target_cols_all].values.astype(float), valid_pred) 153 | logloss_score = log_loss(valid[target_cols_all].values.astype(float), valid_pred) 154 | 155 | # valid_pred = f(valid_pred[...,:len(target_cols)], axis=-1) # only first 5 156 | valid_pred = f(valid_pred, axis=-1) # all 157 | valid_pred = np.where(valid_pred >= THRESHOLD, 1, 0).astype(int) 158 | valid_score = utility_score_bincount(date=valid.date.values, 159 | weight=valid.weight.values, 160 | resp=valid.resp.values, 161 | action=valid_pred) 162 | valid_score_max = utility_score_bincount(date=valid.date.values, 163 | weight=valid.weight.values, 164 | resp=valid.resp.values, 165 | action=(valid.resp.values>0)) 166 | print(f'{NFOLDS} models valid score: {valid_score:.2f}') 167 | print(f'Max possible valid score: {valid_score_max:.2f}') 168 | print(f'auc_score: {valid_auc:.4f} \t logloss_score: {logloss_score:.4f}') 169 | # %% 170 | -------------------------------------------------------------------------------- /mlp/run_train_denoise.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | 20 | ''' 21 | Training script finetuning using resp colums as 
regularizer with an additional denoised target 22 | ''' 23 | 24 | DEBUG = False 25 | LOAD_PRETRAIN = False 26 | TRAINING_START = 86 # 86 by default 27 | 28 | FINETUNE_BATCH_SIZE = 2048_00 29 | BATCH_SIZE = 8196 30 | EPOCHS = 120 31 | EARLYSTOP_NUM = 10 32 | 33 | LEARNING_RATE = 1e-4 34 | WEIGHT_DECAY = 1e-5 35 | SCALING = 10 36 | THRESHOLD = 0.5 37 | NUM_DENOISE = 1 38 | DAYS_TO_DROP = [2, 36, 270, 294] 39 | SEED = 1127802 40 | get_seed(SEED) 41 | 42 | # f = np.median 43 | # f = np.mean 44 | f = median_avg 45 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 46 | 47 | 48 | feat_cols = [f'feature_{i}' for i in range(130)] 49 | # f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 50 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 51 | 52 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 53 | resp_cols_all = resp_cols 54 | target_cols = ['action','action_1', 'action_2', 'action_3', 'action_4'] 55 | 56 | for c in range(NUM_DENOISE): 57 | resp_cols += [f'resp_dn_{c}'] 58 | target_cols += [f'action_dn_{c}'] 59 | 60 | # util_cols = ['resp', 'resp_1', 'resp_2'] 61 | # util_cols = ['resp', 'resp_4'] 62 | # util_cols = ['resp'] 63 | util_cols = resp_cols 64 | 65 | resp_index = [resp_cols_all.index(r) for r in util_cols] 66 | 67 | 68 | # %% 69 | with timer("Preprocessing train"): 70 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 71 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 72 | train, valid = preprocess_pt(train_parquet, 73 | day_start=TRAINING_START, 74 | drop_days=DAYS_TO_DROP, 75 | drop_zero_weight=True, 76 | zero_weight_thresh=None, 77 | denoised_resp=True, 78 | num_dn_target=NUM_DENOISE) 79 | 80 | print(f'action based on resp mean: ', train['action'].astype(int).mean()) 81 | print(f'action based on resp_dn_0 mean:', train[f'action_dn_0'].astype(int).mean()) 82 | 83 | for c in range(1, 5): 84 | print(f'action based on resp_{c} mean: ', train['action_'+str(c)].astype(int).mean()) 85 | 86 | 87 | # %% 88 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 89 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=10) 90 | 91 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 92 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=10) 93 | 94 | 95 | # %% 96 | 97 | # regularizer = RespMSELoss(alpha=1e-1, scaling=1, resp_index=resp_index) 98 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, normalize=None, resp_index=resp_index) 99 | 100 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 101 | 102 | # all_train = pd.concat([train, valid], axis=0) 103 | # all_train_set = ExtendedMarketDataset(all_train, features=feat_cols, targets=target_cols, resp=resp_cols) 104 | # train_loader = DataLoader(all_train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 105 | 106 | 107 | model = ResidualMLP(hidden_size=128, output_size=len(target_cols)) 108 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 109 | model.to(device) 110 | summary(model, input_size=(len(feat_cols), )) 111 | 112 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 113 | # optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 114 | # optimizer = Lookahead(optimizer=optimizer, alpha=1e-1) 115 | # scheduler = None 116 | 117 | # scheduler = 
torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 118 | # steps_per_epoch=len(train_loader), 119 | # epochs=EPOCHS) 120 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 121 | T_0=10, T_mult=2, 122 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 123 | 124 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 125 | 126 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 127 | 128 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=1200) 129 | 130 | # %% 131 | if LOAD_PRETRAIN: 132 | print("Loading model for finetune.") 133 | _fold = 0 134 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 135 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_ft_old_fold_{_fold}.pth") 136 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_finetune_fold_{_fold}.pth") 137 | try: 138 | model.load_state_dict(torch.load(model_weights)) 139 | except: 140 | model.load_state_dict(torch.load( 141 | model_weights, map_location=torch.device('cpu'))) 142 | model.eval() 143 | valid_pred = valid_epoch(model, valid_loader, device) 144 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 145 | f=median_avg, threshold=0.5, target_cols=target_cols) 146 | 147 | print(f"valid_utility:{valid_score:.2f} \t valid_auc:{valid_auc:.4f}") 148 | # %% 149 | _fold = 6 150 | SEED = 1127802 151 | get_seed(SEED+SEED*_fold) 152 | lr = [] 153 | 154 | for epoch in range(EPOCHS): 155 | 156 | # train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 157 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 158 | lr.append(optimizer.param_groups[0]['lr']) 159 | if (epoch+1) % 10 == 0: 160 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 161 | regularizer, finetune_loader, device, loss_fn=loss_fn) 162 | 163 | valid_pred = valid_epoch(model, valid_loader, device) 164 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 165 | f=median_avg, threshold=0.5, target_cols=target_cols) 166 | model_file = MODEL_DIR + \ 167 | f"/dn_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 168 | early_stop(epoch, valid_auc, model, model_path=model_file, 169 | epoch_utility_score=valid_score) 170 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 171 | tqdm.write( 172 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 173 | tqdm.write( 174 | f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 175 | tqdm.write( 176 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 177 | if early_stop.early_stop: 178 | print("\nEarly stopping") 179 | break 180 | 181 | if DEBUG: 182 | torch.save(model.state_dict(), MODEL_DIR + f"/resmlp_interleave_fold_{_fold}.pth") 183 | 184 | # %% 185 | model_file = f"resmlp_interleave_0_util_7437_auc_0.6389.pth" 186 | print(f"Loading {model_file} for cv check.") 187 | model_weights = os.path.join(MODEL_DIR, model_file) 188 | 189 | try: 190 | model.load_state_dict(torch.load(model_weights)) 191 | except: 192 | model.load_state_dict(torch.load( 193 | model_weights, map_location=torch.device('cpu'))) 194 | model.eval(); 195 | 196 | # %% 197 | CV_START_DAY = 100 198 | CV_DAYS = 25 199 | all_train = pd.concat([train, valid], axis=0) 200 | print_all_valid_score(all_train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 201 | batch_size = 8192, 
f=median_avg, threshold=0.5, 202 | target_cols=target_cols, feat_cols=feat_cols,resp_cols=resp_cols) 203 | # %% 204 | -------------------------------------------------------------------------------- /mlp/debug_ae_tf.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.losses import BinaryCrossentropy 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.callbacks import EarlyStopping 7 | from tensorflow.keras.layers.experimental.preprocessing import Normalization 8 | from tensorflow.keras.metrics import AUC 9 | import tensorflow as tf 10 | import kerastuner as kt 11 | import numpy as np 12 | import pandas as pd 13 | import pickle 14 | from sklearn.model_selection import GroupKFold 15 | 16 | from tqdm import tqdm 17 | from random import choices 18 | 19 | import os, sys 20 | 21 | HOME = os.path.abspath(os.path.join('.', os.pardir)) 22 | MODEL_DIR = os.path.join(HOME, 'models') 23 | DATA_DIR = os.path.join(HOME, 'data') 24 | sys.path.append(HOME) 25 | from utils import * 26 | from utils_js import * 27 | #%% 28 | TRAINING = True 29 | TRAINING_AE = True 30 | HP_SEARCH = True 31 | GPU = True 32 | USE_FINETUNE = True 33 | FOLDS = 5 34 | SEED = 1127 35 | 36 | if GPU: 37 | gpus = tf.config.experimental.list_physical_devices(device_type="GPU") 38 | tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU") 39 | tf.config.experimental.set_memory_growth(device=gpus[0], enable=True) 40 | 41 | # %% loading data 42 | with timer("Loading train parquet"): 43 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 44 | train = pd.read_parquet(train_parquet) 45 | print(train.info()) 46 | 47 | # %% 48 | with timer("preprocess train"): 49 | train = preprocess(train) 50 | 51 | #%% 52 | features = [c for c in train.columns if 'feature' in c] 53 | 54 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4'] 55 | 56 | X = train[features].values 57 | y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget 58 | 59 | f_mean = np.mean(train[features[1:]].values,axis=0) 60 | # %% AE 61 | 62 | def create_autoencoder(input_dim, output_dim, noise=0.05, dropout=0.15): 63 | i = Input(input_dim) 64 | encoded = BatchNormalization()(i) 65 | encoded = GaussianNoise(noise)(encoded) 66 | encoded = Dense(64,activation='relu')(encoded) 67 | decoded = Dropout(dropout)(encoded) 68 | decoded = BatchNormalization()(decoded) 69 | decoded = Dense(input_dim,name='decoded')(decoded) 70 | x = Dense(32,activation='relu')(decoded) 71 | x = BatchNormalization()(x) 72 | x = Dropout(dropout)(x) 73 | x = Dense(32,activation='relu')(x) 74 | x = BatchNormalization()(x) 75 | x = Dropout(dropout)(x) 76 | x = Dense(output_dim, activation='sigmoid', name='label_output')(x) 77 | 78 | encoder = Model(inputs=i,outputs=encoded) 79 | autoencoder = Model(inputs=i,outputs=[decoded,x]) 80 | 81 | autoencoder.compile(optimizer=Adam(0.001), 82 | loss={'decoded':'mse', 83 | 'label_output':'binary_crossentropy'}) 84 | return autoencoder, encoder 85 | 86 | def create_model(hp,input_dim,output_dim,encoder): 87 | inputs = Input(input_dim) 88 | 89 | x = encoder(inputs) 90 | x = Concatenate()([x,inputs, x]) #use both raw and encoded features 91 | x = BatchNormalization()(x) 92 | x = Dropout(hp.Float('init_dropout',0.0,0.5))(x) 93 | 94 | for i in 
range(hp.Int('num_layers',1,5)): 95 | x = Dense(hp.Int(f'num_units_{i}',64,256))(x) 96 | x = BatchNormalization()(x) 97 | x = Lambda(tf.keras.activations.swish)(x) 98 | x = Dropout(hp.Float(f'dropout_{i}',0.0,0.5))(x) 99 | x = Dense(output_dim,activation='sigmoid')(x) 100 | model = Model(inputs=inputs,outputs=x) 101 | model.compile(optimizer=Adam(hp.Float('lr',0.00001,0.1, 102 | default=0.001)), 103 | loss=BinaryCrossentropy(label_smoothing=hp.Float('label_smoothing',0.0,0.1)), 104 | metrics=[AUC(name = 'auc')]) 105 | return model 106 | # %% 107 | autoencoder, encoder = create_autoencoder(X.shape[-1],y.shape[-1],noise=0.1) 108 | if TRAINING_AE: 109 | autoencoder.fit(X, (X,y), 110 | epochs=1000, 111 | batch_size=4096*2, 112 | validation_split=0.1, 113 | callbacks=[EarlyStopping('val_loss', 114 | patience=10, 115 | restore_best_weights=True)]) 116 | encoder.save_weights(MODEL_DIR+'/encoder.hdf5') 117 | else: 118 | encoder.load_weights(MODEL_DIR+'/encoder.hdf5') 119 | 120 | encoder.trainable = True 121 | 122 | #%% 123 | 124 | class CVTuner(kt.engine.tuner.Tuner): 125 | def run_trial(self, trial, X, y, splits, batch_size=32, verbose=2, epochs=1, callbacks=None): 126 | val_losses = [] 127 | for idx_tr, idx_val in splits: 128 | X_train, X_val = [x[idx_tr] for x in X], [x[idx_val] for x in X] 129 | y_train, y_val = [a[idx_tr] for a in y], [a[idx_val] for a in y] 130 | if len(X_train) < 2: 131 | X_train = X_train[0] 132 | X_val = X_val[0] 133 | if len(y_train) < 2: 134 | y_train = y_train[0] 135 | y_val = y_val[0] 136 | 137 | model = self.hypermodel.build(trial.hyperparameters) 138 | hist = model.fit(X_train,y_train, 139 | validation_data=(X_val,y_val), 140 | epochs=epochs, 141 | batch_size=batch_size, 142 | callbacks=callbacks, 143 | verbose=verbose) 144 | 145 | val_losses.append([hist.history[k][-1] for k in hist.history]) 146 | 147 | val_losses = np.asarray(val_losses) 148 | self.oracle.update_trial(trial.trial_id, 149 | {k:np.mean(val_losses[:,i]) for i,k in enumerate(hist.history.keys())}) 150 | self.save_model(trial.trial_id, model) 151 | 152 | model_fn = lambda hp: create_model(hp,X.shape[-1],y.shape[-1], encoder) 153 | 154 | tuner = CVTuner( 155 | hypermodel=model_fn, 156 | directory=f'ae_mlp_{SEED}', 157 | oracle=kt.oracles.BayesianOptimization( 158 | objective= kt.Objective('val_auc', direction='max'), 159 | num_initial_points=10, 160 | max_trials=50)) 161 | 162 | gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=5) 163 | splits = list(gkf.split(y, groups=train['date'].values)) 164 | #%% 165 | if HP_SEARCH: 166 | tuner.search((X,),(y,), 167 | splits=splits, 168 | batch_size=8192, 169 | epochs=50, 170 | verbose=2, 171 | callbacks=[EarlyStopping('val_auc', 172 | mode='max', 173 | patience=5)]) 174 | hp = tuner.get_best_hyperparameters(1)[0] 175 | 176 | with open(MODEL_DIR+f'/best_hp_{SEED}.pkl', 'wb') as f: 177 | pickle.dump(hp, f, protocol=pickle.HIGHEST_PROTOCOL) 178 | tuner.results_summary() 179 | #%% 180 | if TRAINING: 181 | with open(MODEL_DIR+f'/best_hp_{SEED}.pkl', 'rb') as f: 182 | hp = pickle.load(f) 183 | 184 | for fold, (idx_tr, idx_val) in enumerate(splits): 185 | model = model_fn(hp) 186 | X_train, X_val = X[idx_tr], X[idx_val] 187 | y_train, y_val = y[idx_tr], y[idx_val] 188 | model.fit(X_train, 189 | y_train, 190 | validation_data=(X_val,y_val), 191 | epochs=100, 192 | batch_size=8192, 193 | callbacks=[EarlyStopping('val_auc', 194 | mode='max', 195 | patience=10, 196 | restore_best_weights=True)]) 197 | model.save_weights(MODEL_DIR + f'/model_{SEED}_{fold}.hdf5') 
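# finetune pass (descriptive note): after saving the fold weights above, the model is recompiled with a 100x smaller learning rate, fit for a few epochs on this fold's validation slice, and saved as the `_finetune` checkpoint that is loaded when USE_FINETUNE is set.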
198 | model.compile(Adam(hp.get('lr')/100),loss='binary_crossentropy') 199 | 200 | model.fit(X_val, y_val, epochs=3, batch_size=8192) 201 | model.save_weights(MODEL_DIR+f'/model_{SEED}_{fold}_finetune.hdf5') 202 | 203 | else: 204 | models = [] 205 | hp = pd.read_pickle(MODEL_DIR+f'/best_hp_{SEED}.pkl') 206 | for f in range(FOLDS): 207 | model = model_fn(hp) 208 | if USE_FINETUNE: 209 | model.load_weights(MODEL_DIR+f'/model_{SEED}_{f}_finetune.hdf5') 210 | else: 211 | model.load_weights(MODEL_DIR+f'/model_{SEED}_{f}.hdf5') 212 | models.append(model) 213 | # %% 214 | -------------------------------------------------------------------------------- /iter_cv.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.metrics import roc_auc_score 8 | import torch 9 | import tensorflow as tf 10 | from numba import njit 11 | import random 12 | import datetime 13 | 14 | HOME = os.path.dirname(os.path.abspath(__file__)) 15 | MODEL_DIR = HOME+'/models/' 16 | DATA_DIR = HOME+'/data/' 17 | from utils import * 18 | from utils_js import * 19 | # from nn.mlp import * 20 | 21 | DEBUG = False 22 | SEED = 1111 23 | START_SIMU_TEST = 490 # this day to 499 as simulated test days 24 | END_SIMU_TEST = 499 25 | TQDM_INT = 20 26 | batch_size = 5000 27 | label_smoothing = 1e-2 28 | learning_rate = 1e-3 29 | 30 | GPU = False 31 | 32 | if GPU: 33 | gpus = tf.config.experimental.list_physical_devices(device_type="GPU") 34 | tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU") 35 | tf.config.experimental.set_memory_growth(device=gpus[0], enable=True) 36 | else: 37 | cpus = tf.config.experimental.list_physical_devices(device_type='CPU') 38 | tf.config.experimental.set_visible_devices(devices= cpus, device_type='CPU') 39 | 40 | 41 | #%% 42 | ''' 43 | The mock test set is taken after the Purged Time series CV split last fold's test set: 44 | i.e., START_SIMU_TEST date needs to be > 382 45 | 46 | Reference: 47 | https://www.kaggle.com/jorijnsmit/found-the-holy-grail-grouptimeseriessplit 48 | https://www.kaggle.com/tomwarrens/purgedgrouptimeseriessplit-stacking-ensemble-mode 49 | ''' 50 | 51 | with timer("Loading train parquet"): 52 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 53 | train = pd.read_parquet(train_parquet) 54 | # print(train.info()) 55 | 56 | train['action'] = (train['resp'] > 0) 57 | for c in range(1,5): 58 | train['action'] = train['action'] & ((train['resp_'+str(c)] > 0)) 59 | features = [c for c in train.columns if 'feature' in c] 60 | 61 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 62 | 63 | # X = train[features].values 64 | # y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget 65 | 66 | f_mean = np.mean(train[features[1:]].values, axis=0) 67 | 68 | simu_test = train.query(f'date > {START_SIMU_TEST} & date <= {END_SIMU_TEST}').reset_index(drop = True) 69 | print(f"Simulated public test file length: {len(simu_test)}") 70 | 71 | #%% 72 | class Iter_Valid(object): 73 | 74 | global predicted 75 | predicted = [] 76 | 77 | def __init__(self, df, features, batch_size = 1): 78 | df = df.reset_index(drop=True) 79 | self.columns = ['weight'] + features + ['date'] 80 | self.df = df[self.columns] 81 | self.weight = df['weight'].astype(float).values 82 | self.action = df['action'].astype(int).values 83 | self.pred_df = df[['action']] 84 | # self.pred_df[['action']] = 0 85 | self.len = 
len(df) 86 | self.current = 0 87 | self.batch_size = batch_size 88 | 89 | def __iter__(self): 90 | return self 91 | 92 | def __next__(self): 93 | pre_start = self.current 94 | self.current += self.batch_size 95 | if self.current <= self.len: 96 | df = self.df[pre_start:self.current].copy() 97 | pred_df = self.pred_df[pre_start:self.current].copy() 98 | return df, pred_df 99 | elif self.current > self.len and (self.current - self.len < self.batch_size): 100 | df = self.df[pre_start:self.len].copy() 101 | pred_df = self.pred_df[pre_start:self.len].copy() 102 | return df, pred_df 103 | else: 104 | raise StopIteration() 105 | 106 | def predict(self,pred_df): 107 | predicted.append(pred_df) 108 | # %% seed 1111 overfit model 109 | hidden_units = [150, 150, 150] 110 | dropout_rates = [0.2, 0.2, 0.2, 0.2] 111 | 112 | def create_mlp_tf( 113 | num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate 114 | ): 115 | 116 | inp = tf.keras.layers.Input(shape=(num_columns,)) 117 | x = tf.keras.layers.BatchNormalization()(inp) 118 | x = tf.keras.layers.Dropout(dropout_rates[0])(x) 119 | for i in range(len(hidden_units)): 120 | x = tf.keras.layers.Dense(hidden_units[i])(x) 121 | x = tf.keras.layers.BatchNormalization()(x) 122 | x = tf.keras.layers.Activation(tf.keras.activations.swish)(x) 123 | x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x) 124 | 125 | x = tf.keras.layers.Dense(num_labels)(x) 126 | out = tf.keras.layers.Activation("sigmoid")(x) 127 | 128 | model = tf.keras.models.Model(inputs=inp, outputs=out) 129 | model.compile( 130 | optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), 131 | loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing), 132 | metrics=tf.keras.metrics.AUC(name="AUC"), 133 | ) 134 | return model 135 | 136 | 137 | model = create_mlp_tf(num_columns=len(features), 138 | num_labels=5, 139 | hidden_units=hidden_units, 140 | dropout_rates=dropout_rates, 141 | label_smoothing=label_smoothing, 142 | learning_rate=learning_rate) 143 | 144 | model.load_weights(os.path.join(MODEL_DIR,f'model_{SEED}.hdf5')) 145 | model.summary() 146 | models = [] 147 | models.append(model) 148 | 149 | #%% 10k pytorch model 150 | 151 | #%% 152 | if DEBUG: 153 | ''' 154 | Old testing code here: using class is much faster than iterrows() of pandas 155 | ''' 156 | test_columns = ['weight'] + features + ['date'] 157 | predicted = [] 158 | def set_predict(df): 159 | predicted.append(df) 160 | 161 | test_len = 1_000 162 | start = time() 163 | with tqdm(total=test_len) as pbar: 164 | for idx, row in simu_test.iterrows(): 165 | row = pd.DataFrame(row.values.reshape(1,-1), columns=list(row.index)) 166 | test_df = row[test_columns].astype(float) 167 | pred_df = row[['action']].astype(int) 168 | pred_df.action = (random.random() > 0.7) 169 | set_predict(pred_df) 170 | 171 | time_taken = time() - start 172 | total_time_est = time_taken / (idx+1) * 1000000 / 60 173 | pbar.set_description(f"Current speed = {total_time_est:.2f} minutes to complete inference") 174 | pbar.update(1) 175 | 176 | if idx >= test_len: 177 | break 178 | 179 | 180 | # %% 181 | 182 | if __name__ == '__main__': 183 | ''' 184 | inference simulation 185 | Using a customized class 186 | 187 | 188 | For the seed = 1111 overfit model for day 490-499: 189 | np.mean: 815.71 190 | np.median: 893.32 191 | avg median: 838.97 192 | thresh 0.51 + np.median: 824.71 193 | thresh 0.501 + np.median: 878.82 194 | thresh 0.498 + np.median: 902.64 195 | thresh 0.499 + np.median: 893.70 196 | thresh 0.4985 
+ np.median: 908.28 197 | 198 | 199 | ''' 200 | date = simu_test['date'].values 201 | weight = simu_test['weight'].values 202 | resp = simu_test['resp'].values 203 | action = simu_test['action'].values 204 | 205 | # f = np.mean # 206 | f = np.median 207 | # f = median_avg 208 | 209 | THRESHOLD = 0.4985 210 | 211 | iter_test = Iter_Valid(simu_test, features, batch_size=1) 212 | start = time() 213 | 214 | pbar = tqdm(total=len(simu_test)) 215 | for idx, (test_df, pred_df) in enumerate(iter_test): 216 | 217 | if test_df['weight'].item() > 0: 218 | x_tt = test_df.loc[:, features].values 219 | if np.isnan(x_tt[:, 1:].sum()): 220 | x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean 221 | pred = np.mean([model(x_tt, training = False).numpy() for model in models],axis=0) 222 | pred = f(pred.squeeze()) 223 | pred_df.action = np.where(pred >= THRESHOLD, 1, 0).astype(int) 224 | else: 225 | pred_df.action = 0 226 | 227 | iter_test.predict(pred_df) 228 | 229 | time_taken = time() - start 230 | total_time_est = time_taken / (idx+1) * 1000000 / 60 231 | pbar.set_description(f"Current speed = {total_time_est:.2f} minutes to complete inference") 232 | pbar.update() 233 | 234 | y_true = simu_test['action'] 235 | y_pred = pd.concat(predicted)['action'] 236 | print('\nValidation auc:', roc_auc_score(y_true, y_pred)) 237 | score = utility_score_bincount(date, weight, resp, y_true) 238 | score_pred = utility_score_bincount(date, weight, resp, y_pred) 239 | print('\nMax possible utility score:', score) 240 | print('\nModel utility score: ', score_pred) -------------------------------------------------------------------------------- /data/data_rolling.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm.auto import tqdm 7 | from collections import deque 8 | import collections 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = HOME+'/models/' 13 | DATA_DIR = HOME+'/data/' 14 | 15 | from utils import * 16 | from utils_js import * 17 | # %% 18 | ''' 19 | 1. Using the past day mean as fillna 20 | 2. For certain features use EWM (maybe too slow?) 21 | 22 | Past day mean 23 | Reference: Lucas Morin's notebook 24 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 25 | 26 | Modified by Shuhao Cao and Ethan Zheng to 27 | 1. able to return past day trading numbers. 28 | 2. 
able to use feature 64 to predict whether a day is ''busy'' 29 | 30 | ''' 31 | 32 | 33 | class RunningPDA: 34 | ''' 35 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 36 | ''' 37 | def __init__(self, past_mean=0, start=1000, end=2500, slope=0.00116): 38 | self.day = -1 39 | self.past_mean = past_mean # past day mean, initialized as the mean 40 | self.cum_sum = 0 41 | self.day_instances = 0 # current day instances 42 | self.past_value = past_mean # the previous row's value, initialized as the mean 43 | self.past_instances = 0 # instances in the past day 44 | 45 | self.start = start 46 | self.end = end 47 | self.slope = slope 48 | self.start_value = None 49 | self.end_value = None 50 | 51 | def clear(self): 52 | self.n = 0 53 | self.windows.clear() 54 | 55 | def push(self, x, date): 56 | x = fast_fillna(x, self.past_value) 57 | self.past_value = x 58 | 59 | # change of day 60 | if date > self.day: 61 | self.day = date 62 | if self.day_instances > 0: 63 | self.past_mean = self.cum_sum/self.day_instances 64 | self.past_instances = self.day_instances 65 | self.day_instances = 1 66 | self.cum_sum = x 67 | 68 | self.start_value, self.end_value = None, None 69 | 70 | else: 71 | self.day_instances += 1 72 | self.cum_sum += x 73 | 74 | if self.day_instances == self.start: 75 | self.start_value = x[:, 64] 76 | if self.day_instances == self.end: 77 | self.end_value = x[:, 64] 78 | 79 | def get_mean(self): 80 | return self.cum_sum/self.day_instances 81 | 82 | def get_past_mean(self): 83 | return self.past_mean 84 | 85 | def get_past_trade(self): 86 | return self.past_instances 87 | 88 | def predict_today_busy(self): 89 | if self.start_value is None or self.end_value is None: 90 | return False 91 | return (self.end_value - self.start_value) / (self.end - self.start) < self.slope 92 | 93 | class RunningEWMeanDay: 94 | ''' 95 | Reference: Lucas Morin 96 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 97 | Modified to do the rolling mean only intraday 98 | ''' 99 | def __init__(self, window=20, num_feat = 1, lt_mean = None): 100 | if lt_mean is not None: 101 | self.s = lt_mean 102 | else: 103 | self.s = np.zeros(num_feat) 104 | self.past_value = np.zeros(num_feat) 105 | self.alpha = 2 /(window + 1) 106 | self.day = -1 107 | 108 | def clear(self): 109 | self.s = 0 110 | 111 | def push(self, x, date): 112 | 113 | x = fast_fillna(x, self.past_value) 114 | self.past_value = x 115 | 116 | if date > self.day: 117 | self.day = date 118 | self.clear() 119 | self.s = x 120 | else: 121 | self.s = self.alpha * x + (1 - self.alpha) * self.s 122 | 123 | def get_mean(self): 124 | return self.s 125 | 126 | 127 | class RunningMeanDay: 128 | ''' 129 | Reference: Lucas Morin 130 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 131 | Modified to do the rolling mean only intraday 132 | ''' 133 | def __init__(self, window=1000, num_feat = 1): 134 | self.day = -1 135 | self.n = 0 136 | self.mean = 0 137 | self.run_var = 0 138 | self.window = window 139 | self.past_value = 0 140 | self.windows = deque(maxlen=window+1) 141 | self.num_feat=num_feat 142 | 143 | def clear(self): 144 | self.n = 0 145 | self.windows.clear() 146 | 147 | def push(self, x, date): 148 | 149 | x = fast_fillna(x, self.past_value) 150 | self.past_value = x 151 | 152 | if date > self.day: 153 | self.day = date 154 | self.clear() 155 | self.windows.append(x) 156 | self.n = 1 157 | self.mean = x 158 | self.run_var = 0 
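# same-day branch below: append to the window and update the running mean/variance (incremental Welford-style updates while the window is filling, then a sliding-window correction once it is full).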
159 | else: 160 | self.windows.append(x) 161 | 162 | if self.n < self.window: 163 | # Calculating first variance 164 | self.n += 1 165 | delta = x - self.mean 166 | self.mean += delta / self.n 167 | self.run_var += delta * (x - self.mean) 168 | else: 169 | # Adjusting variance 170 | x_removed = self.windows.popleft() 171 | old_m = self.mean 172 | self.mean += (x - x_removed) / self.window 173 | self.run_var += (x + x_removed - old_m - self.mean) * (x - x_removed) 174 | 175 | def get_mean(self): 176 | return self.mean if self.n else np.zeros(self.num_feat) 177 | 178 | def get_var(self): 179 | return self.run_var / (self.n) if self.n > 1 else np.zeros(self.num_feat) 180 | 181 | def get_std(self): 182 | return math.sqrt(self.get_var()) 183 | 184 | def get_all(self): 185 | return list(self.windows) 186 | 187 | def __str__(self): 188 | return "Current window values: {}".format(list(self.windows)) 189 | 190 | 191 | #%% 192 | def load_train(drop_days=None, zero_weight=True): 193 | with timer("Loading train parquet"): 194 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 195 | train = pd.read_parquet(train_parquet) 196 | if drop_days: 197 | train = train.query(f'date not in {drop_days}').reset_index (drop = True) 198 | 199 | if not zero_weight: 200 | train = train.query('weight > 0').reset_index (drop = True) 201 | 202 | feat_cols = [f'feature_{i}' for i in range(130)] 203 | # train[feat_cols].mean().to_csv(os.path.join(DATA_DIR, 'f_mean_final.csv'), 204 | # index_label=['features'], header=['mean']) 205 | f_mean = train[feat_cols].mean().values.reshape(1,-1) 206 | if zero_weight: 207 | np.save(DATA_DIR+'f_mean_after_85_include_zero_weight.npy', f_mean) 208 | else: 209 | np.save(DATA_DIR+'f_mean_after_85_positive_weight.npy', f_mean) 210 | return train 211 | 212 | 213 | def process_train_rolling(train, debug=False): 214 | TRAIN_ROWS = 50_000 215 | if debug: 216 | train = train[:TRAIN_ROWS] 217 | 218 | f_mean = train.mean().values 219 | 220 | train_dtypes = {'date': np.int32, 221 | 'ts_id': np.int64, 222 | 'resp': np.float64, 223 | 'weight': np.float64, 224 | } 225 | for c in range(1,5): 226 | train_dtypes['resp_'+str(c)] = np.float64 227 | for c in range(130): 228 | train_dtypes['feature_'+str(c)] = np.float32 229 | 230 | pdm = RunningPDA(past_mean=f_mean) 231 | 232 | with tqdm(total=len(train)) as pbar: 233 | row_vals = [] 234 | for _, row in train.iterrows(): 235 | date = row['date'] 236 | pdm.push(np.array(row), date) 237 | 238 | past_day_mean = pdm.get_past_mean() 239 | 240 | x_tt = row.values 241 | if np.isnan(x_tt.sum()): 242 | x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt) * past_day_mean 243 | 244 | row_vals.append(x_tt) 245 | pbar.update() 246 | 247 | train_pdm = pd.DataFrame(row_vals, columns=train.columns, index=train.index).astype(train_dtypes) 248 | 249 | if not debug: 250 | train_pdm.to_parquet(os.path.join(DATA_DIR, 'train_pdm.parquet'), index=False) 251 | 252 | 253 | # %% 254 | 255 | if __name__ == '__main__': 256 | get_system() 257 | train = load_train(drop_days=[2, 36, 270, 294]) 258 | # train = load_train(drop_days=list(range(0,86))+[270, 294]) 259 | process_train_rolling(train, debug=True) 260 | -------------------------------------------------------------------------------- /mlp/run_train_final_4.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import os 3 | import sys 4 | current_path = os.path.dirname(os.path.abspath(__file__)) 5 | HOME = os.path.dirname(current_path) 6 | sys.path.append(HOME) 7 | 8 | from utils import * 
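# NOTE: MODEL_DIR and DATA_DIR are referenced later in this script (e.g. np.load(DATA_DIR+'spike_common_vals_42.npy'), the checkpoint paths) but are not defined here; the two assignments below assume the same layout used by the other training scripts in mlp/.
MODEL_DIR = os.path.join(HOME, 'models')
DATA_DIR = os.path.join(HOME, 'data')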
9 | from utils_js import * 10 | 11 | import pandas as pd 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torchsummary import summary 16 | 17 | 18 | from mlp import * 19 | pd.set_option('display.max_rows', 100) 20 | pd.set_option('display.max_columns', 100) 21 | #%% 22 | ''' 23 | Final model spikenet: 24 | 25 | 1. subtract the most common values from columns with a spike in the histogram to form cat features. 26 | ''' 27 | 28 | 29 | # %% 30 | BATCH_SIZE = 8192 31 | FINETUNE_BATCH_SIZE = 4096_00 32 | 33 | LEARNING_RATE = 1e-4 34 | WEIGHT_DECAY = 1e-5 35 | EPOCHS = 100 36 | EARLYSTOP_NUM = 5 37 | ALPHA = 0.6 38 | EPSILON = 5e-2 # strength of the regularizer 39 | VOLATILE_MODEL = True 40 | 41 | s = 4 42 | SEED = 1127*s 43 | np.random.seed(SEED) 44 | pd.core.common.random_state(SEED) 45 | torch.manual_seed(SEED) 46 | torch.cuda.manual_seed(SEED) 47 | torch.backends.cudnn.deterministic = True 48 | torch.backends.cudnn.benchmark = False 49 | if torch.cuda.is_available(): 50 | torch.cuda.manual_seed_all(SEED) 51 | 52 | splits = { 53 | 'train_days': (range(0,457), range(0,424), range(0,391)), 54 | 'valid_days': (range(467, 500), range(434, 466), range(401, 433)), 55 | } 56 | fold = 2 57 | 58 | if fold == 0: 59 | SAVE_THRESH = 1300 60 | VAL_OFFSET = 100 61 | elif fold == 1: 62 | SAVE_THRESH = 1200 63 | VAL_OFFSET = 150 64 | elif fold == 2: 65 | SAVE_THRESH = 90 66 | VAL_OFFSET = 100 67 | EPOCHS = 40 68 | LEARNING_RATE = 1e-3 69 | EPSILON = 1e-2 70 | 71 | VOLATILE_DAYS = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 72 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 73 | 74 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 75 | # %% 76 | with timer("Preprocessing train"): 77 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 78 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 79 | train = pd.read_parquet(train_parquet) 80 | # %% 81 | # feat_reg_index = [0, 17, 18, 37, 39, 40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 57, 58] 82 | # feat_reg_index += list(range(60,69)) 83 | # feat_reg_index += [89, 101, 108, 113, 119, 120, 121, 122, 124, 125, 126, 128] 84 | # feat_spike_index_temp = list(set(range(130)).difference(feat_reg_index)) 85 | # features_reg = [f'feature_{i}' for i in feat_reg_index] 86 | # features_spike = [f'feature_{i}' for i in feat_spike_index_temp] 87 | 88 | 89 | # %% 90 | # feat_spike_index = [eval(s) for s in feat_spike_index] 91 | # for f in feat_spike_index: 92 | # print(f'{f},', end=' ') 93 | # %% 94 | # feat_spike_index = [1, 2, 3, 4, 5, 6, 14, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 95 | feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 96 | 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 97 | feat_reg_index = list(set(range(130)).difference(feat_spike_index)) 98 | features_reg = [f'feature_{i}' for i in feat_reg_index] 99 | features_spike = [f'feature_{i}' for i in feat_spike_index] 100 | 101 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4', ] 102 | target_cols = ['action_1', 'action_2', 'action_3', 'action', 'action_4'] 103 | 104 | feat_cols = [f'feature_{i}' for i in range(130)] 105 | # feat_cols = features_reg 106 | cat_cols = [f+'_c' for f in features_spike] 107 | print(f"Number of features with spike: {len(cat_cols)}") 108 | # %% 109 | 110 | feat_spike_index = [] 111 | 
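# spike/categorical features: load the pre-computed most common value of each spiky column (spike_common_vals_42.npy), subtract it from the raw feature, and cast to int to build the `_c` categorical inputs consumed by SpikeNet; the commented-out value_counts() lines below show how those constants were originally selected.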
most_common_vals = [] 112 | most_common_vals = np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 113 | 114 | for i, feat in tqdm(enumerate(features_spike)): 115 | # sorted_counts = train[feat].value_counts().sort_values(ascending=False) 116 | # print(sorted_counts.head(5), '\n\n') 117 | # if sorted_counts.iloc[0]/sorted_counts.iloc[1] > 30 and sorted_counts.iloc[0] > 5000: 118 | # feat_spike_index.append(sorted_counts.name.split('_')[-1]) 119 | # most_common_val = sorted_counts.index[0] 120 | # most_common_vals.append(most_common_val) 121 | train[feat+'_c'] = (train[feat] - most_common_vals[i]).astype(int) 122 | # print(train[feat+'_c'].astype(int).value_counts()[:5]) 123 | 124 | # %% 125 | train = train.query(f'date not in {[2, 36, 270, 294]}').reset_index(drop=True) 126 | 127 | 128 | if not VOLATILE_MODEL: 129 | train = train.query('date > 85').reset_index(drop=True) 130 | # train = train.query(f'date not in {VOLATILE_DAYS}').reset_index(drop=True) 131 | # train.fillna(train.mean(), inplace=True) 132 | train = train[train['weight'] != 0].reset_index(drop=True) 133 | train['action'] = (train['resp'] > 0).astype('int') 134 | 135 | for c in range(1, 5): 136 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(np.int32) 137 | 138 | valid = train.loc[train.date.isin(splits['valid_days'][fold])].reset_index(drop=True) 139 | train = train.loc[train.date.isin(splits['train_days'][fold])].reset_index(drop=True) 140 | # %% 141 | 142 | 143 | train_set = MarketDatasetCat(train, 144 | features=feat_cols, cat_features=cat_cols, 145 | targets=target_cols, resp=resp_cols) 146 | train_loader = DataLoader( 147 | train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 148 | 149 | valid_set = MarketDatasetCat(valid, features=feat_cols, cat_features=cat_cols, 150 | targets=target_cols, resp=resp_cols) 151 | valid_loader = DataLoader( 152 | valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 153 | # %% 154 | util_cols = resp_cols 155 | # util_cols = ['resp'] 156 | resp_index = [resp_cols.index(r) for r in util_cols] 157 | regularizer = UtilityLoss(alpha=EPSILON, scaling=12, 158 | normalize=None, resp_index=resp_index) 159 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 160 | 161 | model = SpikeNet() 162 | model.to(device) 163 | summary(model, [(len(feat_cols),), (len(cat_cols),)]) 164 | 165 | optimizer = torch.optim.Adam( 166 | model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 167 | 168 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 169 | # steps_per_epoch=len( 170 | # train_loader), 171 | # epochs=EPOCHS) 172 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 173 | T_0=50, T_mult=2, 174 | eta_min=LEARNING_RATE*1e-4, last_epoch=-1) 175 | 176 | finetune_loader = DataLoader( 177 | train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 178 | 179 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-2) 180 | 181 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", 182 | save_threshold=SAVE_THRESH, util_offset=VAL_OFFSET) 183 | 184 | # %% 185 | 186 | lr = [] 187 | 188 | for epoch in range(EPOCHS): 189 | 190 | train_loss = train_epoch_cat( 191 | model, optimizer, scheduler, loss_fn, train_loader, device) 192 | # train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 193 | lr.append(optimizer.param_groups[0]['lr']) 194 | 195 | if (epoch+1) % 10 == 0: 196 | _ = train_epoch_ft_cat(model, finetune_optimizer, 
scheduler, 197 | regularizer, finetune_loader, device, loss_fn=loss_fn) 198 | 199 | valid_pred = valid_epoch(model, valid_loader, device, cat_input=True) 200 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 201 | f=median_avg, threshold=0.5, target_cols=target_cols) 202 | model_file = MODEL_DIR + \ 203 | f"/emb_fold_{fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 204 | early_stop(epoch, valid_auc, model, model_path=model_file, 205 | epoch_utility_score=valid_score) 206 | 207 | # if early_stop.model_saved: 208 | # for g in optimizer.param_groups: 209 | # g['lr'] *= 0.1 210 | # lr[-1] = optimizer.param_groups[0]['lr'] 211 | # tqdm.write(f"\nNew learning rate: {lr[-1]:.4e}") 212 | 213 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {fold}") 214 | tqdm.write( 215 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 216 | tqdm.write( 217 | f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 218 | tqdm.write( 219 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 220 | if early_stop.early_stop: 221 | print("\nEarly stopping") 222 | break 223 | # %% 224 | -------------------------------------------------------------------------------- /mlp/run_train_finetune.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | 20 | ''' 21 | Training script finetuning using resp colums as regularizer 22 | ''' 23 | 24 | DEBUG = False 25 | LOAD_PRETRAIN = False 26 | TRAINING_START = 86 # 86 by default 27 | FINETUNE_BATCH_SIZE = 2048_00 28 | BATCH_SIZE = 8196 29 | EPOCHS = 120 30 | LEARNING_RATE = 1e-3 31 | WEIGHT_DECAY = 1e-5 32 | EARLYSTOP_NUM = 6 33 | NFOLDS = 1 34 | SCALING = 10 35 | THRESHOLD = 0.5 36 | DAYS_TO_DROP = [2, 36, 270, 294] 37 | CV_START_DAY = 100 38 | CV_DAYS = 50 39 | 40 | SEED = 1127802 41 | get_seed(SEED) 42 | 43 | # f = np.median 44 | # f = np.mean 45 | f = median_avg 46 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 47 | 48 | # %% 49 | with timer("Preprocessing train"): 50 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 51 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 52 | train, valid = preprocess_pt(train_parquet, day_start=TRAINING_START, 53 | drop_days=DAYS_TO_DROP, 54 | drop_zero_weight=True, denoised_resp=False) 55 | 56 | print(f'action based on resp mean: ', train['action'].astype(int).mean()) 57 | for c in range(1, 5): 58 | print(f'action based on resp_{c} mean: ', 59 | train['action_'+str(c)].astype(int).mean()) 60 | 61 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 62 | resp_cols_all = resp_cols 63 | target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4'] 64 | feat_cols = [f'feature_{i}' for i in range(130)] 65 | 66 | 67 | # f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 68 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 69 | 70 | ###### adding weight to the features ####### 71 | # feat_cols.extend(['weight']) 72 | # %% 73 | 
train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 74 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 75 | 76 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 77 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 78 | 79 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 80 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 81 | model.to(device) 82 | summary(model, input_size=(len(feat_cols), )) 83 | # %% 84 | ''' 85 | fine-tuning the trained model based on resp or utils 86 | current fine-tuning train set is all train 87 | max batch_size: 88 | 3 resps: 102400 89 | 90 | current best setting: 91 | ''' 92 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 93 | 94 | # util_cols = ['resp', 'resp_1', 'resp_2'] 95 | # util_cols = ['resp', 'resp_4'] 96 | util_cols = resp_cols 97 | 98 | resp_index = [resp_cols_all.index(r) for r in util_cols] 99 | 100 | # regularizer = RespMSELoss(alpha=1e-1, scaling=1, resp_index=resp_index) 101 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, normalize=None, resp_index=resp_index) 102 | 103 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 104 | 105 | all_train = pd.concat([train, valid], axis=0) 106 | all_train_set = ExtendedMarketDataset(all_train, features=feat_cols, targets=target_cols, resp=resp_cols) 107 | train_loader = DataLoader(all_train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 108 | 109 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 110 | # optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 111 | # optimizer = Lookahead(optimizer=optimizer, alpha=1e-1) 112 | 113 | scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 114 | steps_per_epoch=len(train_loader), 115 | epochs=EPOCHS) 116 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 117 | # T_0=10, T_mult=1, 118 | # eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 119 | 120 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 121 | 122 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 123 | 124 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, 125 | mode="max", save_threshold=6000) 126 | 127 | # %% 128 | if LOAD_PRETRAIN: 129 | print("Loading model for finetune.") 130 | _fold = 0 131 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 132 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_ft_old_fold_{_fold}.pth") 133 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_finetune_fold_{_fold}.pth") 134 | try: 135 | model.load_state_dict(torch.load(model_weights)) 136 | except: 137 | model.load_state_dict(torch.load( 138 | model_weights, map_location=torch.device('cpu'))) 139 | model.eval() 140 | valid_pred = valid_epoch(model, valid_loader, device) 141 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 142 | f=median_avg, threshold=0.5, target_cols=target_cols) 143 | 144 | print(f"valid_utility:{valid_score:.2f} \t valid_auc:{valid_auc:.4f}") 145 | # %% 146 | _fold = 1 147 | SEED = 1127802 148 | get_seed(SEED+SEED*_fold) 149 | 150 | for epoch in range(EPOCHS): 151 | 152 | train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 153 | # 
train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 154 | lr = optimizer.param_groups[0]['lr'] 155 | if (epoch+1) % 10 == 0: 156 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 157 | regularizer, finetune_loader, device, loss_fn=loss_fn) 158 | 159 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 160 | batch_size =2*8192, f=median_avg, threshold=0.5, 161 | target_cols=target_cols, 162 | feat_cols=feat_cols, 163 | resp_cols=resp_cols) 164 | 165 | valid_pred = valid_epoch(model, valid_loader, device) 166 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 167 | f=median_avg, threshold=0.5, target_cols=target_cols) 168 | # model_file = MODEL_DIR + \ 169 | # f"/resmlp_interleave_{_fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 170 | model_file = MODEL_DIR + \ 171 | f"/resw_interleave_{_fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 172 | early_stop(epoch, valid_auc, model, model_path=model_file, 173 | epoch_utility_score=valid_score) 174 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 175 | tqdm.write( 176 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr:.4e}") 177 | tqdm.write( 178 | f"Best util: {early_stop.best_utility_score:.2f} \t {early_stop.message} ") 179 | tqdm.write( 180 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 181 | if early_stop.early_stop: 182 | print("\nEarly stopping") 183 | break 184 | 185 | if DEBUG: 186 | torch.save(model.state_dict(), MODEL_DIR + f"/resmlp_interleave_fold_{_fold}.pth") 187 | # %% 188 | _fold = 4 189 | # model_file = f"resmlp_interleave_0_util_7437_auc_0.6389.pth" 190 | # model_file = f"resmlp_ft_old_fold_{_fold}.pth" # fold 1, 3, 4 good 191 | # model_file = f"resmlp_finetune_fold_{_fold}.pth" 192 | model_file = f"resw_interleave_1_util_6455_auc_0.6237.pth" 193 | # model_file = f"resw_interleave_1_util_6333_auc_0.6211.pth" 194 | # model_file = f"resmlp_{_fold}.pth" 195 | print(f"Loading {model_file} for cv check.\n") 196 | model_weights = os.path.join(MODEL_DIR, model_file) 197 | 198 | model.to(device) 199 | feat_cols = [f'feature_{i}' for i in range(130)] 200 | feat_cols.extend(['weight']) 201 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 202 | 203 | 204 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 205 | model.to(device) 206 | model.load_state_dict(torch.load(model_weights)) 207 | # model.load_state_dict(torch.load( 208 | # model_weights, map_location=torch.device('cpu'))) 209 | model.eval(); 210 | 211 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 212 | train = preprocess_pt(train_parquet, day_start=0, day_split=None, 213 | drop_zero_weight=False) 214 | 215 | 216 | # %% 217 | -------------------------------------------------------------------------------- /data/data_final.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | current_path = os.path.dirname(os.path.abspath(__file__)) 5 | HOME = os.path.dirname(current_path) 6 | sys.path.append(HOME) 7 | 8 | import pandas as pd 9 | pd.set_option('display.max_rows', 100) 10 | pd.set_option('display.max_columns', 100) 11 | 12 | import numpy as np 13 | import datatable as dt 14 | from tqdm.auto import tqdm 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | sns.set(style="darkgrid", context="talk") 19 | from jupyterthemes import jtplot 20 | 
jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False) 21 | 22 | 23 | MODEL_DIR = HOME+'/models/' 24 | DATA_DIR = HOME+'/data/' 25 | from utils import * 26 | from utils_js import * 27 | from data.data_rolling import RunningPDA, RunningEWMeanDay, RunningMeanDay 28 | 29 | # %% 30 | ''' 31 | data preparation for the final submission (in order) 32 | 33 | 1. Drop outliers [2, 294], low volume days [36, 270]. 34 | 2. fillna() uses past day mean including all weight zero rows. 35 | 3. Most common values fillna for spike features rows (a small random noise added). 36 | 4. all data, only drop the two partial days and the two <2k ts_id days. 37 | 5. smoother data, aside from 1, query day > 85, drop ts_id > 8700 days. 38 | 6. Final training uses only weight > 0 rows, but with a randomly 39 | selected 40% of weight zero rows' weight being replaced by 1e-7 to 40 | reduce overfitting. 41 | 7. a new denoised target is generated with all five targets. 42 | 43 | testing out new features 44 | - ewm for feature_0 45 | - moving average for feature_0 46 | 47 | Reference: Carl McBride Ellis 48 | https://www.kaggle.com/carlmcbrideellis/semper-augustus-pre-process-training-data 49 | 50 | Past day mean/EW mean push 51 | Reference: Lucas Morin's notebook 52 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 53 | ''' 54 | # %% 55 | with timer("Loading train"): 56 | train_csv = os.path.join(DATA_DIR, 'train.csv') 57 | train = dt.fread(train_csv).to_pandas() 58 | 59 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 60 | # train = pd.read_parquet(train_parquet) 61 | 62 | # train = train.set_index('ts_id') 63 | train = train.query('date not in [2, 36, 270, 294]').reset_index(drop=True) 64 | # %% 65 | # the first one is used for model 66 | # feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 67 | # 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 68 | 69 | # this one is used for fillna 70 | feat_spike_index = [1, 2, 69, 71, 85, 87, 88, 91, 93, 94, 97, 99, 100, 103, 105, 106] 71 | 72 | noisy_index = [3, 4, 5, 6, 8, 10, 12, 14, 16, 37, 38, 39, 40, 72, 73, 74, 75, 76, 73 | 78, 79, 80, 81, 82, 83] 74 | negative_index = [73, 75, 76, 77, 79, 81, 82] 75 | hybrid_index = [55, 56, 57, 58, 59] 76 | running_indices = sorted([0]+noisy_index+negative_index+hybrid_index) 77 | features_running = [f'feature_{i}' for i in running_indices] 78 | 79 | feat_reg_index = list(set(range(130)).difference(feat_spike_index)) 80 | features_reg = [f'feature_{i}' for i in feat_reg_index] 81 | features_spike = [f'feature_{i}' for i in feat_spike_index] 82 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4', ] 83 | 84 | feat_cols = [f'feature_{i}' for i in range(130)] 85 | # feat_cols = features_reg 86 | feat_cols_c = feat_cols + [f+'_c' for f in features_spike] 87 | print(f"Number of features: {len(feat_cols)}") 88 | print(f"Number of spike fillna features: {len(features_spike)}") 89 | # %% 90 | try: 91 | feat_mean = np.load(DATA_DIR+'f_mean_all_days_include_zero_weight.npy') 92 | except: 93 | feat_mean = train[feat_cols].mean().values.reshape(1,-1) 94 | np.save(DATA_DIR+'f_mean_all_days_include_zero_weight.npy', feat_mean) 95 | all_mean = train.mean().values 96 | #%% 97 | # %% 98 | try: 99 | spike_fillna_val = np.load(DATA_DIR+'fillna_val_spike_feats.npy') 100 | except: 101 | most_common_vals = [] 102 | # most_common_vals = 
np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 103 | 104 | for i, feat in enumerate(features_spike): 105 | sorted_counts = train[feat].value_counts().sort_values(ascending=False) 106 | print(sorted_counts.head(5), '\n\n') 107 | # if sorted_counts.iloc[0]/sorted_counts.iloc[1] > 30 and sorted_counts.iloc[0] > 5000: 108 | # feat_spike_index.append(sorted_counts.name.split('_')[-1]) 109 | most_common_val = sorted_counts.index[0] 110 | most_common_vals.append(most_common_val) 111 | 112 | spike_fillna_val = np.zeros((len(feat_cols), )) 113 | spike_fillna_val[feat_spike_index] = np.array(most_common_vals) 114 | np.save(DATA_DIR+'fillna_val_spike_feats.npy', spike_fillna_val) 115 | 116 | #%% 117 | 118 | class RunningPDAFinal(): 119 | ''' 120 | The subclass only for data-preparation, not for final submission pipeline 121 | ''' 122 | def __init__(self, past_mean=all_mean): 123 | self.day = -1 124 | self.past_mean = past_mean # past day mean, initialized as the mean 125 | self.cum_sum = 0 126 | self.day_instances = 0 # current day instances 127 | self.past_value = past_mean # the previous row's value, initialized as the mean 128 | self.past_instances = 0 # instances in the past day 129 | self.past_day_data = np.zeros_like(past_mean) 130 | self.current_day_data = past_mean 131 | 132 | def push(self, x, date): 133 | x = fast_fillna(x, self.past_value) 134 | self.past_value = x 135 | 136 | # change of day 137 | if date > self.day: 138 | self.day = date 139 | if self.day_instances > 0: 140 | self.past_mean = self.cum_sum/self.day_instances 141 | self.past_instances = self.day_instances 142 | self.day_instances = 1 143 | self.cum_sum = x 144 | self.past_day_data = np.array(self.current_day_data) 145 | # print(self.past_day_data[0]) 146 | self.current_day_data = [] 147 | self.current_day_data.append(list(x)) 148 | # print(self.current_day_data) 149 | # print(x[0]) 150 | 151 | else: 152 | self.day_instances += 1 153 | self.cum_sum += x 154 | self.current_day_data.append(list(x)) 155 | 156 | 157 | def get_mean(self): 158 | return self.cum_sum/self.day_instances 159 | 160 | def get_past_mean(self): 161 | return self.past_mean 162 | 163 | def get_past_mean_numpy(self): 164 | return np.mean(self.past_day_data, axis=0) 165 | 166 | def get_past_std(self): 167 | return np.std(self.past_day_data, axis=0) 168 | #%% 169 | feat_mean = feat_mean.reshape(-1) 170 | pdm = RunningPDAFinal(past_mean=feat_mean) 171 | 172 | feat_vals = [] 173 | # nonfeat_cols = ['date', 'weight', 'resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4',] 174 | n_feats = len(feat_cols) 175 | spike_fillna_val = spike_fillna_val.reshape(-1) 176 | 177 | with tqdm(total=len(train)) as pbar: 178 | 179 | for _, row in train.iterrows(): 180 | date = row['date'] 181 | x_tt = row.values[7:-1] 182 | assert x_tt[0] == 1 or x_tt[0] == -1 183 | pdm.push(x_tt, date) 184 | 185 | past_day_mean = pdm.get_past_mean().reshape(-1) 186 | past_day_mean[feat_spike_index] = 0 187 | fillna_val = past_day_mean + spike_fillna_val 188 | if np.isnan(x_tt.sum()): 189 | # x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt)*spike_fillna_val # bug!!!!!! 
190 | x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt)*fillna_val 191 | 192 | feat_vals.append(x_tt) 193 | pbar.update() 194 | #%% 195 | train_dtypes = {'date': np.int32, 196 | 'ts_id': np.int64, 197 | 'resp': np.float64, 198 | 'weight': np.float64, 199 | } 200 | for c in range(1,5): 201 | train_dtypes['resp_'+str(c)] = np.float64 202 | for c in range(130): 203 | train_dtypes['feature_'+str(c)] = np.float32 204 | 205 | #%% 206 | feature_df = pd.DataFrame(feat_vals, columns=feat_cols, index=train.index) 207 | 208 | # %% 209 | train_final = train.copy() 210 | train_final[feat_cols] = feature_df 211 | train_final = train_final.astype(train_dtypes) 212 | # %% 213 | # train_final = train_final.astype(train_dtypes) 214 | train_final.to_parquet(os.path.join(DATA_DIR, 'train_final.parquet'), index=False) 215 | # %% 216 | train_final.to_feather(os.path.join(DATA_DIR, 'train_final.feather')) 217 | # %% 218 | trades_per_day = train_final.groupby(['date'])['ts_id'].count() 219 | volatile_days = pd.DataFrame(trades_per_day[trades_per_day > 8600]) 220 | print("Number of volatile days",volatile_days.count()) 221 | filter_list = volatile_days.index.to_list() 222 | 223 | #%% 224 | filter_list = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 225 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 226 | train_final_regular = train_final.query('date != @filter_list').reset_index(drop = True) 227 | train_final_regular = train_final.query('date >85').reset_index(drop = True) 228 | # %% 229 | train_final_regular.to_parquet(os.path.join(DATA_DIR, 'train_final_regular.parquet'), index=False) 230 | # %% 231 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import sys 3 | import os 4 | def add_sys_path(): 5 | try: 6 | for f in ['/home/scao/anaconda3/lib/python3.8/lib-dynload', 7 | '/home/scao/anaconda3/lib/python3.8/site-packages']: 8 | sys.path.append(f) 9 | except: 10 | RuntimeError 11 | print("Path not added") 12 | add_sys_path() 13 | 14 | 15 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1" 16 | import random as rd 17 | from contextlib import contextmanager 18 | from collections import defaultdict 19 | from time import time 20 | import matplotlib.pyplot as plt 21 | from datetime import date 22 | import math 23 | import numpy as np 24 | import pandas as pd 25 | import psutil 26 | import torch 27 | import pickle 28 | import seaborn as sns 29 | sns.set() 30 | from sklearn.metrics import roc_auc_score 31 | 32 | 33 | 34 | SEED = 1127 35 | 36 | def get_size(bytes, suffix='B'): 37 | ''' 38 | by Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified 39 | Scale bytes to its proper format 40 | e.g: 41 | 1253656 => '1.20MiB' 42 | 1253656678 => '1.17GiB' 43 | ''' 44 | for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: 45 | if abs(bytes) < 1024.0: 46 | return f"{bytes:3.2f} {unit}{suffix}" 47 | bytes /= 1024.0 48 | return f"{bytes:3.2f} 'Yi'{suffix}" 49 | 50 | def get_file_size(filename): 51 | file_size = os.stat(filename) 52 | return get_size(file_size.st_size) 53 | 54 | 55 | def get_system(): 56 | print("="*40, "CPU Info", "="*40) 57 | # number of cores 58 | print("Physical cores :", psutil.cpu_count(logical=False)) 59 | print("Total cores :", psutil.cpu_count(logical=True)) 60 | # CPU frequencies 61 | cpufreq = psutil.cpu_freq() 62 | print(f"Max Frequency : {cpufreq.max:.2f} Mhz") 63 | print(f"Min Frequency : {cpufreq.min:.2f} Mhz") 64 | print(f"Current Frequency: {cpufreq.current:.2f} 
Mhz") 65 | 66 | print("="*40, "Memory Info", "="*40) 67 | # get the memory details 68 | svmem = psutil.virtual_memory() 69 | print(f"Total : {get_size(svmem.total)}") 70 | print(f"Available : {get_size(svmem.available)}") 71 | print(f"Used : {get_size(svmem.used)}") 72 | 73 | 74 | print("="*40, "Software Info", "="*40) 75 | print('Python : ' + sys.version.split('\n')[0]) 76 | print('Numpy : ' + np.__version__) 77 | print('Pandas : ' + pd.__version__) 78 | print('PyTorch : ' + torch.__version__) 79 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 80 | 81 | if device.type == 'cuda': 82 | print("="*40, "GPU Info", "="*40) 83 | print(f'Device : {device}') 84 | print(torch.cuda.get_device_name(0)) 85 | print(f"{'Mem total': <15}: {round(torch.cuda.get_device_properties(0).total_memory/1024**3,1)} GB") 86 | print(f"{'Mem allocated': <15}: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB") 87 | print(f"{'Mem cached': <15}: {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB") 88 | 89 | print("="*30, "system info print done", "="*30) 90 | 91 | def get_seed(s): 92 | rd.seed(s) 93 | os.environ['PYTHONHASHSEED'] = str(s) 94 | np.random.seed(s) 95 | pd.core.common.random_state(s) 96 | # Torch 97 | torch.manual_seed(s) 98 | torch.cuda.manual_seed(s) 99 | torch.backends.cudnn.deterministic = True 100 | torch.backends.cudnn.benchmark = False 101 | if torch.cuda.is_available(): 102 | torch.cuda.manual_seed_all(s) 103 | 104 | @contextmanager 105 | def simple_timer(title): 106 | t0 = time() 107 | yield 108 | print("{} - done in {:.1f} seconds.\n".format(title, time() - t0)) 109 | 110 | class Colors: 111 | """Defining Color Codes to color the text displayed on terminal. 112 | """ 113 | 114 | blue = "\033[94m" 115 | green = "\033[92m" 116 | yellow = "\033[93m" 117 | magenta = "\033[95m" 118 | red = "\033[91m" 119 | end = "\033[0m" 120 | 121 | def color(string: str, color: Colors = Colors.yellow) -> str: 122 | return f"{color}{string}{Colors.end}" 123 | 124 | @contextmanager 125 | def timer(label: str, compact=False) -> None: 126 | ''' 127 | https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/203020#1111022 128 | print 129 | 1. the time the code block takes to run 130 | 2. the memory usage. 131 | ''' 132 | p = psutil.Process(os.getpid()) 133 | m0 = p.memory_info()[0] / 2. ** 30 134 | start = time() # Setup - __enter__ 135 | if not compact: 136 | print(color(f"{label}: start at {start:.2f};", color=Colors.blue)) 137 | print(color(f"LOCAL RAM USAGE AT START: {m0:.2f} GB" , color=Colors.green)) 138 | try: 139 | yield # yield to body of `with` statement 140 | finally: # Teardown - __exit__ 141 | m1 = p.memory_info()[0] / 2. ** 30 142 | delta = m1 - m0 143 | sign = '+' if delta >= 0 else '-' 144 | delta = math.fabs(delta) 145 | end = time() 146 | print(color(f"{label}: done at {end:.2f} ({end - start:.6f} secs elapsed);", color=Colors.blue)) 147 | print(color(f"LOCAL RAM USAGE AT END: {m1:.2f}GB ({sign}{delta:.2f}GB)", color=Colors.green)) 148 | print('\n') 149 | else: 150 | yield 151 | print(color(f"{label} - done in {time() - start:.6f} seconds. 
\n", color=Colors.blue)) 152 | 153 | 154 | def get_memory(num_var=10): 155 | for name, size in sorted(((name, sys.getsizeof(value)) for name, value in globals().items()), key= lambda x: -x[1])[:num_var]: 156 | print(color(f"{name:>30}:", color=Colors.green), 157 | color(f"{get_size(size):>8}", color=Colors.magenta)) 158 | 159 | def find_files(name, path): 160 | result = [] 161 | for root, dirs, files in os.walk(path): 162 | for _file in files: 163 | if name in _file: 164 | result.append(os.path.join(root, _file)) 165 | return result 166 | 167 | def print_file_size(files): 168 | for file in files: 169 | size=get_file_size(file) 170 | filename = file.split('/')[-1] 171 | filesize = get_file_size(file) 172 | print(color(f"{filename:>30}:", color=Colors.green), 173 | color(f"{filesize:>8}", color=Colors.magenta)) 174 | 175 | @contextmanager 176 | def trace(title: str): 177 | t0 = time() 178 | p = psutil.Process(os.getpid()) 179 | m0 = p.memory_info()[0] / 2. ** 30 180 | yield 181 | m1 = p.memory_info()[0] / 2. ** 30 182 | delta = m1 - m0 183 | sign = '+' if delta >= 0 else '-' 184 | delta = math.fabs(delta) 185 | print(f"[{m1:.1f}GB ({sign}{delta:.3f}GB): {time() - t0:.2f}sec] {title} ", file=sys.stderr) 186 | 187 | def get_cmap(n, cmap='hsv'): 188 | '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 189 | RGB color; the keyword argument name must be a standard mpl colormap name.''' 190 | return plt.cm.get_cmap(cmap, n) 191 | 192 | def get_date(): 193 | today = date.today() 194 | return today.strftime("%b-%d-%Y") 195 | 196 | def roc_auc_compute_fn(y_targets, y_preds): 197 | ''' 198 | roc_auc func for torch tensors 199 | ''' 200 | y_true = y_targets.cpu().numpy() 201 | y_pred = y_preds.cpu().numpy() 202 | return roc_auc_score(y_true, y_pred) 203 | 204 | def argmax(lst): 205 | return lst.index(max(lst)) 206 | 207 | def get_num_params(model): 208 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 209 | params = sum([np.prod(p.size()) for p in model_parameters]) 210 | return params 211 | 212 | def reduce_mem_usage(df, verbose=True): 213 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 214 | start_mem = df.memory_usage().sum() / 1024**2 215 | for col in df.columns: 216 | col_type = df[col].dtypes 217 | if col_type in numerics: 218 | c_min = df[col].min() 219 | c_max = df[col].max() 220 | if str(col_type)[:3] == 'int': 221 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 222 | df[col] = df[col].astype(np.int8) 223 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 224 | df[col] = df[col].astype(np.int16) 225 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 226 | df[col] = df[col].astype(np.int32) 227 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 228 | df[col] = df[col].astype(np.int64) 229 | else: 230 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 231 | df[col] = df[col].astype(np.float16) 232 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 233 | df[col] = df[col].astype(np.float32) 234 | else: 235 | df[col] = df[col].astype(np.float64) 236 | end_mem = df.memory_usage().sum() / 1024**2 237 | if verbose: print(f'Mem. 
usage decreased to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)') 238 | return df 239 | 240 | def save_pickle(var, save_path): 241 | with open(save_path, 'wb') as f: 242 | pickle.dump(var, f) 243 | 244 | def load_pickle(load_path): 245 | with open(load_path, 'rb') as f: 246 | u = pickle.load(f) 247 | return u 248 | 249 | 250 | if __name__ == "__main__": 251 | get_system() 252 | get_memory() -------------------------------------------------------------------------------- /mlp/debug_embedding_1.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import os 3 | import sys 4 | 5 | import pandas as pd 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | 12 | current_path = os.path.dirname(os.path.abspath(__file__)) 13 | HOME = os.path.dirname(current_path) 14 | sys.path.append(HOME) 15 | for f in ['/home/scao/anaconda3/lib/python3.8/lib-dynload', 16 | '/home/scao/anaconda3/lib/python3.8/site-packages']: 17 | sys.path.append(f) 18 | 19 | from torchsummary import summary 20 | from utils import * 21 | from utils_js import * 22 | 23 | from mlp import * 24 | pd.set_option('display.max_rows', 100) 25 | pd.set_option('display.max_columns', 100) 26 | 27 | # %% 28 | BATCH_SIZE = 8192 29 | FINETUNE_BATCH_SIZE = 4096_00 30 | 31 | LEARNING_RATE = 1e-4 32 | WEIGHT_DECAY = 1e-5 33 | EPOCHS = 100 34 | EARLYSTOP_NUM = 20 35 | SAVE_THRESH = 3240 36 | 37 | ALPHA = 0.6 38 | 39 | _fold = 0 40 | SEED = 802 41 | get_seed(SEED+SEED*_fold) 42 | 43 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 44 | # %% 45 | with timer("Preprocessing train"): 46 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 47 | # train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 48 | train_parquet = os.path.join(DATA_DIR, 'train_final.parquet') 49 | train = pd.read_parquet(train_parquet) 50 | # %% 51 | # feat_reg_index = [0, 17, 18, 37, 39, 40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 57, 58] 52 | # feat_reg_index += list(range(60,69)) 53 | # feat_reg_index += [89, 101, 108, 113, 119, 120, 121, 122, 124, 125, 126, 128] 54 | # feat_spike_index_temp = list(set(range(130)).difference(feat_reg_index)) 55 | # features_reg = [f'feature_{i}' for i in feat_reg_index] 56 | # features_spike = [f'feature_{i}' for i in feat_spike_index_temp] 57 | 58 | 59 | # %% 60 | # feat_spike_index = [eval(s) for s in feat_spike_index] 61 | # for f in feat_spike_index: 62 | # print(f'{f},', end=' ') 63 | # %% 64 | feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 65 | 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 66 | feat_reg_index = list(set(range(130)).difference(feat_spike_index)) 67 | features_reg = [f'feature_{i}' for i in feat_reg_index] 68 | features_spike = [f'feature_{i}' for i in feat_spike_index] 69 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4', ] 70 | 71 | feat_cols = [f'feature_{i}' for i in range(130)] 72 | # feat_cols = features_reg 73 | feat_cols += [f+'_c' for f in features_spike] 74 | print(f"Number of features: {len(feat_cols)}") 75 | # %% 76 | 77 | feat_spike_index = [] 78 | most_common_vals = [] 79 | most_common_vals = np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 80 | 81 | for i, feat in tqdm(enumerate(features_spike)): 82 | # sorted_counts = train[feat].value_counts().sort_values(ascending=False) 83 | # print(sorted_counts.head(5), '\n\n') 84 | # if 
sorted_counts.iloc[0]/sorted_counts.iloc[1] > 30 and sorted_counts.iloc[0] > 5000: 85 | # feat_spike_index.append(sorted_counts.name.split('_')[-1]) 86 | # most_common_val = sorted_counts.index[0] 87 | # most_common_vals.append(most_common_val) 88 | train[feat+'_c'] = train[feat] - most_common_vals[i] 89 | 90 | # %% 91 | train = train.query('date not in [2, 36, 270, 294]').reset_index(drop=True) 92 | train = train.query('date > 85').reset_index(drop=True) 93 | 94 | train = train[train['weight'] != 0].reset_index(drop=True) 95 | train['action'] = (train['resp'] > 0).astype('int') 96 | 97 | for c in range(1, 5): 98 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype('int') 99 | 100 | # fold_0 470, 475 101 | # fold_1 450, 455 102 | valid = train.loc[train.date >= 475].reset_index(drop=True) 103 | train = train.loc[train.date <= 470].reset_index(drop=True) 104 | # %% 105 | ''' 106 | simpler model, not very promising 107 | ''' 108 | 109 | class SpikeNetC(nn.Module): 110 | def __init__(self, hidden_size=256, 111 | output_size=len(resp_cols), 112 | input_size=len(feat_cols), 113 | dropout_rate=0.2,): 114 | super(SpikeNetC, self).__init__() 115 | 116 | self.batch_norm0 = nn.BatchNorm1d(input_size) 117 | self.dropout0 = nn.Dropout(0.2) 118 | 119 | self.dense1 = nn.Linear(input_size, hidden_size) 120 | # nn.init.kaiming_normal_(self.dense1.weight.data) 121 | self.batch_norm1 = nn.BatchNorm1d(hidden_size) 122 | self.dropout1 = nn.Dropout(dropout_rate) 123 | 124 | self.dense2 = nn.Linear(hidden_size+input_size, hidden_size) 125 | # nn.init.kaiming_normal_(self.dense2.weight.data) 126 | self.batch_norm2 = nn.BatchNorm1d(hidden_size) 127 | self.dropout2 = nn.Dropout(dropout_rate) 128 | 129 | self.dense3 = nn.Linear(hidden_size+hidden_size, hidden_size) 130 | # nn.init.kaiming_normal_(self.dense3.weight.data) 131 | self.batch_norm3 = nn.BatchNorm1d(hidden_size) 132 | self.dropout3 = nn.Dropout(dropout_rate) 133 | 134 | self.dense4 = nn.Linear(hidden_size+hidden_size, output_size) 135 | # nn.init.kaiming_normal_(self.dense4.weight.data) 136 | 137 | self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True) 138 | 139 | def forward(self, x): 140 | x = self.batch_norm0(x) 141 | x = self.dropout0(x) 142 | 143 | x1 = self.dense1(x) 144 | x1 = self.batch_norm1(x1) 145 | x1 = self.LeakyReLU(x1) 146 | x1 = self.dropout1(x1) 147 | 148 | x = torch.cat([x, x1], 1) 149 | 150 | x2 = self.dense2(x) 151 | x2 = self.batch_norm2(x2) 152 | x2 = self.LeakyReLU(x2) 153 | x2 = self.dropout2(x2) 154 | 155 | x = torch.cat([x1, x2], 1) 156 | 157 | x3 = self.dense3(x) 158 | x3 = self.batch_norm3(x3) 159 | x3 = self.LeakyReLU(x3) 160 | x3 = self.dropout3(x3) 161 | 162 | x = torch.cat([x2, x3], 1) 163 | 164 | x = self.dense4(x) 165 | 166 | return x 167 | 168 | # %% 169 | 170 | train_set = ExtendedMarketDataset(train, 171 | features=feat_cols, 172 | targets=target_cols, resp=resp_cols) 173 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 174 | 175 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, 176 | targets=target_cols, resp=resp_cols) 177 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 178 | # %% 179 | util_cols = resp_cols 180 | # util_cols = ['resp'] 181 | resp_index = [resp_cols.index(r) for r in util_cols] 182 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, 183 | normalize=None, resp_index=resp_index) 184 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 185 | 186 | model = SpikeNetC() 187 | model.to(device) 
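# Annotation (hedged smoke test, not part of the original debug script): SpikeNetC
# above is a DenseNet-style MLP -- each dense layer takes the concatenation of the two
# previous activations, which is why dense2 has hidden_size + input_size inputs and
# dense3/dense4 take 2 * hidden_size. A quick shape check on random data before the summary:
_x_check = torch.randn(4, len(feat_cols), device=device)
model.eval()
with torch.no_grad():
    assert model(_x_check).shape == (4, len(resp_cols))
model.train()
del _x_check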
188 | summary(model, (len(feat_cols),)) 189 | 190 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 191 | 192 | scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 193 | steps_per_epoch=len(train_loader), 194 | epochs=EPOCHS) 195 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 196 | # T_0=10, T_mult=2, 197 | # eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 198 | 199 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 200 | 201 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 202 | 203 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, 204 | mode="max", save_threshold=SAVE_THRESH) 205 | 206 | # %% 207 | 208 | lr = [] 209 | 210 | for epoch in range(EPOCHS): 211 | 212 | # train_loss = train_epoch( 213 | # model, optimizer, scheduler, loss_fn, train_loader, device) 214 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 215 | lr.append(optimizer.param_groups[0]['lr']) 216 | 217 | if (epoch+1) % 8 == 0: 218 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 219 | regularizer, finetune_loader, device, loss_fn=loss_fn) 220 | 221 | valid_pred = valid_epoch(model, valid_loader, device) 222 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 223 | f=median_avg, threshold=0.5, target_cols=target_cols) 224 | model_file = MODEL_DIR + \ 225 | f"/emb_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 226 | early_stop(epoch, valid_auc, model, model_path=model_file, 227 | epoch_utility_score=valid_score) 228 | 229 | if early_stop.model_saved: 230 | for g in optimizer.param_groups: 231 | g['lr'] *= 0.1 232 | lr[-1] = optimizer.param_groups[0]['lr'] 233 | tqdm.write(f"\nNew learning rate: {lr[-1]:.4e}") 234 | 235 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 236 | tqdm.write( 237 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 238 | tqdm.write( 239 | f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 240 | tqdm.write( 241 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 242 | if early_stop.early_stop: 243 | print("\nEarly stopping") 244 | break 245 | # %% debug, un-necessary 246 | # sample = next(iter(train_loader)) 247 | # cat_dims = [int(train[col].nunique()) for col in cat_cols] 248 | # emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims] 249 | # emb_layers = nn.ModuleList([nn.Embedding(x, y) 250 | # for x, y in emb_dims]) 251 | # x = [emb_layer(sample['cat_features'][0,i].long()) 252 | # for i,emb_layer in enumerate(emb_layers)] 253 | # %% 254 | -------------------------------------------------------------------------------- /mlp/debug_resnet_tf.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.losses import BinaryCrossentropy 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.callbacks import EarlyStopping, Callback 7 | from tensorflow.keras.layers.experimental.preprocessing import Normalization 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | from tensorflow.keras import backend as K 11 | 12 | import numpy 
as np 13 | import pandas as pd 14 | from tqdm.auto import tqdm 15 | from random import choices 16 | 17 | current_path = os.path.dirname(os.path.abspath(__file__)) 18 | HOME = os.path.dirname(current_path) 19 | MODEL_DIR = os.path.join(HOME, 'models') 20 | DATA_DIR = os.path.join(HOME, 'data') 21 | sys.path.append(HOME) 22 | 23 | from utils import * 24 | from mlp import * 25 | 26 | # %% 27 | ''' 28 | baseline, dropped outlier days, fillna with mean, drop weight zero trades after. Using a feature split based on Carl's notebook. "Minor" features go through a linear layer block with high dropout rate first. Epoch = 50 29 | 30 | Added a util score callback for keras fit API, epoch 80, the util score is for every 50 days after day 100. This model reaches 5k util in the last 50 days in under 50 epochs, too good to be true? 31 | ''' 32 | 33 | SEED = 1127802 34 | BETA = 0.7 # 5 preds then the middle 3 35 | 36 | # split features for a ResNet feature 2 is more important 37 | features_2_list = [0, 1, 2, 3, 4, 5, 6, 15, 16, 25, 26, 35, 38 | 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 39 | 49, 50, 51, 52, 53, 54, 59, 60, 61, 62, 63, 64, 65, 40 | 66, 67, 68, 69, 70, 71, 76, 77, 82, 83, 88, 89, 94, 41 | 95, 100, 101, 106, 107, 112, 113, 118, 119, 128, 129] 42 | 43 | features_1_list = [0] + list(set(range(130)).difference(features_2_list)) 44 | 45 | features_1 = [f'feature_{i}' for i in features_1_list] 46 | 47 | features_2 = [f'feature_{i}' for i in features_2_list] 48 | 49 | # %% 50 | all_train = pd.read_parquet(DATA_DIR+'train.parquet') 51 | all_train = all_train.query('date > 85').reset_index(drop = True) 52 | all_train = all_train.query('date not in [2, 36, 270, 294]').reset_index(drop=True) 53 | 54 | all_train.fillna(all_train.mean(), inplace=True) 55 | 56 | features = [f'feature_{i}' for i in range(130)] 57 | f_mean = np.mean(all_train[features].values,axis=0) 58 | # np.save('f_mean_after_85_include_zero_weight.npy', f_mean) 59 | 60 | all_train = all_train[all_train['weight'] != 0].reset_index(drop=True) 61 | 62 | all_train = all_train.astype({feat: np.float32 for feat in features}) 63 | #%% 64 | _fold = 0 65 | split = [('date > 450','date <= 450'), 66 | ('date <= 450 and date > 400','date <= 400 or date>450'), 67 | ('date <= 400 and date > 350','date <= 350 or date>400'), 68 | ('date <= 350 and date > 300','date <= 300 or date>350'), 69 | ('date <= 300 and date > 250','date <= 250 or date>300'), 70 | ('date <= 250 and date > 200','date <= 200 or date>250'), 71 | ('date <= 200 and date > 150','date <= 150 or date>200'), 72 | ('date <= 150 and date > 100','date <= 100 or date>150'),] 73 | 74 | valid = all_train.query(split[_fold][0]).reset_index(drop = True) 75 | train = all_train.query(split[_fold][1]).reset_index(drop = True) 76 | 77 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4'] 78 | 79 | y_train = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T 80 | y_val = np.stack([(valid[c] > 0).astype('int') for c in resp_cols]).T 81 | 82 | X_train = [train.loc[:, features_1].values, 83 | train.loc[:, features_2].values] 84 | X_val = [valid.loc[:, features_1].values, 85 | valid.loc[:, features_2].values] 86 | 87 | print(len(train), len(valid)) 88 | # %% 89 | class Mish(tf.keras.layers.Layer): 90 | 91 | def __init__(self, **kwargs): 92 | super(Mish, self).__init__(**kwargs) 93 | self.supports_masking = True 94 | 95 | def call(self, inputs): 96 | return inputs * K.tanh(K.softplus(inputs)) 97 | 98 | def get_config(self): 99 | base_config = super(Mish, self).get_config() 100 
| return dict(list(base_config.items()) + list(config.items())) 101 | 102 | def compute_output_shape(self, input_shape): 103 | return input_shape 104 | 105 | def mish(x): 106 | return tf.keras.layers.Lambda(lambda x: x*K.tanh(K.softplus(x)))(x) 107 | 108 | 109 | tf.keras.utils.get_custom_objects().update({'mish': tf.keras.layers.Activation(mish)}) 110 | 111 | def create_resnet(n_features, n_features_2, n_labels, hidden_size, 112 | learning_rate=1e-3, label_smoothing = 0.005): 113 | input_1 = tf.keras.layers.Input(shape = (n_features,), name = 'Input1') 114 | input_2 = tf.keras.layers.Input(shape = (n_features_2,), name = 'Input2') 115 | 116 | head_1 = tf.keras.Sequential([ 117 | tf.keras.layers.BatchNormalization(), 118 | tf.keras.layers.Dropout(0.4), 119 | tf.keras.layers.Dense(hidden_size, activation="mish"), 120 | tf.keras.layers.BatchNormalization(), 121 | tf.keras.layers.Dropout(0.4), 122 | tf.keras.layers.Dense(hidden_size//2, activation = "mish") 123 | ],name='Head1') 124 | 125 | input_3 = head_1(input_1) 126 | input_3_concat = tf.keras.layers.Concatenate()([input_2, input_3]) 127 | 128 | head_2 = tf.keras.Sequential([ 129 | tf.keras.layers.BatchNormalization(), 130 | tf.keras.layers.Dropout(0.2), 131 | tf.keras.layers.Dense(hidden_size, "mish"), 132 | tf.keras.layers.BatchNormalization(), 133 | tf.keras.layers.Dropout(0.2), 134 | tf.keras.layers.Dense(hidden_size, "mish"), 135 | ],name='Head2') 136 | 137 | input_4 = head_2(input_3_concat) 138 | input_4_concat = tf.keras.layers.Concatenate()([input_3, input_4]) 139 | 140 | head_3 = tf.keras.Sequential([ 141 | tf.keras.layers.BatchNormalization(), 142 | tf.keras.layers.Dense(hidden_size, kernel_initializer='lecun_normal', activation='mish'), 143 | tf.keras.layers.BatchNormalization(), 144 | tf.keras.layers.Dropout(0.3), 145 | tf.keras.layers.Dense(hidden_size//2, kernel_initializer='lecun_normal', activation='mish'), 146 | tf.keras.layers.BatchNormalization(), 147 | tf.keras.layers.Dropout(0.3), 148 | tf.keras.layers.Dense(n_labels, activation="sigmoid") 149 | ],name='Head3') 150 | 151 | output = head_3(input_4_concat) 152 | 153 | 154 | model = tf.keras.models.Model(inputs = [input_1, input_2], outputs = output) 155 | model.compile(optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate), 156 | loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing), 157 | metrics=['AUC']) 158 | 159 | return model 160 | 161 | class UtilEvaluation(Callback): 162 | def __init__(self, val_df=None, interval=3, start_day=100, end_day=500, num_days=50): 163 | super(UtilEvaluation, self).__init__() 164 | 165 | self.interval = interval 166 | self.val_df = val_df 167 | self.start_day = start_day 168 | self.end_day = end_day 169 | self.num_days = num_days 170 | 171 | def on_epoch_end(self, epoch, logs={}): 172 | if (epoch+1) % self.interval == 0: 173 | print("*"*40) 174 | print(f"Epoch [{epoch+1:d}/{EPOCHS}]:") 175 | all_score = [] 176 | all_val_pred = self.val_df[['date', 'weight', 'resp']].copy() 177 | all_val_pred['action'] = 0 178 | 179 | for day in range(self.start_day, self.end_day, self.num_days): 180 | valid = self.val_df[self.val_df.date.isin(range(day, day+self.num_days))] 181 | valid = valid[valid.weight > 0] 182 | 183 | x_tt = valid.loc[:, features].values 184 | x_tt_1 = x_tt.take(features_1_list, axis=-1) 185 | x_tt_2 = x_tt.take(features_2_list, axis=-1) 186 | val_pred = self.model([x_tt_1, x_tt_2], training = False).numpy() 187 | val_pred = median_avg(val_pred) 188 | val_pred = np.where(val_pred >= 0.5, 1, 
0).astype(int) 189 | valid_score = utility_score_bincount(date=valid.date.values, 190 | weight=valid.weight.values, 191 | resp=valid.resp.values, 192 | action=val_pred) 193 | all_score.append(valid_score) 194 | all_val_pred.loc[self.val_df.date.isin(range(day, day+self.num_days)), 'action']=val_pred 195 | all_val_pred.to_csv(f'val_pred_fold_{_fold}.csv', index=False) 196 | print(f"Day {day:3d}-{day+self.num_days-1:3d} - util score: {valid_score:.2f}") 197 | 198 | print(f"Utility score mean with {self.num_days} span: {np.mean(all_score):.2f} ") 199 | print(f"Utility score std with {self.num_days} span: {np.std(all_score):.2f}") 200 | print("*"*40, '\n') 201 | 202 | #%% 203 | tf.keras.backend.clear_session() 204 | SEED = 1127 205 | tf.random.set_seed(SEED) 206 | tf_model = create_resnet(len(features_1), len(features_2), len(resp_cols), 207 | hidden_size=300, learning_rate=1e-4, label_smoothing=5e-03) 208 | util_cb = UtilEvaluation(val_df=valid, start_day=valid.date.min(), end_day=valid.date.max()) 209 | tf_model.summary() 210 | # %% 211 | EPOCHS = 50 212 | tf_model.fit(X_train, y_train, epochs=EPOCHS, batch_size=8192, 213 | validation_data=(X_val, y_val), 214 | verbose=1, 215 | callbacks=[util_cb] 216 | ) 217 | 218 | # save model 219 | tf_model.save(f'tf_res_fold_{_fold}_ep_{EPOCHS}.h5') 220 | # %% 221 | 222 | all_val_preds = [] 223 | for i in range(6): 224 | val_preds = pd.read_csv(MODEL_DIR+f'val_pred_fold_{5-i}.csv') 225 | all_val_preds.append(val_preds) 226 | #%% 227 | all_val_preds = pd.concat(all_val_preds,axis=0) 228 | all_val_preds = all_val_preds.query('date >= 249 and date <=499') 229 | valid_score = utility_score_bincount(date=all_val_preds.date.values, 230 | weight=all_val_preds.weight.values, 231 | resp=all_val_preds.resp.values, 232 | action=all_val_preds.action.values) 233 | # %% 234 | -------------------------------------------------------------------------------- /mlp/run_train_final_2_overfit.py: -------------------------------------------------------------------------------- 1 | # %% 2 | 3 | import os 4 | import sys 5 | current_path = os.path.dirname(os.path.abspath(__file__)) 6 | HOME = os.path.dirname(current_path) 7 | MODEL_DIR = os.path.join(HOME, 'models') 8 | DATA_DIR = os.path.join(HOME, 'data') 9 | sys.path.append(HOME) 10 | 11 | from utils import * 12 | from mlp import * 13 | 14 | import torch 15 | import torch.nn.functional as F 16 | import torch.nn as nn 17 | torch.backends.cudnn.deterministic = True # for bincount 18 | 19 | 20 | from torchsummary import summary 21 | # %% 22 | ''' 23 | Training script (excluding volatile days): 24 | 1. data: after day 85, excluding (2, 294, 36, 270) 25 | 2. data: the fillna is using the past day mean (after excluding the days above) 26 | 3. data: all five resps 27 | 4. 
training: finetuning using resp columns as regularizer, every 10 iterations 28 | ''' 29 | 30 | DEBUG = False 31 | LOAD_PRETRAIN = False 32 | 33 | DROP_ZERO_WEIGHT = True 34 | 35 | TRAINING_START = 0 36 | FINETUNE_BATCH_SIZE = 4096_00 37 | BATCH_SIZE = 8192 38 | EPOCHS = 60 39 | FINETUNE_EPOCHS = 2 40 | LEARNING_RATE = 1e-4 41 | WEIGHT_DECAY = 1e-5 42 | EARLYSTOP_NUM = 5 43 | NFOLDS = 1 44 | SCALING = 12 45 | THRESHOLD = 0.5 46 | 47 | DAYS_TO_DROP = list(range(86))+[270, 294] 48 | VOLATILE_DAYS = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 49 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 50 | VOLATILE_MODEL = True 51 | 52 | 53 | SEED = 802 54 | np.random.seed(SEED) 55 | pd.core.common.random_state(SEED) 56 | torch.manual_seed(SEED) 57 | torch.cuda.manual_seed(SEED) 58 | torch.backends.cudnn.deterministic = True 59 | torch.backends.cudnn.benchmark = False 60 | if torch.cuda.is_available(): 61 | torch.cuda.manual_seed_all(SEED) 62 | 63 | splits = { 64 | 'train_days': (range(0,500), range(0,466), range(0,433)), 65 | 'valid_days': (range(467, 500), range(434, 466), range(401, 433)), 66 | } 67 | 68 | fold = 2 69 | 70 | if fold == 0: 71 | SAVE_THRESH = 2000 72 | VAL_OFFSET = 70 73 | elif fold == 1: 74 | SAVE_THRESH = 1800 75 | VAL_OFFSET = 70 76 | elif fold == 2: 77 | SAVE_THRESH = 1000 78 | VAL_OFFSET = 70 79 | EPSILON = 1e-2 80 | 81 | if VOLATILE_MODEL: 82 | resp_cols = ['resp_3','resp','resp_4'] 83 | resp_cols_all = resp_cols 84 | util_cols = ['resp_3','resp','resp_4'] 85 | # util_cols =['resp_3','resp', 'resp_4'] 86 | resp_index = [resp_cols_all.index(r) for r in util_cols] 87 | target_cols = ['action_3','action', 'action_4'] 88 | else: 89 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3','resp_4'] 90 | resp_cols_all = resp_cols 91 | util_cols =['resp_1','resp_2', 'resp_3', 'resp', 'resp_4'] 92 | # util_cols =['resp_3','resp', 'resp_4'] 93 | resp_index = [resp_cols_all.index(r) for r in util_cols] 94 | target_cols = ['action', 'action_1','action_2','action_3', 'action_4'] 95 | 96 | feat_cols = [f'feature_{i}' for i in range(130)] 97 | feat_cols += ['cross_41_42_43', 'cross_1_2'] 98 | 99 | 100 | noisy_index = [3, 4, 5, 6, 8, 10, 12, 14, 16, 37, 38, 39, 40, 72, 73, 74, 75, 76, 101 | 78, 79, 80, 81, 82, 83] 102 | negative_index = [73, 75, 76, 77, 79, 81, 82] 103 | hybrid_index = [55, 56, 57, 58, 59] 104 | running_indices = sorted([0]+noisy_index+negative_index+hybrid_index) 105 | 106 | rm_500_cols = ['feature_' + str(i) + '_rm_500' for i in running_indices] 107 | 108 | #### adding the running mean 109 | # feat_cols += rm_500_cols 110 | 111 | ###### adding weight to the features ####### 112 | # feat_cols.extend(['weight']) 113 | 114 | 115 | 116 | 117 | f = median_avg 118 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 119 | 120 | # %% 121 | with timer("Preprocessing train"): 122 | # train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 123 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 124 | train = pd.read_parquet(train_parquet) 125 | 126 | # feat_add_parquet = os.path.join(DATA_DIR, 'feat_rm_500.parquet') 127 | # feat_add_df = pd.read_parquet(feat_add_parquet) 128 | 129 | # train = pd.concat([train, feat_add_df], axis=1) 130 | 131 | if not VOLATILE_MODEL: 132 | # train = train.query(f'date not in {VOLATILE_DAYS}').reset_index(drop = True) 133 | train = train.query('date > 85').reset_index(drop=True) 134 | train.fillna(train.mean(), inplace=True) 135 | 136 | train = train[train['weight'] > 0].reset_index(drop = True) 137 | 138 | # index_zero_weight = 
(train['weight']==0) 139 | # index_zero_weight = np.where(index_zero_weight)[0] 140 | # index_zero_weight = np.random.choice(index_zero_weight, size=int(0.4*len(index_zero_weight))) 141 | # train.loc[index_zero_weight, ['weight']] = train.loc[index_zero_weight, ['weight']].clip(1e-7) 142 | # # train = train[train['weight'] > 0].reset_index(drop = True) 143 | 144 | train['action'] = (train['resp'] > 0).astype(int) 145 | for c in range(1,5): 146 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 147 | 148 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 149 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5).astype(np.float32) 150 | 151 | #### concat with moving mean features 152 | 153 | valid = train.loc[train.date.isin(splits['valid_days'][fold])].reset_index(drop=True) 154 | train = train.loc[train.date.isin(splits['train_days'][fold])].reset_index(drop=True) 155 | 156 | # %% 157 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 158 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 159 | 160 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 161 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 162 | 163 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 164 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 165 | # model = ResidualMLPLite(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 166 | model.to(device) 167 | summary(model, input_size=(len(feat_cols), )) 168 | # %% 169 | regularizer = UtilityLoss(alpha=EPSILON, scaling=SCALING, normalize=None, resp_index=resp_index) 170 | 171 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 172 | 173 | optimizer = torch.optim.Adam(model.parameters(), 174 | lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY,) 175 | 176 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 177 | T_0=50, T_mult=2, 178 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 179 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8) 180 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 181 | # steps_per_epoch=len(train_loader), epochs=EPOCHS) 182 | # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=LEARNING_RATE*1e-2, 183 | # max_lr=LEARNING_RATE, step_size_up=5, 184 | # mode="triangular2") 185 | # scheduler_add = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20,39], gamma=0.1) 186 | # scheduler_add = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8) 187 | 188 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=10) 189 | 190 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 191 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=SAVE_THRESH, util_offset=VAL_OFFSET) 192 | # %% 193 | for epoch in range(EPOCHS): 194 | 195 | # train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 196 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 197 | # scheduler_add.step() 198 | lr = optimizer.param_groups[0]['lr'] 199 | if (epoch+1) % 10 == 0: 200 | _ = train_epoch_finetune(model, finetune_optimizer, 
scheduler, 201 | regularizer, finetune_loader, device, loss_fn=loss_fn) 202 | 203 | valid_pred = valid_epoch(model, valid_loader, device) 204 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 205 | f=median_avg, threshold=0.5, target_cols=target_cols) 206 | if VOLATILE_MODEL: 207 | model_file = MODEL_DIR + f"/pt_vol_overfit_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 208 | else: 209 | model_file = MODEL_DIR + f"/pt_overfit_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 210 | early_stop(epoch, valid_auc, model, 211 | model_path=model_file, 212 | epoch_utility_score=valid_score) 213 | 214 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {fold}") 215 | tqdm.write(f"Train loss: {train_loss:.4e} \t Current learning rate: {lr:.4e}") 216 | tqdm.write(f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 217 | tqdm.write(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 218 | if early_stop.early_stop: 219 | print("\nEarly stopping") 220 | break 221 | # %% 222 | 223 | feat_cols = [f'feature_{i}' for i in range(130)] 224 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 225 | 226 | 227 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, 228 | output_size=len(target_cols)) 229 | model.to(device) 230 | try: 231 | print(f"Loading {early_stop.model_path} for cv check.\n") 232 | model_weights = early_stop.model_path 233 | model.load_state_dict(torch.load(model_weights)) 234 | model.eval(); 235 | 236 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 237 | train = preprocess_final(train_parquet, drop_zero_weight=True) 238 | 239 | CV_START_DAY = 401 240 | CV_DAYS = 32 241 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 242 | batch_size =2*8192, f=median_avg, threshold=0.5, 243 | target_cols=target_cols, 244 | feat_cols=feat_cols, 245 | resp_cols=resp_cols) 246 | except: 247 | FileNotFoundError 248 | print("Model not found") 249 | # %% 250 | -------------------------------------------------------------------------------- /mlp/run_train_final_2.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | ''' 20 | Training script (excluding volatile days): 21 | 1. data: after day 85, excluding (2, 294, 36, 270) 22 | 2. data: the fillna is using the past day mean (after excluding the days above) 23 | 3. data: all five resps 24 | 4. 
training: finetuning using resp columns as regularizer, every 10 iterations 25 | ''' 26 | 27 | DEBUG = False 28 | LOAD_PRETRAIN = False 29 | 30 | DROP_ZERO_WEIGHT = True 31 | 32 | FINETUNE_BATCH_SIZE = 4096_00 33 | BATCH_SIZE = 8192 34 | EPOCHS = 80 35 | FINETUNE_EPOCHS = 2 36 | LEARNING_RATE = 1e-4 37 | WEIGHT_DECAY = 1e-5 38 | EARLYSTOP_NUM = 5 39 | SCALING = 12 40 | THRESHOLD = 0.5 41 | 42 | DAYS_TO_DROP = list(range(86))+[270, 294] 43 | VOLATILE_DAYS = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 44 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 45 | VOLATILE_MODEL = True 46 | 47 | fold = 2 48 | 49 | # s = 11 for fold 1 50 | SEED = 1127802//8+fold 51 | np.random.seed(SEED) 52 | pd.core.common.random_state(SEED) 53 | torch.manual_seed(SEED) 54 | torch.cuda.manual_seed(SEED) 55 | torch.backends.cudnn.deterministic = True 56 | torch.backends.cudnn.benchmark = False 57 | if torch.cuda.is_available(): 58 | torch.cuda.manual_seed_all(SEED) 59 | 60 | splits = { 61 | 'train_days': (range(0,457), range(0,424), range(0,391)), 62 | 'valid_days': (range(467, 500), range(434, 466), range(401, 433)), 63 | } 64 | 65 | if fold == 0: 66 | SAVE_THRESH = 1000 67 | VAL_OFFSET = 150 68 | elif fold == 1: 69 | LEARNING_RATE = 1e-3 70 | SAVE_THRESH = 1100 71 | VAL_OFFSET = 150 72 | elif fold == 2: 73 | SAVE_THRESH = 100 74 | VAL_OFFSET = 100 75 | EPOCHS = 40 76 | LEARNING_RATE = 1e-3 77 | EPSILON = 1e-2 78 | 79 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3','resp_4'] 80 | resp_cols_all = resp_cols 81 | target_cols = ['action', 'action_1','action_2','action_3', 'action_4'] 82 | feat_cols = [f'feature_{i}' for i in range(130)] 83 | feat_cols += ['cross_41_42_43', 'cross_1_2'] 84 | 85 | 86 | noisy_index = [3, 4, 5, 6, 8, 10, 12, 14, 16, 37, 38, 39, 40, 72, 73, 74, 75, 76, 87 | 78, 79, 80, 81, 82, 83] 88 | negative_index = [73, 75, 76, 77, 79, 81, 82] 89 | hybrid_index = [55, 56, 57, 58, 59] 90 | running_indices = sorted([0]+noisy_index+negative_index+hybrid_index) 91 | 92 | rm_500_cols = ['feature_' + str(i) + '_rm_500' for i in running_indices] 93 | 94 | #### adding the running mean 95 | # feat_cols += rm_500_cols 96 | 97 | ###### adding weight to the features ####### 98 | # feat_cols.extend(['weight']) 99 | 100 | util_cols =['resp_1','resp_2', 'resp_3', 'resp', 'resp_4'] 101 | # util_cols =['resp_3','resp', 'resp_4'] 102 | resp_index = [resp_cols_all.index(r) for r in util_cols] 103 | 104 | 105 | f = median_avg 106 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 107 | 108 | # %% 109 | with timer("Preprocessing train"): 110 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 111 | train = pd.read_parquet(train_parquet) 112 | 113 | # feat_add_parquet = os.path.join(DATA_DIR, 'feat_rm_500.parquet') 114 | # feat_add_df = pd.read_parquet(feat_add_parquet) 115 | 116 | # train = pd.concat([train, feat_add_df], axis=1) 117 | 118 | if not VOLATILE_MODEL: 119 | train = train.query(f'date not in {VOLATILE_DAYS}').reset_index(drop = True) 120 | train = train.query('date > 85').reset_index(drop=True) 121 | 122 | train = train[train['weight'] > 0].reset_index(drop = True) 123 | 124 | train['action'] = (train['resp'] > 0).astype(int) 125 | for c in range(1,5): 126 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 127 | 128 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 129 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5).astype(np.float32) 130 | 131 | #### concat with moving mean features 132 | 133 | 
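# Annotation (not in the original script): the split below is a plain time-based
# holdout with a purge gap between the train and validation date ranges, which limits
# leakage from temporally adjacent rows. For fold=2, train_days = range(0, 391)
# (dates 0-390) and valid_days = range(401, 433) (dates 401-432), so dates 391-400 are
# dropped entirely. A quick sanity check on the configured ranges:
assert max(splits['train_days'][fold]) < min(splits['valid_days'][fold])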
valid = train.loc[train.date.isin(splits['valid_days'][fold])].reset_index(drop=True) 134 | train = train.loc[train.date.isin(splits['train_days'][fold])].reset_index(drop=True) 135 | 136 | # %% 137 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 138 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 139 | 140 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 141 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 142 | 143 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 144 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 145 | # model = ResidualMLPLite(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 146 | model.to(device) 147 | summary(model, input_size=(len(feat_cols), )) 148 | # %% 149 | regularizer = UtilityLoss(alpha=5e-2, scaling=SCALING, normalize=None, resp_index=resp_index) 150 | 151 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 152 | 153 | optimizer = torch.optim.Adam(model.parameters(), 154 | lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY,) 155 | 156 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 157 | T_0=50, T_mult=2, 158 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 159 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8) 160 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 161 | # steps_per_epoch=len(train_loader), epochs=EPOCHS) 162 | # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=LEARNING_RATE*1e-2, 163 | # max_lr=LEARNING_RATE, step_size_up=5, 164 | # mode="triangular2") 165 | # scheduler_add = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20,39], gamma=0.1) 166 | # scheduler_add = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8) 167 | 168 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=10) 169 | 170 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 171 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=SAVE_THRESH, util_offset=VAL_OFFSET) 172 | # %% 173 | for epoch in range(EPOCHS): 174 | 175 | # train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 176 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 177 | # scheduler_add.step() 178 | lr = optimizer.param_groups[0]['lr'] 179 | if (epoch+1) % 10 == 0: 180 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 181 | regularizer, finetune_loader, device, loss_fn=loss_fn) 182 | 183 | valid_pred = valid_epoch(model, valid_loader, device) 184 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 185 | f=median_avg, threshold=0.5, target_cols=target_cols) 186 | if VOLATILE_MODEL: 187 | model_file = os.path.join(MODEL_DIR, 188 | f"pt_volatile_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth") 189 | else: 190 | model_file = os.path.join(MODEL_DIR, 191 | f"pt_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth") 192 | early_stop(epoch, valid_auc, model, 193 | model_path=model_file, 194 | epoch_utility_score=valid_score) 195 | 196 | # if early_stop.model_saved: 197 | # for g in optimizer.param_groups: 198 | # g['lr'] *= 0.1 199 | # lr = 
optimizer.param_groups[0]['lr'] 200 | # print(f"\nNew learning rate: {lr:.4e}") 201 | 202 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {fold}") 203 | tqdm.write(f"Train loss: {train_loss:.4e} \t Current learning rate: {lr:.4e}") 204 | tqdm.write(f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 205 | tqdm.write(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 206 | if early_stop.early_stop: 207 | print("\nEarly stopping") 208 | break 209 | 210 | #%% 211 | # for epoch in range(FINETUNE_EPOCHS): 212 | # util_loss, train_loss = train_epoch_finetune(model, finetune_optimizer, scheduler, 213 | # regularizer, finetune_loader, device, loss_fn=loss_fn) 214 | 215 | # valid_pred = valid_epoch(model, valid_loader, device) 216 | # valid_auc, valid_score = get_valid_score(valid_pred, valid, 217 | # f=median_avg, threshold=0.5, target_cols=target_cols) 218 | 219 | # print(f"\n[Finetune epoch {epoch+1}/{FINETUNE_EPOCHS}] \t Fold {_fold}") 220 | # print(f"Train loss: {train_loss:.4e} \t Util loss: {util_loss:.2f}") 221 | # print(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 222 | 223 | # if DEBUG: 224 | # torch.save(model.state_dict(), MODEL_DIR + f"/model_{_fold}.pth") 225 | # %% 226 | 227 | 228 | feat_cols = [f'feature_{i}' for i in range(130)] 229 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 230 | 231 | 232 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, 233 | output_size=len(target_cols)) 234 | model.to(device) 235 | try: 236 | print(f"Loading {early_stop.model_path} for cv check.\n") 237 | model_weights = early_stop.model_path 238 | # model_weights = os.path.join(MODEL_DIR, 'final_1_util_865_auc_0.5450.pth') 239 | model.load_state_dict(torch.load(model_weights)) 240 | model.eval(); 241 | 242 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 243 | train = preprocess_final(train_parquet, drop_zero_weight=True) 244 | 245 | CV_START_DAY = 401 246 | CV_DAYS = 32 247 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 248 | batch_size =2*8192, f=median_avg, threshold=0.5, 249 | target_cols=target_cols, 250 | feat_cols=feat_cols, 251 | resp_cols=resp_cols) 252 | except: 253 | FileNotFoundError 254 | print("Model not found") 255 | 256 | # %% 257 | -------------------------------------------------------------------------------- /mlp/debug_embedding_tag.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | import pandas as pd 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchsummary import summary 8 | # from fastai.tabular.all import TabularPandas, RandomSplitter, CategoryBlock, MultiCategoryBlock, range_of, accuracy, tabular_learner, TabularDataLoaders 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from mlp import * 17 | from utils import * 18 | from utils_js import * 19 | #%% 20 | 21 | HIDDEN_LAYERS = [400, 400, 400] # hidden layer size for the embedding model 22 | N_FEAT_TAGS = 29 23 | N_TARGETS = 6 24 | 25 | BATCH_SIZE = 8196 26 | EARLYSTOP_NUM = 5 27 | FINETUNE_BATCH_SIZE = 51200 28 | 29 | EPOCHS = 100 30 | 31 | N_DENOISED_TARGET = 1 32 | LEARNING_RATE = 1e-4 33 | WEIGHT_DECAY = 1e-4 34 | 35 | N_FEATURES = 130 36 | N_FEAT_TAGS = 29 37 | 38 | device = torch.device('cuda' if 
torch.cuda.is_available() else 'cpu') 39 | 40 | dtype = { 41 | 'feature' : 'str', 42 | 'tag_0' : 'int8' 43 | } 44 | for i in range (1, 29): 45 | k = 'tag_' + str (i) 46 | dtype[k] = 'int8' 47 | 48 | features_df = pd.read_csv (os.path.join(DATA_DIR, 'features.csv'), usecols=range(1,30), dtype=dtype) 49 | # N_FEATURES = features_df.shape[0] # the features.csv has 130 features (1st row) = no of features in train.csv (feature_0 to feature_129) 50 | # N_FEAT_TAGS = features_df.shape[1] # the features.csv has 29 tags 51 | 52 | resp_cols = ['resp_1', 'resp_2', 'resp_3','resp_4', 'resp'] 53 | feat_cols = [f'feature_{i}' for i in range(130)] 54 | resp_cols = ['resp', 'resp_dn_0', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 55 | target_cols = ['action', 'action_dn_0', 'action_1', 'action_2', 'action_3', 'action_4'] 56 | # %% 57 | with timer("Preprocessing train"): 58 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 59 | train = pd.read_parquet(train_parquet) 60 | train = train.query ('date > 85').reset_index (drop = True) 61 | # df = df[df['weight'] != 0].reset_index (drop = True) 62 | 63 | train.fillna(train.mean(),inplace=True) 64 | train = add_denoised_target(train, num_dn_target=N_DENOISED_TARGET) 65 | y = np.stack ([(train[c] > 0).astype ('int') for c in resp_cols]).T 66 | # train.drop (columns=['weight', 'date', 'ts_id']+resp_cols, inplace=True) 67 | train['action'] = (train['resp'] > 0).astype('int') 68 | for c in range(1,5): 69 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype('int') 70 | valid = train.loc[train.date > 450].reset_index(drop=True) 71 | # %% 72 | class FeatureFFN (nn.Module): 73 | 74 | def __init__(self, inputCount=130, 75 | outputCount=5, 76 | hiddenLayerCounts=[150, 150, 150], 77 | drop_prob=0.2, 78 | activation=nn.SiLU() # this is swish activation 79 | ): 80 | ''' 81 | Feature generation embedding net, no output 82 | ''' 83 | super(FeatureFFN, self).__init__() 84 | 85 | self.activation = activation 86 | self.dropout = nn.Dropout (drop_prob) 87 | self.batchnorm0 = nn.BatchNorm1d (inputCount) 88 | self.dense1 = nn.Linear (inputCount, hiddenLayerCounts[0]) 89 | self.batchnorm1 = nn.BatchNorm1d (hiddenLayerCounts[0]) 90 | self.dense2 = nn.Linear(hiddenLayerCounts[0], hiddenLayerCounts[1]) 91 | self.batchnorm2 = nn.BatchNorm1d (hiddenLayerCounts[1]) 92 | self.dense3 = nn.Linear(hiddenLayerCounts[1], hiddenLayerCounts[2]) 93 | self.batchnorm3 = nn.BatchNorm1d (hiddenLayerCounts[2]) 94 | self.outDense = None 95 | if outputCount > 0: 96 | self.outDense = nn.Linear(hiddenLayerCounts[-1], outputCount) 97 | 98 | def forward (self, x): 99 | 100 | # x = self.dropout (self.batchnorm0 (x)) 101 | x = self.batchnorm0(x) 102 | x = self.dropout (self.activation (self.batchnorm1 (self.dense1 (x)))) 103 | x = self.dropout (self.activation (self.batchnorm2 (self.dense2 (x)))) 104 | x = self.dropout (self.activation (self.batchnorm3 (self.dense3 (x)))) 105 | # x = self.outDense (x) 106 | return x 107 | # %% 108 | class EmbedFNN (nn.Module): 109 | 110 | def __init__(self, hidden_layers=HIDDEN_LAYERS, 111 | embed_dim=N_FEAT_TAGS, 112 | features_tag_matrix=features_df): 113 | 114 | super(EmbedFNN, self).__init__() 115 | global N_FEAT_TAGS 116 | N_FEAT_TAGS = 29 117 | 118 | # store the features to tags mapping as a datframe tdf, feature_i mapping is in tdf[i, :] 119 | # dtype = {'tag_0' : 'int8'} 120 | # for i in range (1, 29): 121 | # k = 'tag_' + str (i) 122 | # dtype[k] = 'int8' 123 | # t_df = pd.read_csv ('features.csv', usecols=range (1,N_FEAT_TAGS+1), dtype=dtype) 124 | # 
tag_29 is for feature_0 125 | features_tag_matrix['tag_29'] = np.array ([1] + ([0]*(N_FEATURES-1)) ).astype ('int8') 126 | self.features_tag_matrix = torch.tensor(features_tag_matrix.values, dtype=torch.float32) 127 | # torch.tensor(t_df.to_numpy()) 128 | N_FEAT_TAGS += 1 129 | 130 | 131 | # embeddings for the tags. Each feature is taken a an embedding which is an avg. of its' tag embeddings 132 | self.embed_dim = embed_dim 133 | self.tag_embedding = nn.Embedding(N_FEAT_TAGS+1, embed_dim) # create a special tag if not known tag for any feature 134 | self.tag_weights = nn.Linear(N_FEAT_TAGS, 1) 135 | 136 | drop_prob = 0.5 137 | self.ffn = FeatureFFN(inputCount=(N_FEATURES+embed_dim), 138 | outputCount=0, 139 | hiddenLayerCounts=[(hidden_layers[0]+embed_dim), 140 | (hidden_layers[1]+embed_dim), 141 | (hidden_layers[2]+embed_dim)], 142 | drop_prob=drop_prob) 143 | self.outDense = nn.Linear (hidden_layers[2]+embed_dim, N_TARGETS) 144 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 145 | return 146 | 147 | def features2emb (self): 148 | """ 149 | idx : int feature index 0 to N_FEATURES-1 (129) 150 | """ 151 | 152 | all_tag_idxs = torch.LongTensor(np.arange(N_FEAT_TAGS)) # (29,) 153 | tag_bools = self.features_tag_matrix.to(self.device) # (130, 29) 154 | # print ('tag_bools.shape =', tag_bools.size()) 155 | all_tag_idxs = all_tag_idxs.to(self.device) 156 | f_emb = self.tag_embedding(all_tag_idxs).repeat(N_FEATURES, 1, 1) 157 | #;print ('1. f_emb =', f_emb) # (29, 7) * (130, 1, 1) = (130, 29, 7) 158 | # print ('f_emb.shape =', f_emb.size()) 159 | f_emb = f_emb * tag_bools[:, :, None] 160 | #;print ('2. f_emb =', f_emb) # (130, 29, 7) * (130, 29, 1) = (130, 29, 7) 161 | # print ('f_emb.shape =', f_emb.size()) 162 | 163 | # Take avg. of all the present tag's embeddings to get the embedding for a feature 164 | s = torch.sum (tag_bools, dim=1) # (130,) 165 | f_emb = torch.sum (f_emb, dim=-2) / s[:, None] 166 | # (130, 7) 167 | # print ('f_emb =', f_emb) 168 | # print ('f_emb.shape =', f_emb.shape) 169 | 170 | # take a linear combination of the present tag's embeddings 171 | # f_emb = f_emb.permute (0, 2, 1) # (130, 7, 29) 172 | # f_emb = self.tag_weights (f_emb) 173 | # #;print ('3. f_emb =', f_emb) # (130, 7, 1) 174 | # f_emb = torch.squeeze (f_emb, dim=-1) 175 | # #;print ('4. 
f_emb =', f_emb) # (130, 7) 176 | return f_emb.detach().to(self.device) 177 | 178 | def forward (self, features, cat_featrs=None): 179 | """ 180 | when you call `model (x ,y, z, ...)` then this method is invoked 181 | """ 182 | 183 | # cat_featrs = None 184 | features = features.view (-1, N_FEATURES) 185 | f_emb = self.features2emb() 186 | features_2 = torch.matmul (features, f_emb) 187 | 188 | # Concatenate the two features (features + their embeddings) 189 | features = torch.hstack ((features, features_2)) 190 | 191 | x = self.ffn(features) 192 | out = self.outDense(x) 193 | return out 194 | 195 | # %% 196 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 197 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 198 | 199 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 200 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 201 | 202 | # %% 203 | util_cols = resp_cols 204 | resp_index = [resp_cols.index(r) for r in util_cols] 205 | 206 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, normalize=None, resp_index=resp_index) 207 | 208 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 209 | 210 | model = EmbedFNN() 211 | # model.to(device); 212 | # optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 213 | optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 214 | 215 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 216 | # steps_per_epoch=len(train_loader), 217 | # epochs=EPOCHS) 218 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 219 | T_0=10, T_mult=1, 220 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 221 | 222 | finetune_loader = DataLoader( 223 | train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 224 | 225 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 226 | 227 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=5900) 228 | 229 | # %% 230 | _fold = 7 231 | SEED = 802 232 | get_seed(SEED+SEED*_fold) 233 | lr = [] 234 | 235 | for epoch in range(EPOCHS): 236 | 237 | train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 238 | # train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 239 | lr.append(optimizer.param_groups[0]['lr']) 240 | if (epoch+1) % 10 == 0: 241 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 242 | regularizer, finetune_loader, device, loss_fn=loss_fn) 243 | 244 | valid_pred = valid_epoch(model, valid_loader, device) 245 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 246 | f=median_avg, threshold=0.5, target_cols=target_cols) 247 | model_file = MODEL_DIR + \ 248 | f"/emb_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 249 | early_stop(valid_auc, model, model_path=model_file, 250 | epoch_utility_score=valid_score) 251 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 252 | tqdm.write( 253 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 254 | tqdm.write( 255 | f"Best util: {early_stop.best_utility_score:.2f} \t {early_stop.message} ") 256 | tqdm.write( 257 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 258 | if early_stop.early_stop: 259 | print("\nEarly stopping") 260 | break 261 | # %% 262 | 
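# A minimal, self-contained sketch of the tag-embedding averaging that EmbedFNN.features2emb
# performs above, using toy sizes instead of the real 130 features / 30 tags. The names
# (toy_tag_matrix, batch) and the values are illustrative assumptions only, not part of the
# training pipeline.
import torch
import torch.nn as nn

N_FEAT, N_TAG, EMB_DIM = 4, 3, 5
toy_tag_matrix = torch.tensor([[1., 0., 1.],   # feature 0 carries tags 0 and 2
                               [0., 1., 0.],   # feature 1 carries tag 1
                               [1., 1., 0.],   # feature 2 carries tags 0 and 1
                               [0., 0., 1.]])  # feature 3 carries tag 2
tag_embedding = nn.Embedding(N_TAG, EMB_DIM)
emb = tag_embedding(torch.arange(N_TAG)).repeat(N_FEAT, 1, 1)   # (N_FEAT, N_TAG, EMB_DIM)
emb = emb * toy_tag_matrix[:, :, None]                          # zero out absent tags
f_emb = emb.sum(dim=-2) / toy_tag_matrix.sum(dim=1)[:, None]    # average the present tags
batch = torch.randn(2, N_FEAT)                                  # two toy rows of "features"
print(f_emb.shape, (batch @ f_emb).shape)                       # (4, 5) and (2, 5), as in forward()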
-------------------------------------------------------------------------------- /mlp/debug_train_utility_finetune.py: -------------------------------------------------------------------------------- 1 | # %% 2 | 3 | from torchsummary import summary 4 | import os 5 | import sys 6 | import torch 7 | import torch.nn.functional as F 8 | import torch.nn as nn 9 | torch.backends.cudnn.deterministic = True # for bincount 10 | 11 | 12 | current_path = os.path.dirname(os.path.abspath(__file__)) 13 | HOME = os.path.dirname(current_path) 14 | MODEL_DIR = os.path.join(HOME, 'models') 15 | DATA_DIR = os.path.join(HOME, 'data') 16 | sys.path.append(HOME) 17 | 18 | from mlp import * 19 | from utils_js import * 20 | from utils import * 21 | # %% 22 | 23 | ''' 24 | Training script finetuning using a utility regularizer 25 | ''' 26 | 27 | DEBUG = True 28 | FINETUNE = True 29 | BATCH_SIZE = 4096 30 | 31 | FINETUNE_BATCH_SIZE = 1024_00 32 | EPOCHS = 50 33 | FINETUNE_EPOCHS = 20 34 | LEARNING_RATE = 1e-3 35 | WEIGHT_DECAY = 1e-5 36 | EARLYSTOP_NUM = 10 37 | NFOLDS = 1 38 | SCALING = 10 39 | THRESHOLD = 0.5 40 | SEED = 802 41 | get_seed(SEED) 42 | 43 | # f = np.median 44 | # f = np.mean 45 | f = median_avg 46 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 47 | 48 | # %% 49 | with timer("Preprocessing train"): 50 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 51 | train, valid = preprocess_pt(train_parquet, drop_weight=True) 52 | 53 | for c in range(1, 5): 54 | print(f'action based on resp_{c} mean: ', ' '*10, 55 | train['action_'+str(c)].astype(int).mean()) 56 | 57 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 58 | resp_cols_all = resp_cols 59 | target_cols = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4'] 60 | feat_cols = [f'feature_{i}' for i in range(130)] 61 | # f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 62 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 63 | # %% 64 | train_set = ExtendedMarketDataset( 65 | train, features=feat_cols, targets=target_cols, resp=resp_cols) 66 | train_loader = DataLoader( 67 | train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 68 | 69 | valid_set = ExtendedMarketDataset( 70 | valid, features=feat_cols, targets=target_cols, resp=resp_cols) 71 | valid_loader = DataLoader( 72 | valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 73 | 74 | # sanity check 75 | # item = next(iter(train_loader)) 76 | # print(item) 77 | # %% 78 | model = ResidualMLP(output_size=len(target_cols)) 79 | model.to(device) 80 | summary(model, input_size=(len(feat_cols), )) 81 | 82 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 83 | # optimizer = Lookahead(optimizer=optimizer, k=10, alpha=0.5) 84 | scheduler = None 85 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 86 | # max_lr=1e-2, epochs=EPOCHS, 87 | # steps_per_epoch=len(train_loader)) 88 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 89 | 90 | es = EarlyStopping(patience=EARLYSTOP_NUM, mode="max") 91 | 92 | # %% 93 | 94 | 95 | class UtilityLoss(nn.Module): 96 | def __init__(self, weight=None, alpha=None, scaling=None, normalize='mean', resp_index=None): 97 | super(UtilityLoss, self).__init__() 98 | self.alpha = alpha if normalize == 'mean' else alpha * \ 99 | 1e-3 # the strength of this regularization 100 | self.normalize = normalize 101 | self.scaling = scaling 102 | self.weight = weight 103 | self.resp_index = resp_index 104 | self.device = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') 105 | 106 | def forward(self, inputs, targets, weights=None, date=None): 107 | ''' 108 | inputs: prediction of the model (without sigmoid, processed with a scaling) 109 | targets: resp columns 110 | negative of the utility for minimization 111 | ''' 112 | if (self.resp_index is not None) and (len(self.resp_index) < 5): 113 | inputs = inputs[..., self.resp_index] 114 | targets = targets[..., self.resp_index] 115 | 116 | inputs = F.sigmoid(self.scaling*inputs) 117 | n_targets = inputs.size(-1) 118 | if n_targets > 1: 119 | weights = weights.repeat((n_targets, 1)) 120 | date = date.repeat((n_targets, 1)) 121 | 122 | # flatten label and prediction tensors 123 | inputs = inputs.view(-1) 124 | targets = targets.view(-1) 125 | weights = weights.view(-1) 126 | date = date.view(-1) 127 | 128 | dates = date.unique().detach() 129 | ndays = len(dates) 130 | 131 | Pi = torch.zeros((ndays, 1), device=self.device, dtype=torch.float32) 132 | for i, day in enumerate(dates): 133 | mask = (date == day) 134 | Pi[i] = (weights[mask]*targets[mask]*inputs[mask]).sum() 135 | 136 | # a single day 137 | # DEBUG notes: bincount is not differentiable for autograd 138 | # Pi = torch.bincount(date, weight * targets * inputs) 139 | # loss = Pi.sum()*(Pi.sum().clamp(min=0))/(Pi.square().sum()) 140 | # loss = (Pi.sum()).square()/(Pi.square().sum()) 141 | 142 | sumPi = Pi.sum() 143 | if self.normalize == 'mean': 144 | loss = -self.alpha*sumPi * \ 145 | (sumPi.clamp(min=0))/(Pi.square().sum())/ndays 146 | else: 147 | loss = -self.alpha*sumPi*(sumPi.clamp(min=0))/ndays 148 | 149 | return loss 150 | 151 | 152 | # %% 153 | _fold = 0 154 | if FINETUNE: 155 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 156 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_ft_old_fold_{_fold}.pth") 157 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_finetune_fold_{_fold}.pth") 158 | try: 159 | model.load_state_dict(torch.load(model_weights)) 160 | except: 161 | model.load_state_dict(torch.load( 162 | model_weights, map_location=torch.device('cpu'))) 163 | model.eval() 164 | valid_pred = valid_epoch(model, valid_loader, device) 165 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 166 | f=median_avg, threshold=0.5, target_cols=target_cols) 167 | 168 | print(f"valid_utility:{valid_score:.2f} \t valid_auc:{valid_auc:.4f}") 169 | # %% 170 | ''' 171 | fine-tuning the trained model utility score 172 | max batch_size: 173 | 3 resps: 409600 174 | 5 resps: 204800 175 | 176 | current best setting: 177 | fold 0, batch_size = 409600, lr *= 1e-3, alpha=5e-2, 1 epoch with loss 178 | fold 1, batch_size = 102400, lr *= 1e-3, 2 epochs 179 | fold 2, batch_size = 102400, lr *= 1e-2, 2 epochs 180 | fold 3, batch_size = 409600, lr *= 1e-3, alpha=1e-1, 1 epoch without loss 181 | fold 4, batch_size = 12800, lr *= 1e-2, alpha=1, 1 epoch without loss 182 | to-do: using the least square loss to model w_{ij} res[ij] 183 | ''' 184 | get_seed(1127) 185 | # resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 186 | 187 | # resp_cols = ['resp', 'resp_1', 'resp_2'] 188 | resp_cols = ['resp', 'resp_4'] 189 | resp_index = [resp_cols_all.index(r) for r in resp_cols] # resp_1, resp_2 190 | 191 | regularizer = UtilityLoss(alpha=1e-1, scaling=12, normalize=None, resp_index=resp_index) 192 | finetune_loader = DataLoader( 193 | train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 194 | train_loader = DataLoader(train_set, batch_size=400_000, 195 | shuffle=True, 
num_workers=8) 196 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 197 | 198 | # %% 199 | FINETUNE_EPOCHS = 1 200 | for epoch in range(FINETUNE_EPOCHS): 201 | tqdm.write(f"\nFine tuning epoch {epoch+1} for model {_fold}") 202 | # train_loss = train_epoch(model, finetune_optimizer, scheduler, 203 | # loss_fn, train_loader, device) 204 | _ = train_epoch_utility(model, finetune_optimizer, scheduler, 205 | regularizer, finetune_loader, device, loss_fn=loss_fn) 206 | valid_pred = valid_epoch(model, valid_loader, device) 207 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 208 | f=median_avg, threshold=0.5, target_cols=target_cols) 209 | 210 | tqdm.write(f"\nval_utility:{valid_score:.2f} valid_auc:{valid_auc:.4f}") 211 | # %% 212 | # regularizer = UtilityLoss(alpha=1e-4, scaling=12) 213 | 214 | # finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 215 | # finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 216 | 217 | 218 | # for epoch in range(EPOCHS): 219 | 220 | # start_time = time() 221 | # train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 222 | 223 | # train_loss = train_epoch_utility(model, finetune_optimizer, scheduler, 224 | # loss_fn, regularizer, finetune_loader, device) 225 | 226 | # valid_pred = valid_epoch(model, valid_loader, device) 227 | # valid_auc, valid_score = get_valid_score(valid_pred, valid, 228 | # f=median_avg, threshold=0.5, target_cols=target_cols) 229 | # model_file = MODEL_DIR+f"/resmlp_seed_{SEED}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 230 | # es(valid_auc, model, model_path=model_file, epoch_utility_score=valid_score) 231 | 232 | # print(f"\nEPOCH:{epoch:2d} tr_loss:{train_loss:.2f} " 233 | # f"val_utility:{valid_score:.2f} valid_auc:{valid_auc:.4f} " 234 | # f"epoch time: {time() - start_time:.1f}sec " 235 | # f"early stop counter: {es.counter}\n") 236 | 237 | # if es.early_stop: 238 | # print("\nEarly stopping") 239 | # break 240 | 241 | # torch.save(model.state_dict(), MODEL_DIR+f"/resmlp_finetune_fold_{_fold}.pth") 242 | # %% 243 | if DEBUG: 244 | resp_cols = ['resp', 'resp_4'] 245 | resp_index = [resp_cols_all.index(r) for r in resp_cols] 246 | regularizer = UtilityLoss(alpha=1e-1, scaling=12, 247 | normalize=None, resp_index=resp_index) 248 | data = next(iter(finetune_loader)) 249 | optimizer.zero_grad() 250 | features = data['features'].to(device) 251 | label = data['label'].to(device) 252 | weights = data['weight'].to(device) 253 | resp = data['resp'].to(device) 254 | date = data['date'].to(device) 255 | model.eval() 256 | outputs = model(features) 257 | loss = loss_fn(outputs, label) 258 | # reg = regularizer(outputs, resp, weights=weight, date=date) 259 | 260 | targets = resp 261 | inputs = outputs 262 | alpha = 1e-3 263 | if resp_index is not None and len(resp_index) < 5: 264 | inputs = outputs[..., resp_index] 265 | targets = targets[..., resp_index] 266 | 267 | inputs = F.sigmoid(10*inputs) 268 | n_targets = inputs.size(-1) 269 | if n_targets > 1: 270 | weights = weights.repeat((n_targets, 1)) 271 | date = date.repeat((n_targets, 1)) 272 | 273 | # flatten label and prediction tensors 274 | inputs = inputs.view(-1) 275 | targets = targets.view(-1) 276 | weights = weights.view(-1) 277 | date = date.view(-1) 278 | 279 | dates = date.unique().detach() 280 | ndays = len(dates) 281 | 282 | Pi = torch.zeros((ndays, 1), device=device, dtype=torch.float32) 283 | for i, day in enumerate(dates): 
284 | mask = (date == day) 285 | Pi[i] = (weights[mask]*targets[mask]*inputs[mask]).sum() 286 | 287 | sumPi = Pi.sum() 288 | loss = -alpha*sumPi*(sumPi.clamp(min=0))/ndays 289 | 290 | # loss.backward() 291 | # %% 292 | # %% 293 | if DEBUG: 294 | model.train() 295 | final_loss = 0 296 | data = next(iter(train_loader)) 297 | optimizer.zero_grad() 298 | _features = data['features'].to(device) 299 | _label = data['label'].to(device) 300 | _weights = torch.log(1+data['weight']).to(device) 301 | _outputs = model(_features) 302 | 303 | targets = SmoothBCEwLogits._smooth(_label, _outputs.size(-1), 0.005) 304 | _loss = F.binary_cross_entropy_with_logits(_outputs, _label, weight=_weights) -------------------------------------------------------------------------------- /lgb/v01_explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 26 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", 27 | "# For example, here's several helpful packages to load\n", 28 | "\n", 29 | "import numpy as np # linear algebra\n", 30 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 31 | "\n", 32 | "# Input data files are available in the read-only \"../input/\" directory\n", 33 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 34 | "\n", 35 | "import os\n", 36 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 37 | " for filename in filenames:\n", 38 | " print(os.path.join(dirname, filename))\n", 39 | " \n", 40 | "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", 41 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "\n" 70 | ], 71 | "text/plain": [ 72 | "" 73 | ] 74 | }, 75 | "metadata": {}, 76 | "output_type": "display_data" 77 | } 78 | ], 79 | "source": [ 80 | "import sys, inspect\n", 81 | "import os, gc\n", 82 | "import numpy as np\n", 83 | "from numba import njit\n", 84 | "import datatable as dtable\n", 85 | "import pandas as pd\n", 86 | "import xgboost as xgb\n", 87 | "from hyperopt import hp, fmin, tpe, Trials\n", 88 | "from hyperopt.pyll.base import scope\n", 89 | "from sklearn.metrics import roc_auc_score, roc_curve\n", 90 | "from sklearn.model_selection import GroupKFold\n", 91 | "import matplotlib.pyplot as plt\n", 92 | "from matplotlib.colors import ListedColormap\n", 93 | "from tqdm.notebook import tqdm\n", 94 | "from joblib import dump, load" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "TRAINING = True\n", 104 | "ENSEMBLE = False\n", 105 | "FOLDS = 4\n", 
106 | "SEED = 42\n", 107 | "DATA_ROOT = '/storage1/lu/Active/tianyang/Workspace/janestreet/'\n", 108 | "DATA_FILE = DATA_ROOT + 'train.feather' # feather is faster than csv\n", 109 | "\n", 110 | "# janestreet\n", 111 | "currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))\n", 112 | "libdir = os.path.join(currentdir, '..', 'data')\n", 113 | "sys.path.insert(0, libdir) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "# Train" 121 | ] 122 | }, 123 | { 124 | "cell_type": "raw", 125 | "metadata": {}, 126 | "source": [ 127 | "train = pd.read_feather(DATA_FILE)\n", 128 | "train = train.query('date > 85').reset_index(drop = True) \n", 129 | "train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use\n", 130 | "train.fillna(train.mean(),inplace=True)\n", 131 | "train = train.query('weight > 0').reset_index(drop = True)\n", 132 | "#train['action'] = (train['resp'] > 0).astype('int')\n", 133 | "train['action'] = ((train['resp_1'] > 0 ) & (train['resp_2'] > 0 ) & \\\n", 134 | " (train['resp_3'] > 0 ) & (train['resp_4'] > 0 ) & \\\n", 135 | " (train['resp'] > 0 )).astype('int')\n", 136 | "features = [c for c in train.columns if 'feature' in c]\n", 137 | "\n", 138 | "resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']\n", 139 | "\n", 140 | "X = train[features].values\n", 141 | "y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget\n", 142 | "\n", 143 | "f_mean = np.mean(train[features[1:]].values,axis=0)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Loading...\n", 156 | "Filling...\n", 157 | "Finish.\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "print('Loading...')\n", 163 | "train = pd.read_feather(DATA_FILE)\n", 164 | "features = [c for c in train.columns if 'feature' in c]\n", 165 | "\n", 166 | "print('Filling...')\n", 167 | "train = train.query('weight > 0').reset_index(drop = True)\n", 168 | "train[features] = train[features].fillna(method = 'ffill').fillna(0)\n", 169 | "train['action'] = (train['resp'] > 0).astype('int')\n", 170 | "\n", 171 | "print('Finish.')\n", 172 | "\n", 173 | "X_tr, y_tr = train.loc[train['date'] > 85, features].values, train.loc[train['date'] > 85, 'action'].values\n", 174 | "d_tr = xgb.DMatrix(X_tr, y_tr)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "CPU times: user 5h 47min 35s, sys: 11min 22s, total: 5h 58min 58s\n", 187 | "Wall time: 22min 37s\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "%%time\n", 193 | "from sklearn.model_selection import cross_validate\n", 194 | "from sklearn.model_selection import TimeSeriesSplit\n", 195 | "from xgboost import XGBClassifier\n", 196 | "\n", 197 | "# Seed Blending\n", 198 | "models = []\n", 199 | "\n", 200 | "p_best = {\n", 201 | " 'learning_rate': 0.014106988708201764,\n", 202 | " 'max_depth': 8, \n", 203 | " 'gamma': 9.800749651802157, \n", 204 | " 'min_child_weight': 0.3032862674190433, \n", 205 | " 'subsample': 0.4648851101943981, \n", 206 | " 'colsample_bytree': 0.994909039539885, \n", 207 | " 'objective': 'binary:logistic',\n", 208 | " 'eval_metric': 'auc', \n", 209 | " 'tree_method': 'hist', \n", 210 | " }\n", 211 | " 
\n", 212 | "if TRAINING:\n", 213 | " # scores = cross_validate(clf, X, yy, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=10)\n", 214 | " \n", 215 | " if ENSEMBLE:\n", 216 | " for seed in range(5):\n", 217 | " p_best['random_state'] = seed\n", 218 | " clf = xgb.train(p_best, d_tr, 950)\n", 219 | " models.append(clf)\n", 220 | "\n", 221 | " rubbish = gc.collect()\n", 222 | " else:\n", 223 | " clf = xgb.train(p_best, d_tr, 950)\n", 224 | " models = [clf]\n", 225 | "\n", 226 | " dump(clf, 'xgb.joblib')\n", 227 | " \n", 228 | "else:\n", 229 | " clf = load('xgb.joblib')\n", 230 | " models = [clf]\n", 231 | " " 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "# Predict" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 8, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "@njit\n", 248 | "def fast_fillna(array, values):\n", 249 | " if np.isnan(array.sum()):\n", 250 | " array = np.where(np.isnan(array), values, array)\n", 251 | " return array" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 9, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "application/vnd.jupyter.widget-view+json": { 262 | "model_id": "15004d286cef44d0ac949661c7771428", 263 | "version_major": 2, 264 | "version_minor": 0 265 | }, 266 | "text/plain": [ 267 | "| | 0/? [00:00 0:\n", 283 | " x_tt = test_df.loc[:, features].values\n", 284 | " x_tt[0, :] = fast_fillna(x_tt[0, :], tmp)\n", 285 | " tmp = x_tt[0, :]\n", 286 | " d_tt = xgb.DMatrix(x_tt)\n", 287 | " pred = 0.\n", 288 | " for clf in models:\n", 289 | " pred += clf.predict(d_tt) / len(models)\n", 290 | " pred_df.action = np.where(pred >= opt_th, 1, 0).astype(int)\n", 291 | " else:\n", 292 | " pred_df.action = 0\n", 293 | " env.predict(pred_df)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 10, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "6143" 305 | ] 306 | }, 307 | "execution_count": 10, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "pred = pd.read_csv('submission.csv')\n", 314 | "pred['action'].sum()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 11, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "15219" 326 | ] 327 | }, 328 | "execution_count": 11, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "len(pred['action'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.7.9" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 4 366 | } 367 | -------------------------------------------------------------------------------- /cv_final.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import datetime 3 | import gc 4 | import os 5 | HOME = os.path.dirname(os.path.abspath(__file__)) 6 | 
MODEL_DIR = HOME+'/models/' 7 | DATA_DIR = HOME+'/data/' 8 | # from mlp.mlp import * 9 | from utils import * 10 | from utils_js import * 11 | from mlp.tf_models import * 12 | from mlp.mlp import * 13 | 14 | import random 15 | import sys 16 | 17 | import datatable as dt 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import pandas as pd 21 | import torch 22 | from numba import njit 23 | from sklearn.metrics import roc_auc_score 24 | from tqdm import tqdm 25 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 26 | tf.config.optimizer.set_jit(True) 27 | 28 | device = torch.device('cpu') 29 | # %% 30 | 31 | ''' 32 | Various setup for different models 33 | ''' 34 | CV_START_DAY = 401 35 | CV_DAYS = 32 36 | 37 | features = [f'feature_{i}' for i in range(130)] 38 | 39 | features_t = features+ ['cross_41_42_43', 'cross_1_2'] 40 | 41 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4'] 42 | target_cols = ['action_1', 'action_2', 'action_3', 'action', 'action_4'] 43 | 44 | resp_cols_vol = ['resp_3', 'resp', 'resp_4'] 45 | target_cols_vol = ['action_3', 'action', 'action_4'] 46 | # split features for a ResNet feature 2 is more important 47 | features_2_index = [0, 1, 2, 3, 4, 5, 6, 15, 16, 25, 26, 35, 48 | 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49 | 49, 50, 51, 52, 53, 54, 59, 60, 61, 62, 63, 64, 65, 50 | 66, 67, 68, 69, 70, 71, 76, 77, 82, 83, 88, 89, 94, 51 | 95, 100, 101, 106, 107, 112, 113, 118, 119, 128, 129] 52 | 53 | features_1_index = [0] + list(set(range(130)).difference(features_2_index)) 54 | 55 | features_1 = [f'feature_{i}' for i in features_1_index] 56 | 57 | features_2 = [f'feature_{i}' for i in features_2_index] 58 | 59 | 60 | # split features for a ResNet feature 2 is more important 61 | features_1_index_v = [0, 62 | 7, 8, 17, 18, 27, 28, 55, 72, 78, 84, 90, 96, 102, 108, 114, 120, 121, 63 | 11, 12, 21, 22, 31, 32, 57, 74, 80, 86, 92, 98, 104, 110, 116, 124, 125] 64 | # resp_1 resp_2 feat 65 | 66 | features_2_index_v = [0] + list(set(range(130)).difference(features_1_index_v)) 67 | 68 | features_1_v = [f'feature_{i}' for i in features_1_index_v] 69 | 70 | features_2_v = [f'feature_{i}' for i in features_2_index_v] 71 | 72 | 73 | feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 74 | 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 75 | features_spike = [f'feature_{i}' for i in feat_spike_index] 76 | 77 | cat_cols = [f+'_c' for f in features_spike] 78 | 79 | #%% 80 | ''' 81 | Loading model trained in tf and verify their utility scores 82 | ''' 83 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 84 | train = pd.read_parquet(train_parquet) 85 | train['action'] = (train['resp'] > 0).astype(int) 86 | for c in range(1,5): 87 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 88 | 89 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 90 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5).astype(np.float32) 91 | 92 | most_common_vals = np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 93 | for i, feat in tqdm(enumerate(features_spike)): 94 | train[feat+'_c'] = (train[feat] - most_common_vals[i]).astype(np.int32) 95 | #%% 96 | ''' 97 | Final model resnet 98 | ''' 99 | # model_files = ['resnet_reg_fold_0_seed_1127802.h5', 100 | # 'resnet_reg_fold_1_seed_1127802.h5', 101 | # 'resnet_reg_fold_2_seed_1127802.h5'] 102 | model_files = ['resnet_reg_fold_1_res_seed_792734.h5', 
103 | 'resnet_reg_fold_2_res_seed_97275.h5'] 104 | 105 | # model_files = ['resnet_reg_fold_0_seed_157157.h5', 106 | # 'resnet_reg_fold_1_seed_157157.h5', 107 | # 'resnet_reg_fold_2_seed_157157.h5'] 108 | for _fold, model_file in enumerate(model_files): 109 | print(f"Model {model_file}") 110 | tf.keras.backend.clear_session() 111 | tf_model = create_resnet_reg(len(features_1), len(features_2), len(resp_cols), 112 | hidden_size=256, label_smoothing=5e-03) 113 | 114 | tf_model.load_weights(os.path.join(MODEL_DIR, model_file)) 115 | # tf_model.call = tf.function(tf_model.call, experimental_relax_shapes=True) 116 | 117 | print_valid_score_tf(train, tf_model, start_day=400, num_days=33, 118 | f=median_avg, threshold=0.5, 119 | feature_indices=(features, features_1_index, features_2_index)) 120 | 121 | #%% 122 | ''' 123 | Final model resnet 124 | ''' 125 | # model_files = ['resnet_volatile_fold_0_seed_1127802.h5', 126 | # 'resnet_volatile_fold_1_seed_1127802.h5', 127 | # 'resnet_volatile_fold_2_seed_1127802.h5'] 128 | # model_files = ['resnet_volatile_fold_0_seed_157157.h5', 129 | # 'resnet_volatile_fold_1_seed_157157.h5', 130 | # 'resnet_volatile_fold_2_seed_157157.h5'] 131 | 132 | # model_files = ['resnet_volatile_fold_0_seed_745273.h5', 133 | # 'resnet_volatile_fold_2_seed_962656.h5'] 134 | model_files = ['resnet_volatile_fold_0_seed_5567273.h5', 135 | 'resnet_volatile_fold_1_seed_123835.h5', 136 | 'resnet_volatile_fold_2_seed_676656.h5'] 137 | 138 | for _fold, model_file in enumerate(model_files): 139 | print(f"Model {model_file}") 140 | tf.keras.backend.clear_session() 141 | tf_model = create_resnet(len(features_1_v), len(features_2_v), len(resp_cols_vol), 142 | hidden_size=256, label_smoothing=5e-03) 143 | 144 | tf_model.load_weights(os.path.join(MODEL_DIR, model_file)) 145 | # tf_model.call = tf.function(tf_model.call, experimental_relax_shapes=True) 146 | 147 | print_valid_score_tf(train, tf_model, start_day=400, num_days=33, 148 | f=median_avg, threshold=0.5, 149 | feature_indices=(features, features_1_index_v, features_2_index_v)) 150 | # %% 151 | ''' 152 | Final model ae+mlp, 5 targets 153 | ''' 154 | # encoder_file = 'encoder_reg.hdf5' 155 | # model_files = ['ae_reg_fold_0.hdf5', 156 | # 'ae_reg_fold_1.hdf5', 157 | # 'ae_reg_fold_2.hdf5'] 158 | # hp_file = 'hp_ae_reg.pkl' 159 | 160 | 161 | # encoder_file = 'encoder_692874.hdf5' 162 | # model_files = ['model_692874_0.hdf5', 163 | # 'model_692874_1.hdf5', 164 | # 'model_692874_2.hdf5'] 165 | # hp_file = 'best_hp_692874.pkl' 166 | 167 | encoder_file = 'ae_encoder_157157.hdf5' 168 | model_files = ['ae_157157_0.hdf5', 169 | 'ae_157157_1.hdf5', 170 | 'ae_157157_2.hdf5'] 171 | hp_file = 'ae_hp_157157.pkl' 172 | 173 | _, encoder = create_autoencoder(len(features), len(resp_cols), noise=0.1) 174 | 175 | encoder.load_weights(os.path.join(MODEL_DIR, encoder_file)) 176 | encoder.trainable = False 177 | 178 | model_fn = lambda hp: create_model(hp, len(features), len(resp_cols), encoder) 179 | 180 | hp = pd.read_pickle(os.path.join(MODEL_DIR, hp_file)) 181 | for _fold, model_file in enumerate(model_files): 182 | tf.keras.backend.clear_session() 183 | print(f"Model {model_file}") 184 | model = model_fn(hp) 185 | model.load_weights(os.path.join(MODEL_DIR, model_files[_fold])) 186 | 187 | print_valid_score_tf(train, model, start_day=400, num_days=33, 188 | f=median_avg, threshold=0.5, 189 | feature_indices=[features]) 190 | # %% 191 | ''' 192 | Final model ae+mlp 193 | ''' 194 | # volatile models, 3 targets 195 | # encoder_file = 
'encoder_volatile.hdf5' 196 | # model_files = ['ae_volatile_fold_0.hdf5', 197 | # 'ae_volatile_fold_1.hdf5', 198 | # 'ae_volatile_fold_2.hdf5'] 199 | # hp_file = 'hp_ae_volatile.pkl' 200 | 201 | 202 | # encoder_file = 'v_encoder_969725.hdf5' 203 | # model_files = ['v_model_969725_0.hdf5', 204 | # 'v_model_969725_1.hdf5', 205 | # 'v_model_969725_2.hdf5'] 206 | # hp_file = 'v_best_hp_969725.pkl' 207 | 208 | # encoder_file = 'v_encoder_618734.hdf5' 209 | # model_files = ['v_model_618734_0.hdf5', 210 | # 'v_model_618734_1.hdf5', 211 | # 'v_model_618734_2.hdf5'] 212 | # hp_file = 'v_best_hp_618734.pkl' 213 | 214 | encoder_file = 'ae_vol_encoder_283467.hdf5' 215 | model_files = ['ae_vol_283467_0.hdf5', 216 | 'ae_vol_283467_1.hdf5', 217 | 'ae_vol_283467_2.hdf5'] 218 | hp_file = 'ae_vol_hp_283467.pkl' 219 | 220 | _, encoder = create_autoencoder(len(features), len(resp_cols_vol), noise=0.1) 221 | 222 | encoder.load_weights(os.path.join(MODEL_DIR, encoder_file)) 223 | encoder.trainable = False 224 | 225 | model_fn = lambda hp: create_model(hp, len(features), len(resp_cols_vol), encoder) 226 | 227 | hp = pd.read_pickle(os.path.join(MODEL_DIR, hp_file)) 228 | for _fold, model_file in enumerate(model_files): 229 | tf.keras.backend.clear_session() 230 | print(f"Model {model_file}") 231 | model = model_fn(hp) 232 | model.load_weights(os.path.join(MODEL_DIR, model_files[_fold])) 233 | 234 | print_valid_score_tf(train, model, start_day=400, num_days=33, 235 | f=median_avg, threshold=0.5, 236 | feature_indices=[features]) 237 | 238 | #%% 239 | model_files = ['tf_spike_reg_seed_1127802_fold_0.h5', 240 | 'tf_spike_reg_seed_1127802_fold_1.h5', 241 | # 'tf_spike_reg_seed_1127802_fold_2.h5', 242 | 'tf_spike_reg_seed_802_fold_2.h5' 243 | ] 244 | 245 | for _fold, model_file in enumerate(model_files): 246 | print(f"Model {model_file}") 247 | tf.keras.backend.clear_session() 248 | tf_model = create_spikenet(len(features_1), len(features_2), len(cat_cols), len(resp_cols), 249 | hidden_size=256, label_smoothing=5e-03) 250 | 251 | tf_model.load_weights(os.path.join(MODEL_DIR, model_file)) 252 | # tf_model.call = tf.function(tf_model.call, experimental_relax_shapes=True) 253 | 254 | print_valid_score_tf(train, tf_model, start_day=400, num_days=33, 255 | f=median_avg, threshold=0.5, 256 | feature_indices=(features, features_1_index, features_2_index, feat_spike_index)) 257 | # %% 258 | 259 | model_files = ['emb_volatile_fold_0_util_1445_auc_0.5550.pth', 260 | 'emb_volatile_fold_1_util_1225_auc_0.5557.pth', 261 | 'emb_volatile_fold_2_util_240_auc_0.5455.pth'] 262 | 263 | 264 | for _fold, model_file in enumerate(model_files): 265 | model = SpikeNet() 266 | model.to(device) 267 | model_weights = os.path.join(MODEL_DIR, model_file) 268 | model.load_state_dict(torch.load(model_weights, map_location='cpu')) 269 | model.eval(); 270 | print(f"\n\nModel {model_file}") 271 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 272 | batch_size =8192, f=median_avg, threshold=0.5, 273 | target_cols=target_cols, 274 | feat_cols=features, 275 | resp_cols=resp_cols, 276 | cat_input=cat_cols) 277 | # %% 278 | model_files = ['pt_volatile_0_util_1424_auc_0.5520.pth', 279 | 'pt_volatile_1_util_1137_auc_0.5470.pth', 280 | 'pt_volatile_2_util_322_auc_0.5444.pth'] 281 | 282 | 283 | for _fold, model_file in enumerate(model_files): 284 | model = ResidualMLP(input_size=len(features_t), hidden_size=256, output_size=len(target_cols)) 285 | model.to(device) 286 | model_weights = os.path.join(MODEL_DIR, model_file) 287 | 
model.load_state_dict(torch.load(model_weights, map_location='cpu')) 288 | model.eval(); 289 | print(f"\nModel {model_file}") 290 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 291 | batch_size = 8192, f=median_avg, threshold=0.5, 292 | target_cols=target_cols, 293 | feat_cols=features_t, 294 | resp_cols=resp_cols) 295 | # %% 296 | # %% 297 | model_files = ['final_0_util_1372_auc_0.5483.pth', 298 | 'final_1_util_865_auc_0.5450.pth', 299 | 'final_2_util_507_auc_0.5428.pth'] 300 | 301 | 302 | for _fold, model_file in enumerate(model_files): 303 | model = ResidualMLP(input_size=len(features_t), hidden_size=256, output_size=len(target_cols)) 304 | model.to(device) 305 | model_weights = os.path.join(MODEL_DIR, model_file) 306 | model.load_state_dict(torch.load(model_weights, map_location='cpu')) 307 | model.eval(); 308 | print(f"\nModel {model_file}") 309 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 310 | batch_size = 8192, f=median_avg, threshold=0.5, 311 | target_cols=target_cols, 312 | feat_cols=features_t, 313 | resp_cols=resp_cols) 314 | # %% 315 | --------------------------------------------------------------------------------
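The validation helpers used above (print_valid_score_tf, print_all_valid_score) report the competition utility of each candidate model on the held-out validation days. A minimal NumPy sketch of that utility, as it is usually defined for this competition, is given below for reference; the function name utility_score and the toy data are assumptions for illustration only.

import numpy as np
import pandas as pd

def utility_score(date, weight, resp, action):
    # daily PnL: p_i = sum_j w_ij * resp_ij * a_ij
    pnl = pd.DataFrame({'date': date, 'p': weight * resp * action})
    p = pnl.groupby('date')['p'].sum().values
    # annualised Sharpe-like factor t, clipped to [0, 6]
    t = p.sum() / np.sqrt((p ** 2).sum()) * np.sqrt(250 / len(p))
    return min(max(t, 0.0), 6.0) * p.sum()

# toy sanity check with oracle actions (act exactly when resp > 0)
rng = np.random.default_rng(0)
n = 1000
date = rng.integers(0, 10, n)
weight = rng.random(n)
resp = rng.normal(0.0, 1e-2, n)
print(utility_score(date, weight, resp, (resp > 0).astype(int)))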