├── data ├── __init__.py ├── f_mean.npy ├── janestreet │ └── __init__.py ├── data_final_eda.py ├── data_denoise.py ├── data_rolling.py └── data_final.py ├── lgb ├── __init__.py └── v01_explore.ipynb ├── models └── __init__.py ├── mlp ├── __init__.py ├── run_train_final_3.py ├── debug_train_dense.py ├── run_train_embed.py ├── v08_submit.ipynb ├── run_train_final_1.py ├── run_train_base.py ├── run_train_denoise.py ├── debug_ae_tf.py ├── run_train_final_4.py ├── run_train_finetune.py ├── debug_embedding_1.py ├── debug_resnet_tf.py ├── run_train_final_2_overfit.py ├── run_train_final_2.py ├── debug_embedding_tag.py └── debug_train_utility_finetune.py ├── .gitattributes ├── janest.code-workspace ├── __init__.py ├── .gitignore ├── data.py ├── cv_splits.py ├── cv.py ├── iter_cv_torch.py ├── iter_cv.py ├── utils.py └── cv_final.py /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lgb/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import * 2 | from .tf_models import * -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.python linguist-detectable=true 2 | *.ipynb linguist-language=Python -------------------------------------------------------------------------------- /data/f_mean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scaomath/kaggle-jane-street/HEAD/data/f_mean.npy -------------------------------------------------------------------------------- /janest.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | } -------------------------------------------------------------------------------- /data/janestreet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .competition import make_env 3 | 4 | __all__ = ['make_env'] 5 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .utils_lgb import * 2 | from .utils import * 3 | # WORKSPACE_FOLDER=/home/scao/Documents/kaggle-jane-street 4 | # PYTHONPATH=${WORKSPACE_FOLDER}:${WORKSPACE_FOLDER}/nn -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pt 3 | *.png 4 | *.txt 5 | *.zip 6 | *.pkl 7 | *.pth 8 | *.csv 9 | data/janestreet/competition.cpython-37m-x86_64-linux-gnu.so 10 | data/example_sample_submission.csv 11 | *.parquet 12 | *.json 13 | *.index 14 | nn/untitled_project/ 15 | *.hdf5 16 | nn/ae_mlp_1127/ 17 | nn/ae_mlp_1127/untitled_project/ 18 | *.feather 19 | *.npy 20 | *.h5 21 | .ipynb_checkpoints/ 22 | -------------------------------------------------------------------------------- /mlp/run_train_final_3.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, GaussianNoise, Concatenate, Lambda, Activation 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.losses import BinaryCrossentropy 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.callbacks import EarlyStopping, Callback 7 | from tensorflow.keras.layers.experimental.preprocessing import Normalization 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | import kerastuner as kt 11 | from tensorflow.keras import backend as K 12 | 13 | import numpy as np 14 | import pandas as pd 15 | from tqdm.auto import tqdm 16 | from random import choices 17 | import os, sys 18 | current_path = os.path.dirname(os.path.abspath(__file__)) 19 | HOME = os.path.dirname(current_path) 20 | MODEL_DIR = os.path.join(HOME, 'models') 21 | DATA_DIR = os.path.join(HOME, 'data') 22 | sys.path.append(HOME) 23 | 24 | from utils import * 25 | # %% 26 | ''' 27 | Final model 2: 28 | 1. data: including the volatile day but excluding the outlier days (2, 294, 36, 270) 29 | 2. data: the fillna is using the past day mean (after excluding the days above) 30 | 3. data: target is only resp_{0,3,4} 31 | 4. Denoised auto-encoder 32 | 5. 
simple MLP tf model 33 | ''' 34 | # %% 35 | train = pd.read_parquet(os.path.join(DATA_DIR, 'train.parquet')) 36 | train_pdm = pd.read_parquet(os.path.join(DATA_DIR, 'train_pdm.parquet')) 37 | # %% 38 | -------------------------------------------------------------------------------- /mlp/debug_train_dense.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import tensorflow_addons as tfa 3 | import tensorflow as tf 4 | import tensorflow.keras.backend as K 5 | 6 | #%% 7 | resp_cols = ['resp','resp_1', 'resp_2', 'resp_3', 'resp_4'] 8 | target_cols = ['action','action_1', 'action_2', 'action_3', 'action_4'] 9 | 10 | 11 | #%% 12 | def mish(x): 13 | return tf.keras.layers.Lambda(lambda x: x*K.tanh(K.softplus(x)))(x) 14 | 15 | tf.keras.utils.get_custom_objects().update({'mish': tf.keras.layers.Activation(mish)}) 16 | 17 | def create_model(input_shape): 18 | 19 | inp = tf.keras.layers.Input(input_shape) 20 | tmp = tf.keras.layers.BatchNormalization()(inp) 21 | xs = [tmp] 22 | for _ in range(5): 23 | if len(xs) > 1: 24 | tmp = tf.keras.layers.Concatenate(axis=-1)(xs) 25 | else: 26 | tmp = xs[0] 27 | # tmp = tf.keras.layers.Dense(128, activation='mish')(tmp) 28 | tmp = tf.keras.layers.Dense(128, activation='swish')(tmp) 29 | tmp = tf.keras.layers.BatchNormalization()(tmp) 30 | tmp = tf.keras.layers.Dropout(0.2)(tmp) 31 | xs.append(tmp) 32 | 33 | output = tf.keras.layers.Dense(len(resp_cols),activation='sigmoid')(tf.keras.layers.Concatenate()(xs)) 34 | model = tf.keras.models.Model(inp,output) 35 | optimizer = tfa.optimizers.RectifiedAdam(1e-3) 36 | model.compile(optimizer, loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001), 37 | metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')]) 38 | return model 39 | # %% 40 | model = create_model(132) 41 | model.summary() 42 | # %% 43 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import kaggle 3 | from kaggle.api.kaggle_api_extended import KaggleApi 4 | import os, sys 5 | from utils import * 6 | import zipfile 7 | import pandas as pd 8 | import datatable as dt 9 | import numpy as np 10 | 11 | HOME = os.path.dirname(os.path.abspath(__file__)) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | ''' 17 | The API token from https://www.kaggle.com//account needs to put in ~/.kaggle/ folder in MacOS/Linux or to C:\\Users\\.kaggle\\ on Windows 18 | ''' 19 | 20 | train_dtypes = {'date': np.int32, 21 | 'ts_id': np.int64, 22 | 'resp': np.float64, 23 | 'weight': np.float64, 24 | # 'feature_0': np.int8 25 | } 26 | for c in range(1,5): 27 | train_dtypes['resp_'+str(c)] = np.float64 28 | for c in range(130): 29 | train_dtypes['feature_'+str(c)] = np.float32 30 | 31 | #%% 32 | if __name__ == "__main__": 33 | print(f"Current directory : {HOME}") 34 | print(f"Current data directory: {DATA_DIR}") 35 | data_file = find_files('train.csv', DATA_DIR) 36 | data_parquet = find_files('train.parquet', DATA_DIR) 37 | data_feather = find_files('train.feather', DATA_DIR) 38 | if not data_file: 39 | try: 40 | api = KaggleApi() 41 | api.authenticate() 42 | api.competition_download_files('jane-street-market-prediction', 43 | path=DATA_DIR, quiet=False) 44 | data_file = find_files('zip', DATA_DIR) 45 | with zipfile.ZipFile(data_file,"r") as f: 46 | f.extractall(DATA_DIR) 47 | except RuntimeError as err: 48 | 
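# the downloaded kaggle.json token is expected at ~/.kaggle/kaggle.json
# (C:\Users\<username>\.kaggle\ on Windows); restrict its permissions with chmod 600 on Linux/MacOS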
print(f"Needs API token: {err}") 49 | elif data_parquet and data_feather: 50 | print(f"Train parquet at: {data_parquet[0]}.") 51 | with timer("Loading train"): 52 | train = pd.read_parquet(data_parquet[0]) 53 | print(train.dtypes[:10]) 54 | print(train.dtypes[-10:]) 55 | 56 | print(f"Train feather at: {data_feather[0]}.") 57 | with timer("Loading train"): 58 | train = pd.read_feather(data_feather[0]) 59 | print(train.dtypes[:10]) 60 | print(train.dtypes[-10:]) 61 | 62 | elif not data_parquet and data_feather: 63 | with timer("Processing train parquet"): 64 | # train = pd.read_csv(data_file[0]) 65 | # train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) 66 | train = dt.fread(data_file[0], 67 | columns=set(train_dtypes.keys())).to_pandas().astype(train_dtypes) 68 | train.set_index('ts_id') 69 | train.to_parquet(os.path.join(DATA_DIR,'train.parquet')) 70 | else: 71 | with timer("Processing train feather"): 72 | train = dt.fread(data_file[0], 73 | columns=set(train_dtypes.keys())).to_pandas().astype(train_dtypes) 74 | train.set_index('ts_id') 75 | train.to_feather(os.path.join(DATA_DIR,'train.feather')) -------------------------------------------------------------------------------- /data/data_final_eda.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | current_path = os.path.dirname(os.path.abspath(__file__)) 5 | HOME = os.path.dirname(current_path) 6 | sys.path.append(HOME) 7 | 8 | import pandas as pd 9 | pd.set_option('display.max_rows', 100) 10 | pd.set_option('display.max_columns', 100) 11 | 12 | import numpy as np 13 | import datatable as dt 14 | from tqdm.auto import tqdm 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | sns.set(style="darkgrid", context="talk") 19 | from jupyterthemes import jtplot 20 | jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False) 21 | 22 | 23 | MODEL_DIR = HOME+'/models/' 24 | DATA_DIR = HOME+'/data/' 25 | from utils import * 26 | from utils_js import * 27 | #%% 28 | train_parquet = os.path.join(DATA_DIR, 'train_final.parquet') 29 | train_final = pd.read_parquet(train_parquet) 30 | 31 | train_parquet = os.path.join(DATA_DIR, 'train_final_ver1.parquet') 32 | train_final_ver1 = pd.read_parquet(train_parquet) 33 | 34 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 35 | train_orig = pd.read_parquet(train_parquet) 36 | 37 | #%% 38 | features_csv = os.path.join(DATA_DIR, 'features.csv') 39 | features = pd.read_csv(features_csv) 40 | tags = [t for t in list(features.iloc[:,1:])] 41 | tags_dict = {} 42 | for tag in tags: 43 | tags_dict[tag] = features[features[tag] == True]['feature'].to_list() 44 | # print(tag) 45 | feat_num = " ".join([t.split('_')[-1] for t in tags_dict[tag]]) 46 | # print(f"Features: {feat_num}") 47 | 48 | 49 | def plot_features(feats, train, scatter=False, num_days=3, start_day=None): 50 | if not start_day: 51 | start_day = np.random.randint(0, 500-num_days, 1)[0] 52 | days = [start_day+i for i in range(num_days)] 53 | days_str = " ".join([str(d) for d in days]) 54 | 55 | num_feat = len(feats) 56 | _, axes = plt.subplots(num_feat, 1, figsize=(15,num_feat*2), constrained_layout=True) 57 | cmap = get_cmap(num_feat*2, cmap='RdYlGn') 58 | for i in range(num_feat): 59 | feat = feats[i] 60 | feat_vals = train[train['date'].isin(days)][feat].reset_index(drop=True) 61 | if scatter: 62 | axes[i].scatter(pd.Series(feat_vals).index, pd.Series(feat_vals), s=5, 
color=cmap(i)) 63 | else: 64 | axes[i].plot(pd.Series(feat_vals).index, pd.Series(feat_vals), lw=1, color=cmap(i)) 65 | axes[i].set_title (feat+" at "+days_str, fontsize=15); 66 | axes[i].set_xlim(xmin=0) 67 | # %% 68 | plot_features(tags_dict['tag_22'], train_final, scatter=True) 69 | 70 | 71 | # %% 72 | plot_features(tags_dict['tag_2'], train_final) 73 | # %% 74 | # feats = ['feature_74', 'feature_80', 'feature_86', 'feature_92', 'feature_98', 'feature_104'] 75 | # feats = ['feature_106', 'feature_118'] 76 | feats = ['feature_98', 'feature_104'] 77 | plot_features(feats, train_final, start_day=320, num_days=2) 78 | # plt.savefig(DATA_DIR+'feat_98_104_fillna_pdm.png') 79 | plot_features(feats, train_final_ver1, start_day=320, num_days=2) 80 | plot_features(feats, train_orig, start_day=320,num_days=2) 81 | # plt.savefig(DATA_DIR+'feat_98_104.png') 82 | # %% 83 | train_final['feature_92'].value_counts().sort_values(ascending=False) 84 | train_final.query('date in [320]')['feature_92'].value_counts().sort_values(ascending=False) 85 | # %% 86 | feats = ['feature_1', 'feature_69'] 87 | start_day = np.random.randint(0, 500-3, 1)[0] 88 | plot_features(feats, train_final, start_day=start_day) 89 | plot_features(feats, train, start_day=start_day) 90 | 91 | #%% 92 | feat_spike_index = [1, 2, 69, 71, 85, 87, 88, 91, 93, 94, 97, 99, 100, 103, 105, 106] 93 | # feats = ['feature_100', 'feature_106'] 94 | feats = ['feature_1', 'feature_2', 'feature_69'] 95 | start_day = np.random.randint(0, 500-3, 1)[0] 96 | plot_features(feats, train_final, start_day=start_day, scatter=True) 97 | plot_features(feats, train, start_day=start_day, scatter=True) 98 | # %% 99 | train[['feature_85','feature_91']].value_counts() -------------------------------------------------------------------------------- /cv_splits.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import datetime 3 | import gc 4 | import os 5 | import random 6 | import sys 7 | 8 | import datatable as dt 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | from numba import njit 14 | from sklearn.metrics import roc_auc_score 15 | from tqdm import tqdm 16 | 17 | HOME = os.path.dirname(os.path.abspath(__file__)) 18 | MODEL_DIR = HOME+'/models/' 19 | DATA_DIR = HOME+'/data/' 20 | from mlp.mlp import * 21 | from utils import * 22 | from utils_js import * 23 | 24 | get_system() 25 | 26 | import warnings 27 | 28 | from tqdm.auto import tqdm 29 | 30 | warnings.simplefilter(action='ignore', category=FutureWarning) 31 | warnings.simplefilter(action='ignore', category=pd.core.common.SettingWithCopyWarning) 32 | 33 | plt.style.use('bmh') 34 | plt.rcParams['figure.figsize'] = [14, 8] # width, height 35 | 36 | #%% 37 | from matplotlib.colors import ListedColormap 38 | import numpy as np 39 | import matplotlib.pyplot as plt 40 | 41 | # this is code slightly modified from the sklearn docs here: 42 | # https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py 43 | def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): 44 | """Create a sample plot for indices of a cross-validation object.""" 45 | 46 | cmap_cv = plt.cm.coolwarm 47 | 48 | jet = plt.cm.get_cmap('jet', 256) 49 | seq = np.linspace(0, 1, 256) 50 | _ = np.random.shuffle(seq) # inplace 51 | cmap_data = ListedColormap(jet(seq)) 52 | 53 | # Generate the training/testing visualizations for each CV split 54 | for ii, (tr, tt) in 
enumerate(cv.split(X=X, y=y, groups=group)): 55 | # Fill in indices with the training/test groups 56 | indices = np.array([np.nan] * len(X)) 57 | indices[tt] = 1 58 | indices[tr] = 0 59 | 60 | # Visualize the results 61 | ax.scatter(range(len(indices)), [ii + .5] * len(indices), 62 | c=indices, marker='_', lw=lw, cmap=cmap_cv, 63 | vmin=-.2, vmax=1.2) 64 | 65 | # Plot the data classes and groups at the end 66 | ax.scatter(range(len(X)), [ii + 1.5] * len(X), 67 | c=y, marker='_', lw=lw, cmap=plt.cm.Set3) 68 | 69 | ax.scatter(range(len(X)), [ii + 2.5] * len(X), 70 | c=group, marker='_', lw=lw, cmap=cmap_data) 71 | 72 | # Formatting 73 | yticklabels = list(range(n_splits)) + ['target', 'day'] 74 | ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels, 75 | xlabel='Sample index', ylabel="CV iteration", 76 | ylim=[n_splits+2.2, -.2], xlim=[0, len(y)]) 77 | ax.set_title('{}'.format(type(cv).__name__), fontsize=15) 78 | return ax 79 | # %% 80 | n_samples = 20000 81 | n_groups = 500 82 | assert n_samples % n_groups == 0 83 | 84 | idx = np.linspace(0, n_samples-1, num=n_samples) 85 | X_train = np.random.random(size=(n_samples, 5)) 86 | y_train = np.random.choice([0, 1], n_samples) 87 | groups = np.repeat(np.linspace(0, n_groups-1, num=n_groups), n_samples/n_groups) 88 | 89 | fig, ax = plt.subplots() 90 | 91 | cv = PurgedGroupTimeSeriesSplit( 92 | n_splits=5, 93 | max_train_group_size=300, 94 | group_gap=5, 95 | max_test_group_size=40 96 | ) 97 | 98 | plot_cv_indices(cv, X_train, y_train, groups, ax, 5, lw=20); 99 | # %% 100 | train_parquet = os.path.join(DATA_DIR, 'train_final.parquet') 101 | train_final = pd.read_parquet(train_parquet) 102 | # %% 103 | fig, ax = plt.subplots() 104 | 105 | cv = PurgedGroupTimeSeriesSplit( 106 | n_splits=5, 107 | max_train_group_size=15, 108 | group_gap=5, 109 | max_test_group_size=5 110 | ) 111 | 112 | plot_cv_indices( 113 | cv, 114 | train_final.query('date < 50')[ 115 | train_final.columns[train_final.columns.str.contains('feature')] 116 | ].values, 117 | (train_final.query('date < 50')['resp'] > 0).astype(int).values, 118 | train_final.query('date < 50')['date'].values, 119 | ax, 120 | 5, 121 | lw=20 122 | ); -------------------------------------------------------------------------------- /cv.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | import pandas as pd 4 | import numpy as np 5 | import datatable as dt 6 | 7 | HOME = os.path.dirname(os.path.abspath(__file__)) 8 | MODEL_DIR = os.path.join(HOME, 'models') 9 | DATA_DIR = os.path.join(HOME, 'data') 10 | sys.path.append(HOME) 11 | from utils import * 12 | from mlp.mlp import * 13 | # %% 14 | ''' 15 | Current CV uses train.query('date>450') 16 | Model: pt models 17 | ''' 18 | target_cols = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4'] 19 | N_FOLDS = 5 20 | N_MODELS = 5 21 | BATCH_SIZE = 8192 22 | VALID_DATE = 450 23 | model_list = [MODEL_DIR+f'/resmlp_{i}.pth' for i in range(N_FOLDS)] # baseline 24 | 25 | feat_cols = [f'feature_{i}' for i in range(130)] 26 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 27 | # f = median_avg 28 | f = np.median 29 | 30 | #%% 31 | 32 | def get_valid_df(date, fillna = 'mean'): 33 | data_file = find_files('train.csv', DATA_DIR) 34 | train = dt.fread(data_file[0]).to_pandas() 35 | _feat_cols = [f'feature_{i}' for i in range(130)] 36 | if fillna == 'mean': 37 | f_mean = np.mean(train[_feat_cols[1:]].values, axis=0) # for inference 38 | train.fillna(train.mean(),inplace=True) 39 | elif 
fillna == 'ffill': 40 | train[_feat_cols[1:]] = train[_feat_cols[1:]].fillna(method = 'ffill').fillna(0) 41 | else: # TO_DO: customized fillna_func 42 | pass 43 | 44 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 45 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5) 46 | train['action_0'] = (train['resp'] > 0).astype(int) 47 | for c in range(1,5): 48 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 49 | print(f'action based on resp_{c} mean: ', train['action_'+str(c)].mean()) 50 | valid = train.query(f'date > {date}').reset_index(drop = True) 51 | valid.to_parquet(os.path.join(DATA_DIR,'valid.parquet')) 52 | 53 | def load_models(pt_model_files): 54 | ''' 55 | baseline mlp models in the mlp.mlp submodule 56 | ''' 57 | assert len(pt_model_files) == N_FOLDS 58 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 59 | models = [] 60 | for model_file in pt_model_files: 61 | model = ResidualMLP(output_size=len(target_cols)) 62 | model.to(device) 63 | try: 64 | model.load_state_dict(torch.load(model_file)) 65 | except: 66 | model.load_state_dict(torch.load(model_file, map_location='cpu')) 67 | model.eval() 68 | models.append(model) 69 | return models 70 | 71 | 72 | def cv_score(valid_df, models, f=np.mean, thresh=0.5, device=None): 73 | print(f"Using {f.__qualname__} as ensembler.") 74 | if device is None: 75 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 76 | valid_pred = np.zeros((len(valid_df), len(target_cols))) 77 | valid_set = MarketDataset(valid_df, features=feat_cols, targets=target_cols) 78 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 79 | 80 | for _fold in range(len(models)): 81 | torch.cuda.empty_cache() 82 | model = models[_fold] 83 | valid_pred_fold = valid_epoch(model, valid_loader, device).reshape(-1, len(target_cols)) 84 | valid_pred += valid_pred_fold / len(models) 85 | valid_auc = roc_auc_score(valid_df[target_cols].values.astype(float), valid_pred) 86 | logloss_score = log_loss(valid_df[target_cols].values.astype(float), valid_pred) 87 | 88 | # valid_pred = f(valid_pred[...,:len(target_cols)], axis=-1) # only first 5 89 | valid_pred = f(valid_pred, axis=-1) # all 90 | valid_pred = np.where(valid_pred >= thresh, 1, 0).astype(int) 91 | valid_score = utility_score_bincount(date=valid_df.date.values, 92 | weight=valid_df.weight.values, 93 | resp=valid_df.resp.values, 94 | action=valid_pred) 95 | valid_score_max = utility_score_bincount(date=valid_df.date.values, 96 | weight=valid_df.weight.values, 97 | resp=valid_df.resp.values, 98 | action=(valid_df.resp.values>0)) 99 | print(f'Max utils score: {valid_score_max:.2f}') 100 | print(f'{len(models)} models valid score: {valid_score:.2f} \t auc: {valid_auc:.4f}') 101 | 102 | 103 | # %% 104 | if __name__ == '__main__': 105 | 106 | print(f"Current valid set is date after {VALID_DATE}.\n") 107 | valid_parquet = find_files('valid.parquet', DATA_DIR) 108 | if not valid_parquet: 109 | with timer("Generating validation df"): 110 | get_valid_df(VALID_DATE) 111 | valid_parquet = find_files('valid.parquet', DATA_DIR) 112 | with timer("Generating valid loader"): 113 | valid = pd.read_parquet(valid_parquet[0]) 114 | valid_set = MarketDataset(valid, features=feat_cols, targets=target_cols) 115 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 116 | models = load_models(model_list) 117 | cv_score(valid, models, f=f) 118 | 119 | 120 | ''' 121 | Lindada's model scores on date > 450: 122 | model 0: 4948 123 | 
model 1: 5641 124 | model 2: 5282 125 | model 3: 5825 126 | model 4: 5849 127 | all five: 6165 128 | ''' 129 | 130 | # %% 131 | -------------------------------------------------------------------------------- /mlp/run_train_embed.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | import pandas as pd 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchsummary import summary 8 | 9 | current_path = os.path.dirname(os.path.abspath(__file__)) 10 | HOME = os.path.dirname(current_path) 11 | MODEL_DIR = os.path.join(HOME, 'models') 12 | DATA_DIR = os.path.join(HOME, 'data') 13 | sys.path.append(HOME) 14 | 15 | from mlp import * 16 | from utils import * 17 | from utils_js import * 18 | 19 | #%% 20 | ''' 21 | Training script of the embedding model 22 | ''' 23 | 24 | 25 | HIDDEN_LAYERS = [400, 400, 400] # hidden layer size for the embedding model 26 | N_FEATURES = 130 27 | N_FEAT_TAGS = 29 28 | N_TARGETS = 6 29 | N_DENOISED_TARGET = 1 30 | 31 | BATCH_SIZE = 8196 32 | 33 | FINETUNE_BATCH_SIZE = 204_800 34 | 35 | EPOCHS = 50 36 | EARLYSTOP_NUM = 6 37 | 38 | LEARNING_RATE = 1e-3 39 | WEIGHT_DECAY = 1e-4 40 | 41 | 42 | 43 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 44 | 45 | feat_tag_file = os.path.join(DATA_DIR, 'features.csv') 46 | feat_cols = [f'feature_{i}' for i in range(130)] 47 | resp_cols = ['resp', 'resp_dn_0', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 48 | target_cols = ['action', 'action_dn_0', 'action_1', 'action_2', 'action_3', 'action_4'] 49 | 50 | # %% 51 | with timer("Preprocessing train"): 52 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 53 | train = pd.read_parquet(train_parquet) 54 | train = train.query ('date > 85').reset_index (drop = True) 55 | # df = df[df['weight'] != 0].reset_index (drop = True) 56 | 57 | train.fillna(train.mean(),inplace=True) 58 | train = add_denoised_target(train, num_dn_target=N_DENOISED_TARGET) 59 | 60 | train['action'] = (train['resp'] > 0).astype('int') 61 | 62 | print(f'action based on resp mean: ', train['action'].astype(int).mean()) 63 | print(f'action based on resp_dn_{0} mean:', train[f'action_dn_{0}'].astype(int).mean()) 64 | 65 | for c in range(1,5): 66 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 67 | print(f'action based on resp_{c} mean: ', train[f'action_{c}'].astype(int).mean()) 68 | 69 | valid = train.loc[train.date > 450].reset_index(drop=True) 70 | # %% 71 | # %% 72 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 73 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 74 | 75 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 76 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 77 | 78 | #%% 79 | model = EmbedFNN(hidden_layers=HIDDEN_LAYERS, output_dim=len(target_cols)) 80 | model.to(device); 81 | summary(model, input_size=(len(feat_cols), )) 82 | 83 | 84 | util_cols = resp_cols 85 | resp_index = [resp_cols.index(r) for r in util_cols] 86 | 87 | regularizer = UtilityLoss(alpha=1e-1, scaling=12, normalize=None, resp_index=resp_index) 88 | 89 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 90 | 91 | 92 | 93 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 94 | # optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 95 | 96 | # scheduler = None 97 | scheduler = 
torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 98 | steps_per_epoch=len(train_loader), 99 | epochs=EPOCHS) 100 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 101 | # T_0=10, T_mult=1, 102 | # eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 103 | 104 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 105 | 106 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-2) 107 | finetune_scheduler = None 108 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=5900) 109 | # %% 110 | _fold = 7 111 | SEED = 802 112 | get_seed(SEED+SEED*_fold) 113 | lr = [] 114 | 115 | for epoch in range(EPOCHS): 116 | 117 | train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 118 | lr.append(optimizer.param_groups[0]['lr']) 119 | if (epoch+1) % 10 == 0: 120 | _ = train_epoch_finetune(model, finetune_optimizer, finetune_scheduler, 121 | regularizer, finetune_loader, device, 122 | loss_fn=loss_fn) 123 | 124 | valid_pred = valid_epoch(model, valid_loader, device) 125 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 126 | f=median_avg, threshold=0.5, target_cols=target_cols) 127 | model_file = MODEL_DIR + \ 128 | f"/emb_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 129 | early_stop(valid_auc, model, model_path=model_file, 130 | epoch_utility_score=valid_score) 131 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 132 | tqdm.write( 133 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 134 | tqdm.write( 135 | f"Best util: {early_stop.best_utility_score:.2f} \t {early_stop.message} ") 136 | tqdm.write( 137 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 138 | if early_stop.early_stop: 139 | print("\nEarly stopping") 140 | break 141 | # %% 142 | CV_START_DAY = 100 143 | CV_DAYS = 50 144 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 145 | batch_size = 8192, f=median_avg, threshold=0.5, 146 | target_cols=target_cols, feat_cols=feat_cols, resp_cols=resp_cols) 147 | # %% 148 | -------------------------------------------------------------------------------- /iter_cv_torch.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.metrics import roc_auc_score 8 | import torch 9 | from numba import njit 10 | import random 11 | import datetime 12 | 13 | HOME = os.path.dirname(os.path.abspath(__file__)) 14 | MODEL_DIR = HOME+'/models/' 15 | DATA_DIR = HOME+'/data/' 16 | from utils import * 17 | from utils_js import * 18 | from mlp.mlp import * 19 | get_system() 20 | # %% 21 | DEBUG = False 22 | SEED = 1127 23 | START_SIMU_TEST = 490 # this day to 499 as simulated test days 24 | END_SIMU_TEST = 499 25 | TQDM_INT = 20 26 | batch_size = 4096 27 | N_FOLDS = 5 28 | N_MODELS = 3 29 | 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | #%% 33 | with timer("Loading train parquet"): 34 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 35 | train = pd.read_parquet(train_parquet) 36 | 37 | train['action'] = (train['resp'] > 0).astype(int) 38 | for c in range(1,5): 39 | train['action'] = train['action'] & ((train['resp_'+str(c)] > 0)) 40 | features = [c for c in train.columns if 'feature' in c] 41 | 42 | f_mean = 
np.mean(train[features[1:]].values, axis=0) 43 | 44 | simu_test = train.query(f'date > {START_SIMU_TEST} & date <= {END_SIMU_TEST}').reset_index(drop = True) 45 | print(f"Simulated public test file length: {len(simu_test)}") 46 | 47 | 48 | # %% 49 | class Iter_Valid(object): 50 | 51 | global predicted 52 | predicted = [] 53 | 54 | def __init__(self, df, features, batch_size = 1): 55 | df = df.reset_index(drop=True) 56 | self.columns = ['weight'] + features + ['date'] 57 | self.df = df[self.columns] 58 | self.weight = df['weight'].astype(float).values 59 | self.action = df['action'].astype(int).values 60 | self.pred_df = df[['action']] 61 | # self.pred_df[['action']] = 0 62 | self.len = len(df) 63 | self.current = 0 64 | self.batch_size = batch_size 65 | 66 | def __iter__(self): 67 | return self 68 | 69 | def __next__(self): 70 | pre_start = self.current 71 | self.current += self.batch_size 72 | if self.current <= self.len: 73 | df = self.df[pre_start:self.current].copy() 74 | pred_df = self.pred_df[pre_start:self.current].copy() 75 | return df, pred_df 76 | elif self.current > self.len and (self.current - self.len < self.batch_size): 77 | df = self.df[pre_start:self.len].copy() 78 | pred_df = self.pred_df[pre_start:self.len].copy() 79 | return df, pred_df 80 | else: 81 | raise StopIteration() 82 | 83 | def predict(self, pred_df): 84 | predicted.append(pred_df) 85 | # %% 86 | model_list = [] 87 | for _fold in range(N_FOLDS): 88 | torch.cuda.empty_cache() 89 | model = ResidualMLP() 90 | model.to(device) 91 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 92 | try: 93 | model.load_state_dict(torch.load(model_weights)) 94 | except: 95 | model.load_state_dict(torch.load(model_weights, map_location=torch.device('cpu'))) 96 | model.eval() 97 | n_params = get_num_params(model) 98 | print(f"Fold {_fold} model has {n_params} params.") 99 | model_list.append(model) 100 | 101 | model_list = model_list[-N_MODELS:] 102 | 103 | # %% 104 | if __name__ == '__main__': 105 | ''' 106 | inference simulation 107 | Using a customized class 108 | 109 | 110 | For the pytorch res+mlp model for day 490-499: 111 | 112 | 5 models, np.median: 1082.92 113 | 5 models, np.mean: 1030.73 114 | 5 models, median avg: 1067.43 115 | 3 models, np.median, 0.498 thresh: 1096.30 116 | 3 models, np.median, 0.497 thresh: 1116.35 117 | 3 models, np.median, 0.496 thresh: 1104.17 118 | 3 models, np.mean, 0.497 thresh: 1082 119 | 3 models, np.median, 0.502 thresh: 1088.58 120 | ''' 121 | date = simu_test['date'].values 122 | weight = simu_test['weight'].values 123 | resp = simu_test['resp'].values 124 | action = simu_test['action'].values 125 | 126 | # f = np.mean # 127 | # f = np.median 128 | f = median_avg 129 | 130 | thresh = 0.502 131 | print(f"\n\nPredicting the action using {thresh:.3f} threshold with {N_MODELS} models.") 132 | iter_test = Iter_Valid(simu_test, features) 133 | start = time() 134 | 135 | pbar = tqdm(total=len(simu_test)) 136 | for idx, (test_df, pred_df) in enumerate(iter_test): 137 | 138 | if test_df['weight'].item() > 0: 139 | x_tt = test_df.loc[:, features].values 140 | if np.isnan(x_tt[:, 1:].sum()): 141 | x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean 142 | 143 | cross_41_42_43 = x_tt[:, 41] + x_tt[:, 42] + x_tt[:, 43] 144 | cross_1_2 = x_tt[:, 1] / (x_tt[:, 2] + 1e-5) 145 | feature_inp = np.concatenate((x_tt, 146 | np.array(cross_41_42_43).reshape(x_tt.shape[0], 1), 147 | np.array(cross_1_2).reshape(x_tt.shape[0], 1)), axis=1)
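# ensemble sketch: each fold model outputs one sigmoid probability per target; the
# probabilities are averaged over the N_MODELS folds, collapsed across targets by f
# (median_avg here, assumed to come from utils_js), and the action is 1 when the
# collapsed probability clears `thresh` (0.502 above, picked from the day 490-499 runs)
148 | pred = np.zeros((1, 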
len(target_cols))) 149 | for model in model_list: 150 | pred += model(torch.tensor(feature_inp, dtype=torch.float).to(device))\ 151 | .sigmoid().detach().cpu().numpy() / N_MODELS 152 | pred = f(pred.squeeze()) 153 | pred_df.action = np.where(pred >= thresh, 1, 0).astype(int) 154 | else: 155 | pred_df.action = 0 156 | 157 | iter_test.predict(pred_df) 158 | 159 | time_taken = time() - start 160 | total_time_est = time_taken / (idx+1) * 1000000 / 60 161 | pbar.set_description(f"Current speed = {total_time_est:.1f} minutes to complete inference") 162 | pbar.update() 163 | 164 | y_true = simu_test['action'] 165 | y_pred = pd.concat(predicted)['action'] 166 | print('\nValidation auc:', roc_auc_score(y_true, y_pred)) 167 | score = utility_score_bincount(date, weight, resp, y_true) 168 | score_pred = utility_score_bincount(date, weight, resp, y_pred) 169 | print('\nMax possible utility score:', score) 170 | print('\nModel utility score: ', score_pred) -------------------------------------------------------------------------------- /data/data_denoise.py: -------------------------------------------------------------------------------- 1 | #%% denoising target 2 | import os, sys 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from scipy.optimize import minimize 7 | from sklearn.neighbors import KernelDensity 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | sys.path.append(HOME) 13 | for f in ['/home/scao/anaconda3/lib/python3.8/lib-dynload', 14 | '/home/scao/anaconda3/lib/python3.8/site-packages']: 15 | sys.path.append(f) 16 | MODEL_DIR = HOME+'/models/' 17 | DATA_DIR = HOME+'/data/' 18 | from utils import * 19 | from utils_js import * 20 | # %% 21 | 22 | ''' 23 | By Lucas Morin 24 | https://www.kaggle.com/lucasmorin/target-engineering-patterns-denoising 25 | ''' 26 | 27 | def mpPDF(var,q,pts): 28 | # Marcenko-Pastur pdf 29 | # q=T/N 30 | eMin, eMax = var*(1-(1./q)**.5)**2, var*(1+(1./q)**.5)**2 31 | eVal = np.linspace(eMin,eMax,pts) 32 | pdf = q/(2*np.pi*var*eVal)*((eMax-eVal)*(eVal-eMin))**.5 33 | pdf = pd.Series(pdf.reshape(-1,), index=eVal.reshape(-1,)) 34 | return pdf 35 | 36 | 37 | def getPCA(matrix): 38 | # Get eVal,eVec from a Hermitian matrix 39 | eVal,eVec = np.linalg.eigh(matrix) 40 | indices=eVal.argsort()[::-1] # arguments for sorting eVal desc 41 | eVal,eVec=eVal[indices],eVec[:,indices] 42 | eVal=np.diagflat(eVal) 43 | return eVal,eVec 44 | 45 | def fitKDE(obs,bWidth=.25,kernel='gaussian',x=None): 46 | # Fit kernel to a series of obs, and derive the prob of obs 47 | # x is the array of values on which the fit KDE will be evaluated 48 | if len(obs.shape)==1: 49 | obs=obs.reshape(-1,1) 50 | kde=KernelDensity(kernel=kernel,bandwidth=bWidth).fit(obs) 51 | if x is None: 52 | x=np.unique(obs).reshape(-1,) 53 | if len(x.shape)==1: 54 | x=x.reshape(-1,1) 55 | logProb=kde.score_samples(x) # log(density) 56 | pdf=pd.Series(np.exp(logProb),index=x.flatten()) 57 | return pdf 58 | 59 | def cov2corr(cov): 60 | # Derive the correlation matrix from a covariance matrix 61 | std=np.sqrt(np.diag(cov)) 62 | corr=cov/np.outer(std,std) 63 | corr[corr<-1],corr[corr>1]=-1,1 # numerical error 64 | return corr 65 | 66 | def errPDFs(var,eVal,q,bWidth,pts=1000): 67 | # Fit error 68 | pdf0=mpPDF(var,q,pts) # theoretical pdf 69 | pdf1=fitKDE(eVal,bWidth,x=pdf0.index.values) # empirical pdf 70 | sse=np.sum((pdf1-pdf0)**2) 71 | return sse 72 | 73 | def findMaxEval(eVal,q,bWidth): 74 | # 
Find max random eVal by fitting Marcenko’s dist 75 | out=minimize(lambda *x:errPDFs(*x),.5,args=(eVal,q,bWidth), 76 | bounds=((1E-5,1-1E-5),)) 77 | if out['success']: 78 | var=out['x'][0] 79 | else: 80 | var=1 81 | eMax=var*(1+(1./q)**.5)**2 82 | return eMax,var 83 | 84 | def denoisedCorr(eVal,eVec,nFacts): 85 | # Remove noise from corr by fixing random eigenvalues 86 | eVal_=np.diag(eVal).copy() 87 | eVal_[nFacts:]=eVal_[nFacts:].sum()/float(eVal_.shape[0] - nFacts) 88 | eVal_=np.diag(eVal_) 89 | corr1=np.dot(eVec,eVal_).dot(eVec.T) 90 | corr1=cov2corr(corr1) 91 | return corr1 92 | 93 | def denoisedCorr2(eVal,eVec,nFacts,alpha=0): 94 | # Remove noise from corr through targeted shrinkage 95 | eValL,eVecL=eVal[:nFacts,:nFacts],eVec[:,:nFacts] 96 | eValR,eVecR=eVal[nFacts:,nFacts:],eVec[:,nFacts:] 97 | corr0=np.dot(eVecL,eValL).dot(eVecL.T) 98 | corr1=np.dot(eVecR,eValR).dot(eVecR.T) 99 | corr2=corr0+alpha*corr1+(1-alpha)*np.diag(np.diag(corr1)) 100 | return corr2 101 | 102 | 103 | class RMTDenoising(BaseEstimator, TransformerMixin): 104 | 105 | def __init__(self, bWidth=.01, alpha=.5, feature_0=True, sample=0.3, seed=2021): 106 | self.bWidth = bWidth 107 | self.alpha = alpha 108 | self.feature_0 = feature_0 109 | self.sample = sample 110 | self.seed = seed 111 | 112 | def denoise(self, X): 113 | sample = X.sample(frac=self.sample, random_state=self.seed) 114 | q = X.shape[0] / X.shape[1] 115 | cov = sample.cov().values 116 | corr0 = cov2corr(cov) 117 | 118 | eVal0, eVec0 = getPCA(corr0) 119 | eMax0, var0 = findMaxEval(np.diag(eVal0), q, bWidth=self.bWidth) 120 | nFacts0 = eVal0.shape[0] - np.diag(eVal0)[::-1].searchsorted(eMax0) 121 | corr1 = denoisedCorr2(eVal0, eVec0, nFacts0, alpha=self.alpha) 122 | eVal1, eVec1 = getPCA(corr1) 123 | #result = np.hstack((np.diag(eVal1), var0)) 124 | #name = [f'eigen_{i+1}' for i in range(len(eVal1))] + ['var_explained'] 125 | return eVec1[:, :nFacts0] 126 | 127 | def fit(self, X, y=None): 128 | if self.feature_0: 129 | self.cols_ = [c for c in X.columns if c != 'feature_0'] 130 | else: 131 | self.cols_ = list(X.columns) 132 | X_ = X[self.cols_] 133 | self.W_ = self.denoise(X_) 134 | self.dim_W_ = self.W_.shape[1] 135 | return self 136 | 137 | def transform(self, X, y=None): 138 | X_ = X.copy() 139 | names = [f'proj_{i}' for i in range(self.dim_W_)] 140 | projection = pd.DataFrame(fast_fillna(X_[self.cols_].values, 0).dot(self.W_), columns=names) 141 | if self.feature_0: 142 | projection['feature_0'] = X['feature_0'] 143 | return projection 144 | # %% 145 | if __name__ == '__main__': 146 | with timer("Preprocessing train"): 147 | train_file = os.path.join(DATA_DIR, 'train.parquet') 148 | train = pd.read_parquet(train_file) 149 | 150 | # train = train.loc[train.date > 85].reset_index(drop=True) 151 | drop_days = [2, 36, 270, 294] 152 | train = train.query(f'date not in {drop_days}').reset_index(drop=True) 153 | 154 | ''' 155 | 0: all resps 156 | 1: resp, 3, 4 157 | 2: resp, 1, 2 158 | ''' 159 | _f = 0 160 | targets = ['resp','resp_1','resp_2','resp_3','resp_4'] 161 | # targets = ['resp','resp_3','resp_4'] 162 | # targets = ['resp','resp_1','resp_2'] 163 | # targets = ['resp','resp_2','resp_4'] 164 | targets_f0 = targets + ['feature_0'] 165 | 166 | target_tf = RMTDenoising(sample=0.8, seed=1127802+_f) 167 | 168 | target_tf.fit(train[targets_f0]) 169 | 170 | targets_denoised = target_tf.transform(train[targets_f0]) 171 | targets_denoised = targets_denoised.rename(columns={'proj_0': f'resp_dn_{_f}'}) 172 | targets_denoised[[f'resp_dn_{_f}']] = 
-targets_denoised[f'resp_dn_{_f}'].values 173 | print(targets_denoised.head(10)) 174 | print(train[targets_f0].head(10)) 175 | targets_denoised[[f'resp_dn_{_f}']].to_csv(os.path.join(DATA_DIR,f'target_dn_{_f}.csv'), index=False) 176 | -------------------------------------------------------------------------------- /mlp/v08_submit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import time\n", 11 | "import pickle\n", 12 | "import random\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "from tqdm import tqdm\n", 16 | "from sklearn.metrics import log_loss, roc_auc_score\n", 17 | "\n", 18 | "import torch\n", 19 | "import torch.nn as nn\n", 20 | "from torch.autograd import Variable\n", 21 | "from torch.utils.data import DataLoader\n", 22 | "from torch.nn import CrossEntropyLoss, MSELoss\n", 23 | "from torch.nn.modules.loss import _WeightedLoss\n", 24 | "import torch.nn.functional as F\n", 25 | "\n", 26 | "import sys\n", 27 | "sys.path.insert(0, '../data/')\n", 28 | "import janestreet\n", 29 | "\n", 30 | "pd.set_option('display.max_columns', 100)\n", 31 | "pd.set_option('display.max_rows', 100)\n", 32 | "\n", 33 | "CACHE_PATH = './v08_pytorch_benchmark/'\n", 34 | "NFOLDS = 5" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "feat_cols = [f'feature_{i}' for i in range(130)]\n", 44 | "target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4']\n", 45 | "all_feat_cols = feat_cols.copy()\n", 46 | "all_feat_cols.extend(['cross_41_42_43', 'cross_1_2'])\n", 47 | "f_mean = np.load(f'{CACHE_PATH}/f_mean_online.npy')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Prediction" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "class MLPModel(nn.Module):\n", 64 | " \n", 65 | " # training parameters\n", 66 | " epochs = 200\n", 67 | " label_smoothing = 1e-2\n", 68 | " learning_rate = 1e-3\n", 69 | " \n", 70 | " # model parameters\n", 71 | " hidden_units = [160, 160, 160]\n", 72 | " dropout_rates = [0.2, 0.2, 0.2, 0.2]\n", 73 | " num_columns = len(all_feat_cols)\n", 74 | " num_labels = len(target_cols)\n", 75 | " units = [num_columns] + hidden_units + [num_labels]\n", 76 | " \n", 77 | " def __init__(self):\n", 78 | " super(MLPModel, self).__init__()\n", 79 | " self.batch_norm = nn.ModuleList()\n", 80 | " self.dropout = nn.ModuleList()\n", 81 | " self.dense = nn.ModuleList()\n", 82 | " \n", 83 | " for i in range(len(self.units) - 1):\n", 84 | " self.batch_norm.append(nn.BatchNorm1d(self.units[i]))\n", 85 | " self.dropout.append(nn.Dropout(self.dropout_rates[i]))\n", 86 | " self.dense.append(nn.Linear(self.units[i], self.units[i + 1]))\n", 87 | " \n", 88 | " self.activation = nn.SiLU()\n", 89 | " \n", 90 | " def forward(self, x):\n", 91 | " for i in range(len(self.units) - 1):\n", 92 | " x = self.batch_norm[i](x)\n", 93 | " if i != 0:\n", 94 | " x = self.activation(x)\n", 95 | " x = self.dropout[i](x)\n", 96 | " x = self.dense[i](x)\n", 97 | " # no sigmoid\n", 98 | " \n", 99 | " return x" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "model_list = []\n", 109 | 
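"# note: weights saved from a CUDA run may need torch.load(model_weights, map_location='cpu') on a CPU-only kernel\n",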
"for _fold in range(NFOLDS):\n", 110 | " model = MLPModel()\n", 111 | " model_weights = f\"{CACHE_PATH}/online_model{_fold}.pth\"\n", 112 | " model.load_state_dict(torch.load(model_weights))\n", 113 | " model.eval()\n", 114 | " model_list.append(model)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "15219it [01:58, 128.15it/s]\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "env = janestreet.make_env()\n", 132 | "env_iter = env.iter_test()\n", 133 | "\n", 134 | "device = torch.device(\"cpu\")\n", 135 | "\n", 136 | "for (test_df, pred_df) in tqdm(env_iter):\n", 137 | " if test_df['weight'].item() > 0:\n", 138 | " x_tt = test_df.loc[:, feat_cols].values\n", 139 | " if np.isnan(x_tt.sum()):\n", 140 | " x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt) * f_mean\n", 141 | "\n", 142 | " cross_41_42_43 = x_tt[:, 41] + x_tt[:, 42] + x_tt[:, 43]\n", 143 | " cross_1_2 = x_tt[:, 1] / (x_tt[:, 2] + 1e-5)\n", 144 | " feature_inp = np.concatenate((\n", 145 | " x_tt,\n", 146 | " np.array(cross_41_42_43).reshape(x_tt.shape[0], 1),\n", 147 | " np.array(cross_1_2).reshape(x_tt.shape[0], 1),\n", 148 | " ), axis=1)\n", 149 | "\n", 150 | " # torch_pred\n", 151 | " torch_pred = np.zeros((1, len(target_cols)))\n", 152 | " for model in model_list:\n", 153 | " torch_pred += model(torch.tensor(feature_inp, dtype=torch.float).to(device)).sigmoid().detach().cpu().numpy() / NFOLDS\n", 154 | " torch_pred = np.median(torch_pred)\n", 155 | "\n", 156 | " # tf_pred\n", 157 | " #tf_pred = np.median(np.mean([model(x_tt, training = False).numpy() for model in tf_models],axis=0))\n", 158 | "\n", 159 | " # avg\n", 160 | " #pred = torch_pred * 0.5 + tf_pred * 0.5\n", 161 | " pred = torch_pred\n", 162 | "\n", 163 | " pred_df.action = np.where(pred >= 0.5, 1, 0).astype(int)\n", 164 | " else:\n", 165 | " pred_df.action = 0\n", 166 | " env.predict(pred_df)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.7.9" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 4 198 | } 199 | -------------------------------------------------------------------------------- /mlp/run_train_final_1.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | ''' 20 | Training script (including volatile days): 21 | 1. data: including the volatile day but excluding the outlier days (2, 294, 36, 270) 22 | 2. 
data: the fillna is using the past day mean (after excluding the days above) 23 | 3. training: finetuning using resp colums as regularizer 24 | ''' 25 | 26 | DEBUG = False 27 | TRAINING_START = 0 28 | FINETUNE_BATCH_SIZE = 2048_00 29 | BATCH_SIZE = 8196 30 | EPOCHS = 120 31 | LEARNING_RATE = 1e-4 32 | WEIGHT_DECAY = 1e-5 33 | EARLYSTOP_NUM = 20 34 | NFOLDS = 1 35 | SCALING = 12 36 | THRESHOLD = 0.5 37 | CV_THRESH = 6000 38 | DAYS_TO_DROP = [2, 36, 270, 294] 39 | # VOLATILE_DAYS = [1, 3, 4, 5, 8, 9, 12, 16, 17, 18, 23, 24, 26, 27, 30, 31, 32, 37, 38, 40 | # 41, 43, 44, 45, 46, 47, 59, 63, 69, 80, 85, 161, 168, 185, 196, 223, 231, 235, 41 | # 262, 274, 276, 283, 324, 346, 353, 354, 356, 379, 380, 382, 393, 394, 427, 438, 42 | # 452, 454, 459, 462, 468, 475, 488, 489, 491, 492, 495] 43 | 44 | SEED = 1127802 45 | get_seed(SEED) 46 | 47 | f = median_avg 48 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 49 | 50 | # %% 51 | with timer("Preprocessing train"): 52 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 53 | train, valid = preprocess_final(train_parquet, day_start=TRAINING_START, 54 | training_days=range(0,475), valid_days=range(475, 500), 55 | drop_days=DAYS_TO_DROP, 56 | drop_zero_weight=True, denoised_resp=False) 57 | 58 | resp_cols = ['resp_3','resp', 'resp_4'] 59 | resp_cols_all = resp_cols 60 | target_cols = ['action_3', 'action', 'action_4'] 61 | feat_cols = [f'feature_{i}' for i in range(130)] 62 | 63 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 64 | 65 | ###### adding weight to the features ####### 66 | # feat_cols.extend(['weight']) 67 | # %% 68 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 69 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 70 | 71 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 72 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 73 | 74 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 75 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 76 | model.to(device) 77 | summary(model, input_size=(len(feat_cols), )) 78 | # %% 79 | util_cols = resp_cols 80 | resp_index = [resp_cols_all.index(r) for r in util_cols] 81 | 82 | regularizer = UtilityLoss(alpha=1e-1, scaling=SCALING, normalize=None, resp_index=resp_index) 83 | 84 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 85 | 86 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 87 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 88 | T_0=10, T_mult=1, 89 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 90 | 91 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=10) 92 | 93 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-4) 94 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=CV_THRESH) 95 | # %% 96 | _fold = 0 97 | SEED = 1127802 98 | get_seed(SEED+SEED*_fold) 99 | 100 | for epoch in range(EPOCHS): 101 | 102 | # train_loss = train_epoch(model, optimizer, None, loss_fn, train_loader, device) 103 | train_loss = train_epoch_weighted(model, optimizer, None, loss_fn, train_loader, device) 104 | scheduler.step() 105 | lr = optimizer.param_groups[0]['lr'] 106 | if (epoch+1) % 2 == 0: 107 | _ = train_epoch_finetune(model, 
finetune_optimizer, scheduler, 108 | regularizer, finetune_loader, device, loss_fn=loss_fn) 109 | 110 | valid_pred = valid_epoch(model, valid_loader, device) 111 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 112 | f=median_avg, threshold=0.5, target_cols=target_cols) 113 | 114 | model_file = MODEL_DIR + f"/final_volatile_{_fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 115 | early_stop(epoch, valid_auc, model, 116 | model_path=model_file, 117 | epoch_utility_score=valid_score) 118 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 119 | tqdm.write(f"Train loss: {train_loss:.4e} \t Current learning rate: {lr:.4e}") 120 | tqdm.write(f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 121 | tqdm.write(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 122 | if early_stop.early_stop: 123 | print("\nEarly stopping") 124 | break 125 | 126 | if DEBUG: 127 | torch.save(model.state_dict(), MODEL_DIR + f"/model_{_fold}.pth") 128 | # %% 129 | _fold = 4 130 | model_file = f"resw_interleave_1_util_6455_auc_0.6237.pth" 131 | print(f"Loading {model_file} for cv check.\n") 132 | model_weights = os.path.join(MODEL_DIR, model_file) 133 | 134 | model.to(device) 135 | feat_cols = [f'feature_{i}' for i in range(130)] 136 | feat_cols.extend(['weight']) 137 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 138 | 139 | 140 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, 141 | output_size=len(target_cols)) 142 | model.to(device) 143 | try: 144 | model.load_state_dict(torch.load(model_weights)) 145 | except: 146 | model.load_state_dict(torch.load( 147 | model_weights, map_location=torch.device('cpu'))) 148 | model.eval(); 149 | 150 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 151 | train = preprocess_pt(train_parquet, day_start=0, day_split=None, drop_zero_weight=False) 152 | 153 | CV_START_DAY = 100 154 | CV_DAYS = 25 155 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 156 | batch_size =2*8192, f=median_avg, threshold=0.5, 157 | target_cols=target_cols, 158 | feat_cols=feat_cols, 159 | resp_cols=resp_cols) 160 | # %% 161 | -------------------------------------------------------------------------------- /mlp/run_train_base.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | from torchsummary import summary 4 | 5 | current_path = os.path.dirname(os.path.abspath(__file__)) 6 | HOME = os.path.dirname(current_path) 7 | MODEL_DIR = os.path.join(HOME, 'models') 8 | DATA_DIR = os.path.join(HOME, 'data') 9 | sys.path.append(HOME) 10 | from utils import * 11 | from mlp import * 12 | # %% 13 | BATCH_SIZE = 4096 14 | EPOCHS = 200 15 | LEARNING_RATE = 1e-4 16 | WEIGHT_DECAY = 1e-5 17 | EARLYSTOP_NUM = 5 18 | NFOLDS = 1 19 | SCALING = 1000 20 | THRESHOLD = 0.5 21 | SEED = 802 22 | get_seed(SEED) 23 | # f = np.median 24 | # f = np.mean 25 | f = median_avg 26 | 27 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 28 | #%% 29 | with timer("Loading train parquet"): 30 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 31 | train = pd.read_parquet(train_parquet) 32 | 33 | train = train.loc[train.date > 85].reset_index(drop=True) 34 | weight_mean = train.loc[train.weight > 0].mean() 35 | #%% 36 | # vanilla actions based on resp 37 | train['action_0'] = (train['resp'] > 0).astype('int') 38 | for c in range(1,5): 39 | train['action_'+str(c)] = (train['resp_'+str(c)] > 
0).astype('int') 40 | print(f'action based on resp_{c} mean: ' ,' '*10, train['action_'+str(c)].astype(int).mean()) 41 | 42 | # sum 43 | train['resp_all'] = train['resp'].copy() 44 | for c in range(1,5): 45 | train['resp_all'] += train['resp_'+str(c)] 46 | train['action'] = (train['resp_all'] > 0).astype('int') 47 | print('All actions mean: ', ' '*10, train['action'].astype(int).mean()) 48 | 49 | for c in range(1,5): 50 | train['action_0'+str(c)] = (train['resp'] + train['resp_'+str(c)] > 0) 51 | print(f'action based on resp and resp_{c} mean: ', train['action_0'+str(c)].astype(int).mean()) 52 | 53 | for i in range(1,5): 54 | for j in range(i+1,5): 55 | train['action_'+str(i)+str(j)] = (train['resp_'+str(i)] + train['resp_'+str(j)] > 0) 56 | print(f'action based on resp_{i} and resp_{j} mean: ', train['action_'+str(i)+str(j)].astype(int).mean()) 57 | 58 | #%% 59 | feat_cols = [f'feature_{i}' for i in range(130)] 60 | # feat_cols = [c for c in train.columns if 'feature' in c] 61 | f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 62 | train.fillna(train.mean(),inplace=True) 63 | 64 | valid = train.loc[train.date >= 450].reset_index(drop=True) 65 | train = train.loc[train.date <= 425].reset_index(drop=True) 66 | #%% 67 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 68 | weight_resp_cols = ['resp_w', 'resp_w_1', 'resp_w_2', 'resp_w_3', 'resp_w_4'] 69 | target_cols = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4'] 70 | # target_cols_all = target_cols 71 | target_cols_all = ['action', 72 | 'action_0', 'action_1', 'action_2', 'action_3', 'action_4', 73 | 'action_01', 'action_02', 'action_03', 'action_04', 74 | 'action_12', 'action_13', 'action_14', 'action_23', 'action_24', 'action_34'] 75 | 76 | target_cols_ex = target_cols + resp_cols + weight_resp_cols 77 | 78 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 79 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5) 80 | valid['cross_41_42_43'] = valid['feature_41'] + valid['feature_42'] + valid['feature_43'] 81 | valid['cross_1_2'] = valid['feature_1'] / (valid['feature_2'] + 1e-5) 82 | 83 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 84 | 85 | 86 | # %% 87 | train_set = MarketDataset(train, features=feat_cols, targets=target_cols_all) 88 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 89 | 90 | valid_set = MarketDataset(valid, features=feat_cols, targets=target_cols_all) 91 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 92 | # %% 93 | model = ResidualMLP(output_size=len(target_cols_all)) 94 | model.to(device) 95 | summary(model, input_size=(len(feat_cols), )) 96 | 97 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 98 | optimizer = Lookahead(optimizer=optimizer, k=10, alpha=0.5) 99 | scheduler = None 100 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 101 | # max_lr=1e-2, epochs=EPOCHS, 102 | # steps_per_epoch=len(train_loader)) 103 | loss_fn = SmoothBCEwLogits(smoothing=0.01) 104 | 105 | es = EarlyStopping(patience=EARLYSTOP_NUM, mode="max") 106 | 107 | # %% 108 | 109 | with tqdm(total=EPOCHS) as pbar: 110 | for epoch in range(EPOCHS): 111 | 112 | start_time = time() 113 | train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 114 | 115 | valid_pred = valid_epoch(model, valid_loader, device) 116 | valid_auc = 
roc_auc_score(valid[target_cols_all].values.astype(float).reshape(-1), valid_pred) 117 | valid_logloss = log_loss(valid[target_cols_all].values.astype(float).reshape(-1), valid_pred) 118 | valid_pred = valid_pred.reshape(-1, len(target_cols_all)) 119 | # valid_pred = f(valid_pred[...,:len(target_cols)], axis=-1) # only do first 5 120 | valid_pred = f(valid_pred, axis=-1) # all 121 | valid_pred = np.where(valid_pred >= THRESHOLD, 1, 0).astype(int) 122 | valid_score = utility_score_bincount(date=valid.date.values, 123 | weight=valid.weight.values, 124 | resp=valid.resp.values, 125 | action=valid_pred) 126 | model_file = MODEL_DIR+f"/resmlp_seed_{SEED}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 127 | es(valid_auc, model, model_path=model_file, epoch_utility_score=valid_score) 128 | 129 | pbar.set_description(f"EPOCH:{epoch:2d} tr_loss:{train_loss:.2f} " 130 | f"val_utitlity:{valid_score:.2f} valid_auc:{valid_auc:.4f} " 131 | f"epoch time: {time() - start_time:.1f}sec " 132 | f"early stop counter: {es.counter}") 133 | 134 | if es.early_stop: 135 | print("\nEarly stopping") 136 | break 137 | pbar.update() 138 | #%% 139 | if True: 140 | valid_pred = np.zeros((len(valid), len(target_cols_all))) 141 | for _fold in range(NFOLDS): 142 | torch.cuda.empty_cache() 143 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 144 | model = ResidualMLP(output_size=len(target_cols_all)) 145 | model.to(device) 146 | model_file = MODEL_DIR + '/resmlp_seed_802_util_2413_auc_0.5475.pth' 147 | # model_file = MODEL_DIR+f"/resmlp_seed_{SEED}_util_2217_auc_0.5526.pth" 148 | # model_file = MODEL_DIR + '/resmlp_seed_802_util_2229.pth' 149 | model.load_state_dict(torch.load(model_file)) 150 | valid_pred_fold = valid_epoch(model, valid_loader, device).reshape(-1, len(target_cols_all)) 151 | valid_pred += valid_pred_fold / NFOLDS 152 | valid_auc = roc_auc_score(valid[target_cols_all].values.astype(float), valid_pred) 153 | logloss_score = log_loss(valid[target_cols_all].values.astype(float), valid_pred) 154 | 155 | # valid_pred = f(valid_pred[...,:len(target_cols)], axis=-1) # only first 5 156 | valid_pred = f(valid_pred, axis=-1) # all 157 | valid_pred = np.where(valid_pred >= THRESHOLD, 1, 0).astype(int) 158 | valid_score = utility_score_bincount(date=valid.date.values, 159 | weight=valid.weight.values, 160 | resp=valid.resp.values, 161 | action=valid_pred) 162 | valid_score_max = utility_score_bincount(date=valid.date.values, 163 | weight=valid.weight.values, 164 | resp=valid.resp.values, 165 | action=(valid.resp.values>0)) 166 | print(f'{NFOLDS} models valid score: {valid_score:.2f}') 167 | print(f'Max possible valid score: {valid_score_max:.2f}') 168 | print(f'auc_score: {valid_auc:.4f} \t logloss_score: {logloss_score:.4f}') 169 | # %% 170 | -------------------------------------------------------------------------------- /mlp/run_train_denoise.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | 20 | ''' 21 | Training script finetuning using resp colums as 
regularizer with an additional denoised target 22 | ''' 23 | 24 | DEBUG = False 25 | LOAD_PRETRAIN = False 26 | TRAINING_START = 86 # 86 by default 27 | 28 | FINETUNE_BATCH_SIZE = 2048_00 29 | BATCH_SIZE = 8196 30 | EPOCHS = 120 31 | EARLYSTOP_NUM = 10 32 | 33 | LEARNING_RATE = 1e-4 34 | WEIGHT_DECAY = 1e-5 35 | SCALING = 10 36 | THRESHOLD = 0.5 37 | NUM_DENOISE = 1 38 | DAYS_TO_DROP = [2, 36, 270, 294] 39 | SEED = 1127802 40 | get_seed(SEED) 41 | 42 | # f = np.median 43 | # f = np.mean 44 | f = median_avg 45 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 46 | 47 | 48 | feat_cols = [f'feature_{i}' for i in range(130)] 49 | # f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 50 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 51 | 52 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 53 | resp_cols_all = resp_cols 54 | target_cols = ['action','action_1', 'action_2', 'action_3', 'action_4'] 55 | 56 | for c in range(NUM_DENOISE): 57 | resp_cols += [f'resp_dn_{c}'] 58 | target_cols += [f'action_dn_{c}'] 59 | 60 | # util_cols = ['resp', 'resp_1', 'resp_2'] 61 | # util_cols = ['resp', 'resp_4'] 62 | # util_cols = ['resp'] 63 | util_cols = resp_cols 64 | 65 | resp_index = [resp_cols_all.index(r) for r in util_cols] 66 | 67 | 68 | # %% 69 | with timer("Preprocessing train"): 70 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 71 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 72 | train, valid = preprocess_pt(train_parquet, 73 | day_start=TRAINING_START, 74 | drop_days=DAYS_TO_DROP, 75 | drop_zero_weight=True, 76 | zero_weight_thresh=None, 77 | denoised_resp=True, 78 | num_dn_target=NUM_DENOISE) 79 | 80 | print(f'action based on resp mean: ', train['action'].astype(int).mean()) 81 | print(f'action based on resp_dn_0 mean:', train[f'action_dn_0'].astype(int).mean()) 82 | 83 | for c in range(1, 5): 84 | print(f'action based on resp_{c} mean: ', train['action_'+str(c)].astype(int).mean()) 85 | 86 | 87 | # %% 88 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 89 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=10) 90 | 91 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 92 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=10) 93 | 94 | 95 | # %% 96 | 97 | # regularizer = RespMSELoss(alpha=1e-1, scaling=1, resp_index=resp_index) 98 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, normalize=None, resp_index=resp_index) 99 | 100 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 101 | 102 | # all_train = pd.concat([train, valid], axis=0) 103 | # all_train_set = ExtendedMarketDataset(all_train, features=feat_cols, targets=target_cols, resp=resp_cols) 104 | # train_loader = DataLoader(all_train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 105 | 106 | 107 | model = ResidualMLP(hidden_size=128, output_size=len(target_cols)) 108 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 109 | model.to(device) 110 | summary(model, input_size=(len(feat_cols), )) 111 | 112 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 113 | # optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 114 | # optimizer = Lookahead(optimizer=optimizer, alpha=1e-1) 115 | # scheduler = None 116 | 117 | # scheduler = 
torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 118 | # steps_per_epoch=len(train_loader), 119 | # epochs=EPOCHS) 120 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 121 | T_0=10, T_mult=2, 122 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 123 | 124 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 125 | 126 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 127 | 128 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=1200) 129 | 130 | # %% 131 | if LOAD_PRETRAIN: 132 | print("Loading model for finetune.") 133 | _fold = 0 134 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 135 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_ft_old_fold_{_fold}.pth") 136 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_finetune_fold_{_fold}.pth") 137 | try: 138 | model.load_state_dict(torch.load(model_weights)) 139 | except: 140 | model.load_state_dict(torch.load( 141 | model_weights, map_location=torch.device('cpu'))) 142 | model.eval() 143 | valid_pred = valid_epoch(model, valid_loader, device) 144 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 145 | f=median_avg, threshold=0.5, target_cols=target_cols) 146 | 147 | print(f"valid_utility:{valid_score:.2f} \t valid_auc:{valid_auc:.4f}") 148 | # %% 149 | _fold = 6 150 | SEED = 1127802 151 | get_seed(SEED+SEED*_fold) 152 | lr = [] 153 | 154 | for epoch in range(EPOCHS): 155 | 156 | # train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 157 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 158 | lr.append(optimizer.param_groups[0]['lr']) 159 | if (epoch+1) % 10 == 0: 160 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 161 | regularizer, finetune_loader, device, loss_fn=loss_fn) 162 | 163 | valid_pred = valid_epoch(model, valid_loader, device) 164 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 165 | f=median_avg, threshold=0.5, target_cols=target_cols) 166 | model_file = MODEL_DIR + \ 167 | f"/dn_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 168 | early_stop(epoch, valid_auc, model, model_path=model_file, 169 | epoch_utility_score=valid_score) 170 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 171 | tqdm.write( 172 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 173 | tqdm.write( 174 | f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 175 | tqdm.write( 176 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 177 | if early_stop.early_stop: 178 | print("\nEarly stopping") 179 | break 180 | 181 | if DEBUG: 182 | torch.save(model.state_dict(), MODEL_DIR + f"/resmlp_interleave_fold_{_fold}.pth") 183 | 184 | # %% 185 | model_file = f"resmlp_interleave_0_util_7437_auc_0.6389.pth" 186 | print(f"Loading {model_file} for cv check.") 187 | model_weights = os.path.join(MODEL_DIR, model_file) 188 | 189 | try: 190 | model.load_state_dict(torch.load(model_weights)) 191 | except: 192 | model.load_state_dict(torch.load( 193 | model_weights, map_location=torch.device('cpu'))) 194 | model.eval(); 195 | 196 | # %% 197 | CV_START_DAY = 100 198 | CV_DAYS = 25 199 | all_train = pd.concat([train, valid], axis=0) 200 | print_all_valid_score(all_train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 201 | batch_size = 8192, 
f=median_avg, threshold=0.5, 202 | target_cols=target_cols, feat_cols=feat_cols,resp_cols=resp_cols) 203 | # %% 204 | -------------------------------------------------------------------------------- /mlp/debug_ae_tf.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.losses import BinaryCrossentropy 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.callbacks import EarlyStopping 7 | from tensorflow.keras.layers.experimental.preprocessing import Normalization 8 | from tensorflow.keras.metrics import AUC 9 | import tensorflow as tf 10 | import kerastuner as kt 11 | import numpy as np 12 | import pandas as pd 13 | import pickle 14 | from sklearn.model_selection import GroupKFold 15 | 16 | from tqdm import tqdm 17 | from random import choices 18 | 19 | import os, sys 20 | 21 | HOME = os.path.abspath(os.path.join('.', os.pardir)) 22 | MODEL_DIR = os.path.join(HOME, 'models') 23 | DATA_DIR = os.path.join(HOME, 'data') 24 | sys.path.append(HOME) 25 | from utils import * 26 | from utils_js import * 27 | #%% 28 | TRAINING = True 29 | TRAINING_AE = True 30 | HP_SEARCH = True 31 | GPU = True 32 | USE_FINETUNE = True 33 | FOLDS = 5 34 | SEED = 1127 35 | 36 | if GPU: 37 | gpus = tf.config.experimental.list_physical_devices(device_type="GPU") 38 | tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU") 39 | tf.config.experimental.set_memory_growth(device=gpus[0], enable=True) 40 | 41 | # %% loading data 42 | with timer("Loading train parquet"): 43 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 44 | train = pd.read_parquet(train_parquet) 45 | print(train.info()) 46 | 47 | # %% 48 | with timer("preprocess train"): 49 | train = preprocess(train) 50 | 51 | #%% 52 | features = [c for c in train.columns if 'feature' in c] 53 | 54 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4'] 55 | 56 | X = train[features].values 57 | y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget 58 | 59 | f_mean = np.mean(train[features[1:]].values,axis=0) 60 | # %% AE 61 | 62 | def create_autoencoder(input_dim, output_dim, noise=0.05, dropout=0.15): 63 | i = Input(input_dim) 64 | encoded = BatchNormalization()(i) 65 | encoded = GaussianNoise(noise)(encoded) 66 | encoded = Dense(64,activation='relu')(encoded) 67 | decoded = Dropout(dropout)(encoded) 68 | decoded = BatchNormalization()(decoded) 69 | decoded = Dense(input_dim,name='decoded')(decoded) 70 | x = Dense(32,activation='relu')(decoded) 71 | x = BatchNormalization()(x) 72 | x = Dropout(dropout)(x) 73 | x = Dense(32,activation='relu')(x) 74 | x = BatchNormalization()(x) 75 | x = Dropout(dropout)(x) 76 | x = Dense(output_dim, activation='sigmoid', name='label_output')(x) 77 | 78 | encoder = Model(inputs=i,outputs=encoded) 79 | autoencoder = Model(inputs=i,outputs=[decoded,x]) 80 | 81 | autoencoder.compile(optimizer=Adam(0.001), 82 | loss={'decoded':'mse', 83 | 'label_output':'binary_crossentropy'}) 84 | return autoencoder, encoder 85 | 86 | def create_model(hp,input_dim,output_dim,encoder): 87 | inputs = Input(input_dim) 88 | 89 | x = encoder(inputs) 90 | x = Concatenate()([x,inputs, x]) #use both raw and encoded features 91 | x = BatchNormalization()(x) 92 | x = Dropout(hp.Float('init_dropout',0.0,0.5))(x) 93 | 94 | for i in 
range(hp.Int('num_layers',1,5)): 95 | x = Dense(hp.Int(f'num_units_{i}',64,256))(x) 96 | x = BatchNormalization()(x) 97 | x = Lambda(tf.keras.activations.swish)(x) 98 | x = Dropout(hp.Float(f'dropout_{i}',0.0,0.5))(x) 99 | x = Dense(output_dim,activation='sigmoid')(x) 100 | model = Model(inputs=inputs,outputs=x) 101 | model.compile(optimizer=Adam(hp.Float('lr',0.00001,0.1, 102 | default=0.001)), 103 | loss=BinaryCrossentropy(label_smoothing=hp.Float('label_smoothing',0.0,0.1)), 104 | metrics=[AUC(name = 'auc')]) 105 | return model 106 | # %% 107 | autoencoder, encoder = create_autoencoder(X.shape[-1],y.shape[-1],noise=0.1) 108 | if TRAINING_AE: 109 | autoencoder.fit(X, (X,y), 110 | epochs=1000, 111 | batch_size=4096*2, 112 | validation_split=0.1, 113 | callbacks=[EarlyStopping('val_loss', 114 | patience=10, 115 | restore_best_weights=True)]) 116 | encoder.save_weights(MODEL_DIR+'/encoder.hdf5') 117 | else: 118 | encoder.load_weights(MODEL_DIR+'/encoder.hdf5') 119 | 120 | encoder.trainable = True 121 | 122 | #%% 123 | 124 | class CVTuner(kt.engine.tuner.Tuner): 125 | def run_trial(self, trial, X, y, splits, batch_size=32, verbose=2, epochs=1, callbacks=None): 126 | val_losses = [] 127 | for idx_tr, idx_val in splits: 128 | X_train, X_val = [x[idx_tr] for x in X], [x[idx_val] for x in X] 129 | y_train, y_val = [a[idx_tr] for a in y], [a[idx_val] for a in y] 130 | if len(X_train) < 2: 131 | X_train = X_train[0] 132 | X_val = X_val[0] 133 | if len(y_train) < 2: 134 | y_train = y_train[0] 135 | y_val = y_val[0] 136 | 137 | model = self.hypermodel.build(trial.hyperparameters) 138 | hist = model.fit(X_train,y_train, 139 | validation_data=(X_val,y_val), 140 | epochs=epochs, 141 | batch_size=batch_size, 142 | callbacks=callbacks, 143 | verbose=verbose) 144 | 145 | val_losses.append([hist.history[k][-1] for k in hist.history]) 146 | 147 | val_losses = np.asarray(val_losses) 148 | self.oracle.update_trial(trial.trial_id, 149 | {k:np.mean(val_losses[:,i]) for i,k in enumerate(hist.history.keys())}) 150 | self.save_model(trial.trial_id, model) 151 | 152 | model_fn = lambda hp: create_model(hp,X.shape[-1],y.shape[-1], encoder) 153 | 154 | tuner = CVTuner( 155 | hypermodel=model_fn, 156 | directory=f'ae_mlp_{SEED}', 157 | oracle=kt.oracles.BayesianOptimization( 158 | objective= kt.Objective('val_auc', direction='max'), 159 | num_initial_points=10, 160 | max_trials=50)) 161 | 162 | gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=5) 163 | splits = list(gkf.split(y, groups=train['date'].values)) 164 | #%% 165 | if HP_SEARCH: 166 | tuner.search((X,),(y,), 167 | splits=splits, 168 | batch_size=8192, 169 | epochs=50, 170 | verbose=2, 171 | callbacks=[EarlyStopping('val_auc', 172 | mode='max', 173 | patience=5)]) 174 | hp = tuner.get_best_hyperparameters(1)[0] 175 | 176 | with open(MODEL_DIR+f'/best_hp_{SEED}.pkl', 'wb') as f: 177 | pickle.dump(hp, f, protocol=pickle.HIGHEST_PROTOCOL) 178 | tuner.results_summary() 179 | #%% 180 | if TRAINING: 181 | with open(MODEL_DIR+f'/best_hp_{SEED}.pkl', 'rb') as f: 182 | hp = pickle.load(f) 183 | 184 | for fold, (idx_tr, idx_val) in enumerate(splits): 185 | model = model_fn(hp) 186 | X_train, X_val = X[idx_tr], X[idx_val] 187 | y_train, y_val = y[idx_tr], y[idx_val] 188 | model.fit(X_train, 189 | y_train, 190 | validation_data=(X_val,y_val), 191 | epochs=100, 192 | batch_size=8192, 193 | callbacks=[EarlyStopping('val_auc', 194 | mode='max', 195 | patience=10, 196 | restore_best_weights=True)]) 197 | model.save_weights(MODEL_DIR + f'/model_{SEED}_{fold}.hdf5') 
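# finetune pass (descriptive note): after saving the fold weights above, the model is recompiled with a 100x smaller learning rate, fit for a few epochs on this fold's validation slice, and saved as the `_finetune` checkpoint that is loaded when USE_FINETUNE is set.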
198 | model.compile(Adam(hp.get('lr')/100),loss='binary_crossentropy') 199 | 200 | model.fit(X_val, y_val, epochs=3, batch_size=8192) 201 | model.save_weights(MODEL_DIR+f'/model_{SEED}_{fold}_finetune.hdf5') 202 | 203 | else: 204 | models = [] 205 | hp = pd.read_pickle(MODEL_DIR+f'/best_hp_{SEED}.pkl') 206 | for f in range(FOLDS): 207 | model = model_fn(hp) 208 | if USE_FINETUNE: 209 | model.load_weights(MODEL_DIR+f'/model_{SEED}_{f}_finetune.hdf5') 210 | else: 211 | model.load_weights(MODEL_DIR+f'/model_{SEED}_{f}.hdf5') 212 | models.append(model) 213 | # %% 214 | -------------------------------------------------------------------------------- /iter_cv.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.metrics import roc_auc_score 8 | import torch 9 | import tensorflow as tf 10 | from numba import njit 11 | import random 12 | import datetime 13 | 14 | HOME = os.path.dirname(os.path.abspath(__file__)) 15 | MODEL_DIR = HOME+'/models/' 16 | DATA_DIR = HOME+'/data/' 17 | from utils import * 18 | from utils_js import * 19 | # from nn.mlp import * 20 | 21 | DEBUG = False 22 | SEED = 1111 23 | START_SIMU_TEST = 490 # this day to 499 as simulated test days 24 | END_SIMU_TEST = 499 25 | TQDM_INT = 20 26 | batch_size = 5000 27 | label_smoothing = 1e-2 28 | learning_rate = 1e-3 29 | 30 | GPU = False 31 | 32 | if GPU: 33 | gpus = tf.config.experimental.list_physical_devices(device_type="GPU") 34 | tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU") 35 | tf.config.experimental.set_memory_growth(device=gpus[0], enable=True) 36 | else: 37 | cpus = tf.config.experimental.list_physical_devices(device_type='CPU') 38 | tf.config.experimental.set_visible_devices(devices= cpus, device_type='CPU') 39 | 40 | 41 | #%% 42 | ''' 43 | The mock test set is taken after the Purged Time series CV split last fold's test set: 44 | i.e., START_SIMU_TEST date needs to be > 382 45 | 46 | Reference: 47 | https://www.kaggle.com/jorijnsmit/found-the-holy-grail-grouptimeseriessplit 48 | https://www.kaggle.com/tomwarrens/purgedgrouptimeseriessplit-stacking-ensemble-mode 49 | ''' 50 | 51 | with timer("Loading train parquet"): 52 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 53 | train = pd.read_parquet(train_parquet) 54 | # print(train.info()) 55 | 56 | train['action'] = (train['resp'] > 0) 57 | for c in range(1,5): 58 | train['action'] = train['action'] & ((train['resp_'+str(c)] > 0)) 59 | features = [c for c in train.columns if 'feature' in c] 60 | 61 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 62 | 63 | # X = train[features].values 64 | # y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget 65 | 66 | f_mean = np.mean(train[features[1:]].values, axis=0) 67 | 68 | simu_test = train.query(f'date > {START_SIMU_TEST} & date <= {END_SIMU_TEST}').reset_index(drop = True) 69 | print(f"Simulated public test file length: {len(simu_test)}") 70 | 71 | #%% 72 | class Iter_Valid(object): 73 | 74 | global predicted 75 | predicted = [] 76 | 77 | def __init__(self, df, features, batch_size = 1): 78 | df = df.reset_index(drop=True) 79 | self.columns = ['weight'] + features + ['date'] 80 | self.df = df[self.columns] 81 | self.weight = df['weight'].astype(float).values 82 | self.action = df['action'].astype(int).values 83 | self.pred_df = df[['action']] 84 | # self.pred_df[['action']] = 0 85 | self.len = 
len(df) 86 | self.current = 0 87 | self.batch_size = batch_size 88 | 89 | def __iter__(self): 90 | return self 91 | 92 | def __next__(self): 93 | pre_start = self.current 94 | self.current += self.batch_size 95 | if self.current <= self.len: 96 | df = self.df[pre_start:self.current].copy() 97 | pred_df = self.pred_df[pre_start:self.current].copy() 98 | return df, pred_df 99 | elif self.current > self.len and (self.current - self.len < self.batch_size): 100 | df = self.df[pre_start:self.len].copy() 101 | pred_df = self.pred_df[pre_start:self.len].copy() 102 | return df, pred_df 103 | else: 104 | raise StopIteration() 105 | 106 | def predict(self,pred_df): 107 | predicted.append(pred_df) 108 | # %% seed 1111 overfit model 109 | hidden_units = [150, 150, 150] 110 | dropout_rates = [0.2, 0.2, 0.2, 0.2] 111 | 112 | def create_mlp_tf( 113 | num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate 114 | ): 115 | 116 | inp = tf.keras.layers.Input(shape=(num_columns,)) 117 | x = tf.keras.layers.BatchNormalization()(inp) 118 | x = tf.keras.layers.Dropout(dropout_rates[0])(x) 119 | for i in range(len(hidden_units)): 120 | x = tf.keras.layers.Dense(hidden_units[i])(x) 121 | x = tf.keras.layers.BatchNormalization()(x) 122 | x = tf.keras.layers.Activation(tf.keras.activations.swish)(x) 123 | x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x) 124 | 125 | x = tf.keras.layers.Dense(num_labels)(x) 126 | out = tf.keras.layers.Activation("sigmoid")(x) 127 | 128 | model = tf.keras.models.Model(inputs=inp, outputs=out) 129 | model.compile( 130 | optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), 131 | loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing), 132 | metrics=tf.keras.metrics.AUC(name="AUC"), 133 | ) 134 | return model 135 | 136 | 137 | model = create_mlp_tf(num_columns=len(features), 138 | num_labels=5, 139 | hidden_units=hidden_units, 140 | dropout_rates=dropout_rates, 141 | label_smoothing=label_smoothing, 142 | learning_rate=learning_rate) 143 | 144 | model.load_weights(os.path.join(MODEL_DIR,f'model_{SEED}.hdf5')) 145 | model.summary() 146 | models = [] 147 | models.append(model) 148 | 149 | #%% 10k pytorch model 150 | 151 | #%% 152 | if DEBUG: 153 | ''' 154 | Old testing code here: using class is much faster than iterrows() of pandas 155 | ''' 156 | test_columns = ['weight'] + features + ['date'] 157 | predicted = [] 158 | def set_predict(df): 159 | predicted.append(df) 160 | 161 | test_len = 1_000 162 | start = time() 163 | with tqdm(total=test_len) as pbar: 164 | for idx, row in simu_test.iterrows(): 165 | row = pd.DataFrame(row.values.reshape(1,-1), columns=list(row.index)) 166 | test_df = row[test_columns].astype(float) 167 | pred_df = row[['action']].astype(int) 168 | pred_df.action = (random.random() > 0.7) 169 | set_predict(pred_df) 170 | 171 | time_taken = time() - start 172 | total_time_est = time_taken / (idx+1) * 1000000 / 60 173 | pbar.set_description(f"Current speed = {total_time_est:.2f} minutes to complete inference") 174 | pbar.update(1) 175 | 176 | if idx >= test_len: 177 | break 178 | 179 | 180 | # %% 181 | 182 | if __name__ == '__main__': 183 | ''' 184 | inference simulation 185 | Using a customized class 186 | 187 | 188 | For the seed = 1111 overfit model for day 490-499: 189 | np.mean: 815.71 190 | np.median: 893.32 191 | avg median: 838.97 192 | thresh 0.51 + np.median: 824.71 193 | thresh 0.501 + np.median: 878.82 194 | thresh 0.498 + np.median: 902.64 195 | thresh 0.499 + np.median: 893.70 196 | thresh 0.4985 
+ np.median: 908.28 197 | 198 | 199 | ''' 200 | date = simu_test['date'].values 201 | weight = simu_test['weight'].values 202 | resp = simu_test['resp'].values 203 | action = simu_test['action'].values 204 | 205 | # f = np.mean # 206 | f = np.median 207 | # f = median_avg 208 | 209 | THRESHOLD = 0.4985 210 | 211 | iter_test = Iter_Valid(simu_test, features, batch_size=1) 212 | start = time() 213 | 214 | pbar = tqdm(total=len(simu_test)) 215 | for idx, (test_df, pred_df) in enumerate(iter_test): 216 | 217 | if test_df['weight'].item() > 0: 218 | x_tt = test_df.loc[:, features].values 219 | if np.isnan(x_tt[:, 1:].sum()): 220 | x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean 221 | pred = np.mean([model(x_tt, training = False).numpy() for model in models],axis=0) 222 | pred = f(pred.squeeze()) 223 | pred_df.action = np.where(pred >= THRESHOLD, 1, 0).astype(int) 224 | else: 225 | pred_df.action = 0 226 | 227 | iter_test.predict(pred_df) 228 | 229 | time_taken = time() - start 230 | total_time_est = time_taken / (idx+1) * 1000000 / 60 231 | pbar.set_description(f"Current speed = {total_time_est:.2f} minutes to complete inference") 232 | pbar.update() 233 | 234 | y_true = simu_test['action'] 235 | y_pred = pd.concat(predicted)['action'] 236 | print('\nValidation auc:', roc_auc_score(y_true, y_pred)) 237 | score = utility_score_bincount(date, weight, resp, y_true) 238 | score_pred = utility_score_bincount(date, weight, resp, y_pred) 239 | print('\nMax possible utility score:', score) 240 | print('\nModel utility score: ', score_pred) -------------------------------------------------------------------------------- /data/data_rolling.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm.auto import tqdm 7 | from collections import deque 8 | import collections 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = HOME+'/models/' 13 | DATA_DIR = HOME+'/data/' 14 | 15 | from utils import * 16 | from utils_js import * 17 | # %% 18 | ''' 19 | 1. Using the past day mean as fillna 20 | 2. For certain features use EWM (maybe too slow?) 21 | 22 | Past day mean 23 | Reference: Lucas Morin's notebook 24 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 25 | 26 | Modified by Shuhao Cao and Ethan Zheng to 27 | 1. able to return past day trading numbers. 28 | 2. 
able to use feature 64 to predict whether a day is ''busy'' 29 | 30 | ''' 31 | 32 | 33 | class RunningPDA: 34 | ''' 35 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 36 | ''' 37 | def __init__(self, past_mean=0, start=1000, end=2500, slope=0.00116): 38 | self.day = -1 39 | self.past_mean = past_mean # past day mean, initialized as the mean 40 | self.cum_sum = 0 41 | self.day_instances = 0 # current day instances 42 | self.past_value = past_mean # the previous row's value, initialized as the mean 43 | self.past_instances = 0 # instances in the past day 44 | 45 | self.start = start 46 | self.end = end 47 | self.slope = slope 48 | self.start_value = None 49 | self.end_value = None 50 | 51 | def clear(self): 52 | self.n = 0 53 | self.windows.clear() 54 | 55 | def push(self, x, date): 56 | x = fast_fillna(x, self.past_value) 57 | self.past_value = x 58 | 59 | # change of day 60 | if date > self.day: 61 | self.day = date 62 | if self.day_instances > 0: 63 | self.past_mean = self.cum_sum/self.day_instances 64 | self.past_instances = self.day_instances 65 | self.day_instances = 1 66 | self.cum_sum = x 67 | 68 | self.start_value, self.end_value = None, None 69 | 70 | else: 71 | self.day_instances += 1 72 | self.cum_sum += x 73 | 74 | if self.day_instances == self.start: 75 | self.start_value = x[:, 64] 76 | if self.day_instances == self.end: 77 | self.end_value = x[:, 64] 78 | 79 | def get_mean(self): 80 | return self.cum_sum/self.day_instances 81 | 82 | def get_past_mean(self): 83 | return self.past_mean 84 | 85 | def get_past_trade(self): 86 | return self.past_instances 87 | 88 | def predict_today_busy(self): 89 | if self.start_value is None or self.end_value is None: 90 | return False 91 | return (self.end_value - self.start_value) / (self.end - self.start) < self.slope 92 | 93 | class RunningEWMeanDay: 94 | ''' 95 | Reference: Lucas Morin 96 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 97 | Modified to do the rolling mean only intraday 98 | ''' 99 | def __init__(self, window=20, num_feat = 1, lt_mean = None): 100 | if lt_mean is not None: 101 | self.s = lt_mean 102 | else: 103 | self.s = np.zeros(num_feat) 104 | self.past_value = np.zeros(num_feat) 105 | self.alpha = 2 /(window + 1) 106 | self.day = -1 107 | 108 | def clear(self): 109 | self.s = 0 110 | 111 | def push(self, x, date): 112 | 113 | x = fast_fillna(x, self.past_value) 114 | self.past_value = x 115 | 116 | if date > self.day: 117 | self.day = date 118 | self.clear() 119 | self.s = x 120 | else: 121 | self.s = self.alpha * x + (1 - self.alpha) * self.s 122 | 123 | def get_mean(self): 124 | return self.s 125 | 126 | 127 | class RunningMeanDay: 128 | ''' 129 | Reference: Lucas Morin 130 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 131 | Modified to do the rolling mean only intraday 132 | ''' 133 | def __init__(self, window=1000, num_feat = 1): 134 | self.day = -1 135 | self.n = 0 136 | self.mean = 0 137 | self.run_var = 0 138 | self.window = window 139 | self.past_value = 0 140 | self.windows = deque(maxlen=window+1) 141 | self.num_feat=num_feat 142 | 143 | def clear(self): 144 | self.n = 0 145 | self.windows.clear() 146 | 147 | def push(self, x, date): 148 | 149 | x = fast_fillna(x, self.past_value) 150 | self.past_value = x 151 | 152 | if date > self.day: 153 | self.day = date 154 | self.clear() 155 | self.windows.append(x) 156 | self.n = 1 157 | self.mean = x 158 | self.run_var = 0 
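# same-day branch below: append to the window and update the running mean/variance (incremental Welford-style updates while the window is filling, then a sliding-window correction once it is full).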
159 | else: 160 | self.windows.append(x) 161 | 162 | if self.n < self.window: 163 | # Calculating first variance 164 | self.n += 1 165 | delta = x - self.mean 166 | self.mean += delta / self.n 167 | self.run_var += delta * (x - self.mean) 168 | else: 169 | # Adjusting variance 170 | x_removed = self.windows.popleft() 171 | old_m = self.mean 172 | self.mean += (x - x_removed) / self.window 173 | self.run_var += (x + x_removed - old_m - self.mean) * (x - x_removed) 174 | 175 | def get_mean(self): 176 | return self.mean if self.n else np.zeros(self.num_feat) 177 | 178 | def get_var(self): 179 | return self.run_var / (self.n) if self.n > 1 else np.zeros(self.num_feat) 180 | 181 | def get_std(self): 182 | return math.sqrt(self.get_var()) 183 | 184 | def get_all(self): 185 | return list(self.windows) 186 | 187 | def __str__(self): 188 | return "Current window values: {}".format(list(self.windows)) 189 | 190 | 191 | #%% 192 | def load_train(drop_days=None, zero_weight=True): 193 | with timer("Loading train parquet"): 194 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 195 | train = pd.read_parquet(train_parquet) 196 | if drop_days: 197 | train = train.query(f'date not in {drop_days}').reset_index (drop = True) 198 | 199 | if not zero_weight: 200 | train = train.query('weight > 0').reset_index (drop = True) 201 | 202 | feat_cols = [f'feature_{i}' for i in range(130)] 203 | # train[feat_cols].mean().to_csv(os.path.join(DATA_DIR, 'f_mean_final.csv'), 204 | # index_label=['features'], header=['mean']) 205 | f_mean = train[feat_cols].mean().values.reshape(1,-1) 206 | if zero_weight: 207 | np.save(DATA_DIR+'f_mean_after_85_include_zero_weight.npy', f_mean) 208 | else: 209 | np.save(DATA_DIR+'f_mean_after_85_positive_weight.npy', f_mean) 210 | return train 211 | 212 | 213 | def process_train_rolling(train, debug=False): 214 | TRAIN_ROWS = 50_000 215 | if debug: 216 | train = train[:TRAIN_ROWS] 217 | 218 | f_mean = train.mean().values 219 | 220 | train_dtypes = {'date': np.int32, 221 | 'ts_id': np.int64, 222 | 'resp': np.float64, 223 | 'weight': np.float64, 224 | } 225 | for c in range(1,5): 226 | train_dtypes['resp_'+str(c)] = np.float64 227 | for c in range(130): 228 | train_dtypes['feature_'+str(c)] = np.float32 229 | 230 | pdm = RunningPDA(past_mean=f_mean) 231 | 232 | with tqdm(total=len(train)) as pbar: 233 | row_vals = [] 234 | for _, row in train.iterrows(): 235 | date = row['date'] 236 | pdm.push(np.array(row), date) 237 | 238 | past_day_mean = pdm.get_past_mean() 239 | 240 | x_tt = row.values 241 | if np.isnan(x_tt.sum()): 242 | x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt) * past_day_mean 243 | 244 | row_vals.append(x_tt) 245 | pbar.update() 246 | 247 | train_pdm = pd.DataFrame(row_vals, columns=train.columns, index=train.index).astype(train_dtypes) 248 | 249 | if not debug: 250 | train_pdm.to_parquet(os.path.join(DATA_DIR, 'train_pdm.parquet'), index=False) 251 | 252 | 253 | # %% 254 | 255 | if __name__ == '__main__': 256 | get_system() 257 | train = load_train(drop_days=[2, 36, 270, 294]) 258 | # train = load_train(drop_days=list(range(0,86))+[270, 294]) 259 | process_train_rolling(train, debug=True) 260 | -------------------------------------------------------------------------------- /mlp/run_train_final_4.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import os 3 | import sys 4 | current_path = os.path.dirname(os.path.abspath(__file__)) 5 | HOME = os.path.dirname(current_path) 6 | sys.path.append(HOME) 7 | 8 | from utils import * 
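# NOTE: MODEL_DIR and DATA_DIR are referenced later in this script (e.g. np.load(DATA_DIR+'spike_common_vals_42.npy'), the checkpoint paths) but are not defined here; the two assignments below assume the same layout used by the other training scripts in mlp/.
MODEL_DIR = os.path.join(HOME, 'models')
DATA_DIR = os.path.join(HOME, 'data')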
9 | from utils_js import * 10 | 11 | import pandas as pd 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torchsummary import summary 16 | 17 | 18 | from mlp import * 19 | pd.set_option('display.max_rows', 100) 20 | pd.set_option('display.max_columns', 100) 21 | #%% 22 | ''' 23 | Final model spikenet: 24 | 25 | 1. subtract the most common values from columns with a spike in the histogram to form cat features. 26 | ''' 27 | 28 | 29 | # %% 30 | BATCH_SIZE = 8192 31 | FINETUNE_BATCH_SIZE = 4096_00 32 | 33 | LEARNING_RATE = 1e-4 34 | WEIGHT_DECAY = 1e-5 35 | EPOCHS = 100 36 | EARLYSTOP_NUM = 5 37 | ALPHA = 0.6 38 | EPSILON = 5e-2 # strength of the regularizer 39 | VOLATILE_MODEL = True 40 | 41 | s = 4 42 | SEED = 1127*s 43 | np.random.seed(SEED) 44 | pd.core.common.random_state(SEED) 45 | torch.manual_seed(SEED) 46 | torch.cuda.manual_seed(SEED) 47 | torch.backends.cudnn.deterministic = True 48 | torch.backends.cudnn.benchmark = False 49 | if torch.cuda.is_available(): 50 | torch.cuda.manual_seed_all(SEED) 51 | 52 | splits = { 53 | 'train_days': (range(0,457), range(0,424), range(0,391)), 54 | 'valid_days': (range(467, 500), range(434, 466), range(401, 433)), 55 | } 56 | fold = 2 57 | 58 | if fold == 0: 59 | SAVE_THRESH = 1300 60 | VAL_OFFSET = 100 61 | elif fold == 1: 62 | SAVE_THRESH = 1200 63 | VAL_OFFSET = 150 64 | elif fold == 2: 65 | SAVE_THRESH = 90 66 | VAL_OFFSET = 100 67 | EPOCHS = 40 68 | LEARNING_RATE = 1e-3 69 | EPSILON = 1e-2 70 | 71 | VOLATILE_DAYS = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 72 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 73 | 74 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 75 | # %% 76 | with timer("Preprocessing train"): 77 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 78 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 79 | train = pd.read_parquet(train_parquet) 80 | # %% 81 | # feat_reg_index = [0, 17, 18, 37, 39, 40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 57, 58] 82 | # feat_reg_index += list(range(60,69)) 83 | # feat_reg_index += [89, 101, 108, 113, 119, 120, 121, 122, 124, 125, 126, 128] 84 | # feat_spike_index_temp = list(set(range(130)).difference(feat_reg_index)) 85 | # features_reg = [f'feature_{i}' for i in feat_reg_index] 86 | # features_spike = [f'feature_{i}' for i in feat_spike_index_temp] 87 | 88 | 89 | # %% 90 | # feat_spike_index = [eval(s) for s in feat_spike_index] 91 | # for f in feat_spike_index: 92 | # print(f'{f},', end=' ') 93 | # %% 94 | # feat_spike_index = [1, 2, 3, 4, 5, 6, 14, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 95 | feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 96 | 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 97 | feat_reg_index = list(set(range(130)).difference(feat_spike_index)) 98 | features_reg = [f'feature_{i}' for i in feat_reg_index] 99 | features_spike = [f'feature_{i}' for i in feat_spike_index] 100 | 101 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4', ] 102 | target_cols = ['action_1', 'action_2', 'action_3', 'action', 'action_4'] 103 | 104 | feat_cols = [f'feature_{i}' for i in range(130)] 105 | # feat_cols = features_reg 106 | cat_cols = [f+'_c' for f in features_spike] 107 | print(f"Number of features with spike: {len(cat_cols)}") 108 | # %% 109 | 110 | feat_spike_index = [] 111 | 
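# spike/categorical features: load the pre-computed most common value of each spiky column (spike_common_vals_42.npy), subtract it from the raw feature, and cast to int to build the `_c` categorical inputs consumed by SpikeNet; the commented-out value_counts() lines below show how those constants were originally selected.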
most_common_vals = [] 112 | most_common_vals = np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 113 | 114 | for i, feat in tqdm(enumerate(features_spike)): 115 | # sorted_counts = train[feat].value_counts().sort_values(ascending=False) 116 | # print(sorted_counts.head(5), '\n\n') 117 | # if sorted_counts.iloc[0]/sorted_counts.iloc[1] > 30 and sorted_counts.iloc[0] > 5000: 118 | # feat_spike_index.append(sorted_counts.name.split('_')[-1]) 119 | # most_common_val = sorted_counts.index[0] 120 | # most_common_vals.append(most_common_val) 121 | train[feat+'_c'] = (train[feat] - most_common_vals[i]).astype(int) 122 | # print(train[feat+'_c'].astype(int).value_counts()[:5]) 123 | 124 | # %% 125 | train = train.query(f'date not in {[2, 36, 270, 294]}').reset_index(drop=True) 126 | 127 | 128 | if not VOLATILE_MODEL: 129 | train = train.query('date > 85').reset_index(drop=True) 130 | # train = train.query(f'date not in {VOLATILE_DAYS}').reset_index(drop=True) 131 | # train.fillna(train.mean(), inplace=True) 132 | train = train[train['weight'] != 0].reset_index(drop=True) 133 | train['action'] = (train['resp'] > 0).astype('int') 134 | 135 | for c in range(1, 5): 136 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(np.int32) 137 | 138 | valid = train.loc[train.date.isin(splits['valid_days'][fold])].reset_index(drop=True) 139 | train = train.loc[train.date.isin(splits['train_days'][fold])].reset_index(drop=True) 140 | # %% 141 | 142 | 143 | train_set = MarketDatasetCat(train, 144 | features=feat_cols, cat_features=cat_cols, 145 | targets=target_cols, resp=resp_cols) 146 | train_loader = DataLoader( 147 | train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 148 | 149 | valid_set = MarketDatasetCat(valid, features=feat_cols, cat_features=cat_cols, 150 | targets=target_cols, resp=resp_cols) 151 | valid_loader = DataLoader( 152 | valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 153 | # %% 154 | util_cols = resp_cols 155 | # util_cols = ['resp'] 156 | resp_index = [resp_cols.index(r) for r in util_cols] 157 | regularizer = UtilityLoss(alpha=EPSILON, scaling=12, 158 | normalize=None, resp_index=resp_index) 159 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 160 | 161 | model = SpikeNet() 162 | model.to(device) 163 | summary(model, [(len(feat_cols),), (len(cat_cols),)]) 164 | 165 | optimizer = torch.optim.Adam( 166 | model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 167 | 168 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 169 | # steps_per_epoch=len( 170 | # train_loader), 171 | # epochs=EPOCHS) 172 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 173 | T_0=50, T_mult=2, 174 | eta_min=LEARNING_RATE*1e-4, last_epoch=-1) 175 | 176 | finetune_loader = DataLoader( 177 | train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 178 | 179 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-2) 180 | 181 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", 182 | save_threshold=SAVE_THRESH, util_offset=VAL_OFFSET) 183 | 184 | # %% 185 | 186 | lr = [] 187 | 188 | for epoch in range(EPOCHS): 189 | 190 | train_loss = train_epoch_cat( 191 | model, optimizer, scheduler, loss_fn, train_loader, device) 192 | # train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 193 | lr.append(optimizer.param_groups[0]['lr']) 194 | 195 | if (epoch+1) % 10 == 0: 196 | _ = train_epoch_ft_cat(model, finetune_optimizer, 
scheduler, 197 | regularizer, finetune_loader, device, loss_fn=loss_fn) 198 | 199 | valid_pred = valid_epoch(model, valid_loader, device, cat_input=True) 200 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 201 | f=median_avg, threshold=0.5, target_cols=target_cols) 202 | model_file = MODEL_DIR + \ 203 | f"/emb_fold_{fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 204 | early_stop(epoch, valid_auc, model, model_path=model_file, 205 | epoch_utility_score=valid_score) 206 | 207 | # if early_stop.model_saved: 208 | # for g in optimizer.param_groups: 209 | # g['lr'] *= 0.1 210 | # lr[-1] = optimizer.param_groups[0]['lr'] 211 | # tqdm.write(f"\nNew learning rate: {lr[-1]:.4e}") 212 | 213 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {fold}") 214 | tqdm.write( 215 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 216 | tqdm.write( 217 | f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 218 | tqdm.write( 219 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 220 | if early_stop.early_stop: 221 | print("\nEarly stopping") 222 | break 223 | # %% 224 | -------------------------------------------------------------------------------- /mlp/run_train_finetune.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | 20 | ''' 21 | Training script finetuning using resp colums as regularizer 22 | ''' 23 | 24 | DEBUG = False 25 | LOAD_PRETRAIN = False 26 | TRAINING_START = 86 # 86 by default 27 | FINETUNE_BATCH_SIZE = 2048_00 28 | BATCH_SIZE = 8196 29 | EPOCHS = 120 30 | LEARNING_RATE = 1e-3 31 | WEIGHT_DECAY = 1e-5 32 | EARLYSTOP_NUM = 6 33 | NFOLDS = 1 34 | SCALING = 10 35 | THRESHOLD = 0.5 36 | DAYS_TO_DROP = [2, 36, 270, 294] 37 | CV_START_DAY = 100 38 | CV_DAYS = 50 39 | 40 | SEED = 1127802 41 | get_seed(SEED) 42 | 43 | # f = np.median 44 | # f = np.mean 45 | f = median_avg 46 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 47 | 48 | # %% 49 | with timer("Preprocessing train"): 50 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 51 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 52 | train, valid = preprocess_pt(train_parquet, day_start=TRAINING_START, 53 | drop_days=DAYS_TO_DROP, 54 | drop_zero_weight=True, denoised_resp=False) 55 | 56 | print(f'action based on resp mean: ', train['action'].astype(int).mean()) 57 | for c in range(1, 5): 58 | print(f'action based on resp_{c} mean: ', 59 | train['action_'+str(c)].astype(int).mean()) 60 | 61 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 62 | resp_cols_all = resp_cols 63 | target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4'] 64 | feat_cols = [f'feature_{i}' for i in range(130)] 65 | 66 | 67 | # f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 68 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 69 | 70 | ###### adding weight to the features ####### 71 | # feat_cols.extend(['weight']) 72 | # %% 73 | 
train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 74 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 75 | 76 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 77 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 78 | 79 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 80 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 81 | model.to(device) 82 | summary(model, input_size=(len(feat_cols), )) 83 | # %% 84 | ''' 85 | fine-tuning the trained model based on resp or utils 86 | current fine-tuning train set is all train 87 | max batch_size: 88 | 3 resps: 102400 89 | 90 | current best setting: 91 | ''' 92 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 93 | 94 | # util_cols = ['resp', 'resp_1', 'resp_2'] 95 | # util_cols = ['resp', 'resp_4'] 96 | util_cols = resp_cols 97 | 98 | resp_index = [resp_cols_all.index(r) for r in util_cols] 99 | 100 | # regularizer = RespMSELoss(alpha=1e-1, scaling=1, resp_index=resp_index) 101 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, normalize=None, resp_index=resp_index) 102 | 103 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 104 | 105 | all_train = pd.concat([train, valid], axis=0) 106 | all_train_set = ExtendedMarketDataset(all_train, features=feat_cols, targets=target_cols, resp=resp_cols) 107 | train_loader = DataLoader(all_train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 108 | 109 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 110 | # optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 111 | # optimizer = Lookahead(optimizer=optimizer, alpha=1e-1) 112 | 113 | scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 114 | steps_per_epoch=len(train_loader), 115 | epochs=EPOCHS) 116 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 117 | # T_0=10, T_mult=1, 118 | # eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 119 | 120 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 121 | 122 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 123 | 124 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, 125 | mode="max", save_threshold=6000) 126 | 127 | # %% 128 | if LOAD_PRETRAIN: 129 | print("Loading model for finetune.") 130 | _fold = 0 131 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 132 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_ft_old_fold_{_fold}.pth") 133 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_finetune_fold_{_fold}.pth") 134 | try: 135 | model.load_state_dict(torch.load(model_weights)) 136 | except: 137 | model.load_state_dict(torch.load( 138 | model_weights, map_location=torch.device('cpu'))) 139 | model.eval() 140 | valid_pred = valid_epoch(model, valid_loader, device) 141 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 142 | f=median_avg, threshold=0.5, target_cols=target_cols) 143 | 144 | print(f"valid_utility:{valid_score:.2f} \t valid_auc:{valid_auc:.4f}") 145 | # %% 146 | _fold = 1 147 | SEED = 1127802 148 | get_seed(SEED+SEED*_fold) 149 | 150 | for epoch in range(EPOCHS): 151 | 152 | train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 153 | # 
train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 154 | lr = optimizer.param_groups[0]['lr'] 155 | if (epoch+1) % 10 == 0: 156 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 157 | regularizer, finetune_loader, device, loss_fn=loss_fn) 158 | 159 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 160 | batch_size =2*8192, f=median_avg, threshold=0.5, 161 | target_cols=target_cols, 162 | feat_cols=feat_cols, 163 | resp_cols=resp_cols) 164 | 165 | valid_pred = valid_epoch(model, valid_loader, device) 166 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 167 | f=median_avg, threshold=0.5, target_cols=target_cols) 168 | # model_file = MODEL_DIR + \ 169 | # f"/resmlp_interleave_{_fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 170 | model_file = MODEL_DIR + \ 171 | f"/resw_interleave_{_fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 172 | early_stop(epoch, valid_auc, model, model_path=model_file, 173 | epoch_utility_score=valid_score) 174 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 175 | tqdm.write( 176 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr:.4e}") 177 | tqdm.write( 178 | f"Best util: {early_stop.best_utility_score:.2f} \t {early_stop.message} ") 179 | tqdm.write( 180 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 181 | if early_stop.early_stop: 182 | print("\nEarly stopping") 183 | break 184 | 185 | if DEBUG: 186 | torch.save(model.state_dict(), MODEL_DIR + f"/resmlp_interleave_fold_{_fold}.pth") 187 | # %% 188 | _fold = 4 189 | # model_file = f"resmlp_interleave_0_util_7437_auc_0.6389.pth" 190 | # model_file = f"resmlp_ft_old_fold_{_fold}.pth" # fold 1, 3, 4 good 191 | # model_file = f"resmlp_finetune_fold_{_fold}.pth" 192 | model_file = f"resw_interleave_1_util_6455_auc_0.6237.pth" 193 | # model_file = f"resw_interleave_1_util_6333_auc_0.6211.pth" 194 | # model_file = f"resmlp_{_fold}.pth" 195 | print(f"Loading {model_file} for cv check.\n") 196 | model_weights = os.path.join(MODEL_DIR, model_file) 197 | 198 | model.to(device) 199 | feat_cols = [f'feature_{i}' for i in range(130)] 200 | feat_cols.extend(['weight']) 201 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 202 | 203 | 204 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 205 | model.to(device) 206 | model.load_state_dict(torch.load(model_weights)) 207 | # model.load_state_dict(torch.load( 208 | # model_weights, map_location=torch.device('cpu'))) 209 | model.eval(); 210 | 211 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 212 | train = preprocess_pt(train_parquet, day_start=0, day_split=None, 213 | drop_zero_weight=False) 214 | 215 | 216 | # %% 217 | -------------------------------------------------------------------------------- /data/data_final.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | import sys 4 | current_path = os.path.dirname(os.path.abspath(__file__)) 5 | HOME = os.path.dirname(current_path) 6 | sys.path.append(HOME) 7 | 8 | import pandas as pd 9 | pd.set_option('display.max_rows', 100) 10 | pd.set_option('display.max_columns', 100) 11 | 12 | import numpy as np 13 | import datatable as dt 14 | from tqdm.auto import tqdm 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | sns.set(style="darkgrid", context="talk") 19 | from jupyterthemes import jtplot 20 | 
jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False) 21 | 22 | 23 | MODEL_DIR = HOME+'/models/' 24 | DATA_DIR = HOME+'/data/' 25 | from utils import * 26 | from utils_js import * 27 | from data.data_rolling import RunningPDA, RunningEWMeanDay, RunningMeanDay 28 | 29 | # %% 30 | ''' 31 | data preparation for the final submission (in order) 32 | 33 | 1. Drop outliers [2, 294], low volume days [36, 270]. 34 | 2. fillna() uses past day mean including all weight zero rows. 35 | 3. Most common values fillna for spike features rows (a small random noise added). 36 | 4. all data, only drop the two partial days and the two <2k ts_id days. 37 | 5. smoother data, aside from 1, query day > 85, drop ts_id > 8700 days. 38 | 6. Final training uses only weight > 0 rows, but with a randomly 39 | selected 40% of weight zero rows' weight being replaced by 1e-7 to 40 | reduce overfitting. 41 | 7. a new denoised target is generated with all five targets. 42 | 43 | testing out new features 44 | - ewm for feature_0 45 | - moving average for feature_0 46 | 47 | Reference: Carl McBride Ellis 48 | https://www.kaggle.com/carlmcbrideellis/semper-augustus-pre-process-training-data 49 | 50 | Past day mean/EW mean push 51 | Reference: Lucas Morin's notebook 52 | https://www.kaggle.com/lucasmorin/running-algos-fe-for-fast-inference?scriptVersionId=50754012 53 | ''' 54 | # %% 55 | with timer("Loading train"): 56 | train_csv = os.path.join(DATA_DIR, 'train.csv') 57 | train = dt.fread(train_csv).to_pandas() 58 | 59 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 60 | # train = pd.read_parquet(train_parquet) 61 | 62 | # train = train.set_index('ts_id') 63 | train = train.query('date not in [2, 36, 270, 294]').reset_index(drop=True) 64 | # %% 65 | # the first one is used for model 66 | # feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 67 | # 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 68 | 69 | # this one is used for fillna 70 | feat_spike_index = [1, 2, 69, 71, 85, 87, 88, 91, 93, 94, 97, 99, 100, 103, 105, 106] 71 | 72 | noisy_index = [3, 4, 5, 6, 8, 10, 12, 14, 16, 37, 38, 39, 40, 72, 73, 74, 75, 76, 73 | 78, 79, 80, 81, 82, 83] 74 | negative_index = [73, 75, 76, 77, 79, 81, 82] 75 | hybrid_index = [55, 56, 57, 58, 59] 76 | running_indices = sorted([0]+noisy_index+negative_index+hybrid_index) 77 | features_running = [f'feature_{i}' for i in running_indices] 78 | 79 | feat_reg_index = list(set(range(130)).difference(feat_spike_index)) 80 | features_reg = [f'feature_{i}' for i in feat_reg_index] 81 | features_spike = [f'feature_{i}' for i in feat_spike_index] 82 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4', ] 83 | 84 | feat_cols = [f'feature_{i}' for i in range(130)] 85 | # feat_cols = features_reg 86 | feat_cols_c = feat_cols + [f+'_c' for f in features_spike] 87 | print(f"Number of features: {len(feat_cols)}") 88 | print(f"Number of spike fillna features: {len(features_spike)}") 89 | # %% 90 | try: 91 | feat_mean = np.load(DATA_DIR+'f_mean_all_days_include_zero_weight.npy') 92 | except: 93 | feat_mean = train[feat_cols].mean().values.reshape(1,-1) 94 | np.save(DATA_DIR+'f_mean_all_days_include_zero_weight.npy', feat_mean) 95 | all_mean = train.mean().values 96 | #%% 97 | # %% 98 | try: 99 | spike_fillna_val = np.load(DATA_DIR+'fillna_val_spike_feats.npy') 100 | except: 101 | most_common_vals = [] 102 | # most_common_vals = 
np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 103 | 104 | for i, feat in enumerate(features_spike): 105 | sorted_counts = train[feat].value_counts().sort_values(ascending=False) 106 | print(sorted_counts.head(5), '\n\n') 107 | # if sorted_counts.iloc[0]/sorted_counts.iloc[1] > 30 and sorted_counts.iloc[0] > 5000: 108 | # feat_spike_index.append(sorted_counts.name.split('_')[-1]) 109 | most_common_val = sorted_counts.index[0] 110 | most_common_vals.append(most_common_val) 111 | 112 | spike_fillna_val = np.zeros((len(feat_cols), )) 113 | spike_fillna_val[feat_spike_index] = np.array(most_common_vals) 114 | np.save(DATA_DIR+'fillna_val_spike_feats.npy', spike_fillna_val) 115 | 116 | #%% 117 | 118 | class RunningPDAFinal(): 119 | ''' 120 | The subclass only for data-preparation, not for final submission pipeline 121 | ''' 122 | def __init__(self, past_mean=all_mean): 123 | self.day = -1 124 | self.past_mean = past_mean # past day mean, initialized as the mean 125 | self.cum_sum = 0 126 | self.day_instances = 0 # current day instances 127 | self.past_value = past_mean # the previous row's value, initialized as the mean 128 | self.past_instances = 0 # instances in the past day 129 | self.past_day_data = np.zeros_like(past_mean) 130 | self.current_day_data = past_mean 131 | 132 | def push(self, x, date): 133 | x = fast_fillna(x, self.past_value) 134 | self.past_value = x 135 | 136 | # change of day 137 | if date > self.day: 138 | self.day = date 139 | if self.day_instances > 0: 140 | self.past_mean = self.cum_sum/self.day_instances 141 | self.past_instances = self.day_instances 142 | self.day_instances = 1 143 | self.cum_sum = x 144 | self.past_day_data = np.array(self.current_day_data) 145 | # print(self.past_day_data[0]) 146 | self.current_day_data = [] 147 | self.current_day_data.append(list(x)) 148 | # print(self.current_day_data) 149 | # print(x[0]) 150 | 151 | else: 152 | self.day_instances += 1 153 | self.cum_sum += x 154 | self.current_day_data.append(list(x)) 155 | 156 | 157 | def get_mean(self): 158 | return self.cum_sum/self.day_instances 159 | 160 | def get_past_mean(self): 161 | return self.past_mean 162 | 163 | def get_past_mean_numpy(self): 164 | return np.mean(self.past_day_data, axis=0) 165 | 166 | def get_past_std(self): 167 | return np.std(self.past_day_data, axis=0) 168 | #%% 169 | feat_mean = feat_mean.reshape(-1) 170 | pdm = RunningPDAFinal(past_mean=feat_mean) 171 | 172 | feat_vals = [] 173 | # nonfeat_cols = ['date', 'weight', 'resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4',] 174 | n_feats = len(feat_cols) 175 | spike_fillna_val = spike_fillna_val.reshape(-1) 176 | 177 | with tqdm(total=len(train)) as pbar: 178 | 179 | for _, row in train.iterrows(): 180 | date = row['date'] 181 | x_tt = row.values[7:-1] 182 | assert x_tt[0] == 1 or x_tt[0] == -1 183 | pdm.push(x_tt, date) 184 | 185 | past_day_mean = pdm.get_past_mean().reshape(-1) 186 | past_day_mean[feat_spike_index] = 0 187 | fillna_val = past_day_mean + spike_fillna_val 188 | if np.isnan(x_tt.sum()): 189 | # x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt)*spike_fillna_val # bug!!!!!! 
190 | x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt)*fillna_val 191 | 192 | feat_vals.append(x_tt) 193 | pbar.update() 194 | #%% 195 | train_dtypes = {'date': np.int32, 196 | 'ts_id': np.int64, 197 | 'resp': np.float64, 198 | 'weight': np.float64, 199 | } 200 | for c in range(1,5): 201 | train_dtypes['resp_'+str(c)] = np.float64 202 | for c in range(130): 203 | train_dtypes['feature_'+str(c)] = np.float32 204 | 205 | #%% 206 | feature_df = pd.DataFrame(feat_vals, columns=feat_cols, index=train.index) 207 | 208 | # %% 209 | train_final = train.copy() 210 | train_final[feat_cols] = feature_df 211 | train_final = train_final.astype(train_dtypes) 212 | # %% 213 | # train_final = train_final.astype(train_dtypes) 214 | train_final.to_parquet(os.path.join(DATA_DIR, 'train_final.parquet'), index=False) 215 | # %% 216 | train_final.to_feather(os.path.join(DATA_DIR, 'train_final.feather')) 217 | # %% 218 | trades_per_day = train_final.groupby(['date'])['ts_id'].count() 219 | volatile_days = pd.DataFrame(trades_per_day[trades_per_day > 8600]) 220 | print("Number of volatile days",volatile_days.count()) 221 | filter_list = volatile_days.index.to_list() 222 | 223 | #%% 224 | filter_list = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 225 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 226 | train_final_regular = train_final.query('date != @filter_list').reset_index(drop = True) 227 | train_final_regular = train_final.query('date >85').reset_index(drop = True) 228 | # %% 229 | train_final_regular.to_parquet(os.path.join(DATA_DIR, 'train_final_regular.parquet'), index=False) 230 | # %% 231 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import sys 3 | import os 4 | def add_sys_path(): 5 | try: 6 | for f in ['/home/scao/anaconda3/lib/python3.8/lib-dynload', 7 | '/home/scao/anaconda3/lib/python3.8/site-packages']: 8 | sys.path.append(f) 9 | except: 10 | RuntimeError 11 | print("Path not added") 12 | add_sys_path() 13 | 14 | 15 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1" 16 | import random as rd 17 | from contextlib import contextmanager 18 | from collections import defaultdict 19 | from time import time 20 | import matplotlib.pyplot as plt 21 | from datetime import date 22 | import math 23 | import numpy as np 24 | import pandas as pd 25 | import psutil 26 | import torch 27 | import pickle 28 | import seaborn as sns 29 | sns.set() 30 | from sklearn.metrics import roc_auc_score 31 | 32 | 33 | 34 | SEED = 1127 35 | 36 | def get_size(bytes, suffix='B'): 37 | ''' 38 | by Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified 39 | Scale bytes to its proper format 40 | e.g: 41 | 1253656 => '1.20MiB' 42 | 1253656678 => '1.17GiB' 43 | ''' 44 | for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: 45 | if abs(bytes) < 1024.0: 46 | return f"{bytes:3.2f} {unit}{suffix}" 47 | bytes /= 1024.0 48 | return f"{bytes:3.2f} 'Yi'{suffix}" 49 | 50 | def get_file_size(filename): 51 | file_size = os.stat(filename) 52 | return get_size(file_size.st_size) 53 | 54 | 55 | def get_system(): 56 | print("="*40, "CPU Info", "="*40) 57 | # number of cores 58 | print("Physical cores :", psutil.cpu_count(logical=False)) 59 | print("Total cores :", psutil.cpu_count(logical=True)) 60 | # CPU frequencies 61 | cpufreq = psutil.cpu_freq() 62 | print(f"Max Frequency : {cpufreq.max:.2f} Mhz") 63 | print(f"Min Frequency : {cpufreq.min:.2f} Mhz") 64 | print(f"Current Frequency: {cpufreq.current:.2f} 
Mhz") 65 | 66 | print("="*40, "Memory Info", "="*40) 67 | # get the memory details 68 | svmem = psutil.virtual_memory() 69 | print(f"Total : {get_size(svmem.total)}") 70 | print(f"Available : {get_size(svmem.available)}") 71 | print(f"Used : {get_size(svmem.used)}") 72 | 73 | 74 | print("="*40, "Software Info", "="*40) 75 | print('Python : ' + sys.version.split('\n')[0]) 76 | print('Numpy : ' + np.__version__) 77 | print('Pandas : ' + pd.__version__) 78 | print('PyTorch : ' + torch.__version__) 79 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 80 | 81 | if device.type == 'cuda': 82 | print("="*40, "GPU Info", "="*40) 83 | print(f'Device : {device}') 84 | print(torch.cuda.get_device_name(0)) 85 | print(f"{'Mem total': <15}: {round(torch.cuda.get_device_properties(0).total_memory/1024**3,1)} GB") 86 | print(f"{'Mem allocated': <15}: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB") 87 | print(f"{'Mem cached': <15}: {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB") 88 | 89 | print("="*30, "system info print done", "="*30) 90 | 91 | def get_seed(s): 92 | rd.seed(s) 93 | os.environ['PYTHONHASHSEED'] = str(s) 94 | np.random.seed(s) 95 | pd.core.common.random_state(s) 96 | # Torch 97 | torch.manual_seed(s) 98 | torch.cuda.manual_seed(s) 99 | torch.backends.cudnn.deterministic = True 100 | torch.backends.cudnn.benchmark = False 101 | if torch.cuda.is_available(): 102 | torch.cuda.manual_seed_all(s) 103 | 104 | @contextmanager 105 | def simple_timer(title): 106 | t0 = time() 107 | yield 108 | print("{} - done in {:.1f} seconds.\n".format(title, time() - t0)) 109 | 110 | class Colors: 111 | """Defining Color Codes to color the text displayed on terminal. 112 | """ 113 | 114 | blue = "\033[94m" 115 | green = "\033[92m" 116 | yellow = "\033[93m" 117 | magenta = "\033[95m" 118 | red = "\033[91m" 119 | end = "\033[0m" 120 | 121 | def color(string: str, color: Colors = Colors.yellow) -> str: 122 | return f"{color}{string}{Colors.end}" 123 | 124 | @contextmanager 125 | def timer(label: str, compact=False) -> None: 126 | ''' 127 | https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/203020#1111022 128 | print 129 | 1. the time the code block takes to run 130 | 2. the memory usage. 131 | ''' 132 | p = psutil.Process(os.getpid()) 133 | m0 = p.memory_info()[0] / 2. ** 30 134 | start = time() # Setup - __enter__ 135 | if not compact: 136 | print(color(f"{label}: start at {start:.2f};", color=Colors.blue)) 137 | print(color(f"LOCAL RAM USAGE AT START: {m0:.2f} GB" , color=Colors.green)) 138 | try: 139 | yield # yield to body of `with` statement 140 | finally: # Teardown - __exit__ 141 | m1 = p.memory_info()[0] / 2. ** 30 142 | delta = m1 - m0 143 | sign = '+' if delta >= 0 else '-' 144 | delta = math.fabs(delta) 145 | end = time() 146 | print(color(f"{label}: done at {end:.2f} ({end - start:.6f} secs elapsed);", color=Colors.blue)) 147 | print(color(f"LOCAL RAM USAGE AT END: {m1:.2f}GB ({sign}{delta:.2f}GB)", color=Colors.green)) 148 | print('\n') 149 | else: 150 | yield 151 | print(color(f"{label} - done in {time() - start:.6f} seconds. 
\n", color=Colors.blue)) 152 | 153 | 154 | def get_memory(num_var=10): 155 | for name, size in sorted(((name, sys.getsizeof(value)) for name, value in globals().items()), key= lambda x: -x[1])[:num_var]: 156 | print(color(f"{name:>30}:", color=Colors.green), 157 | color(f"{get_size(size):>8}", color=Colors.magenta)) 158 | 159 | def find_files(name, path): 160 | result = [] 161 | for root, dirs, files in os.walk(path): 162 | for _file in files: 163 | if name in _file: 164 | result.append(os.path.join(root, _file)) 165 | return result 166 | 167 | def print_file_size(files): 168 | for file in files: 169 | size=get_file_size(file) 170 | filename = file.split('/')[-1] 171 | filesize = get_file_size(file) 172 | print(color(f"{filename:>30}:", color=Colors.green), 173 | color(f"{filesize:>8}", color=Colors.magenta)) 174 | 175 | @contextmanager 176 | def trace(title: str): 177 | t0 = time() 178 | p = psutil.Process(os.getpid()) 179 | m0 = p.memory_info()[0] / 2. ** 30 180 | yield 181 | m1 = p.memory_info()[0] / 2. ** 30 182 | delta = m1 - m0 183 | sign = '+' if delta >= 0 else '-' 184 | delta = math.fabs(delta) 185 | print(f"[{m1:.1f}GB ({sign}{delta:.3f}GB): {time() - t0:.2f}sec] {title} ", file=sys.stderr) 186 | 187 | def get_cmap(n, cmap='hsv'): 188 | '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 189 | RGB color; the keyword argument name must be a standard mpl colormap name.''' 190 | return plt.cm.get_cmap(cmap, n) 191 | 192 | def get_date(): 193 | today = date.today() 194 | return today.strftime("%b-%d-%Y") 195 | 196 | def roc_auc_compute_fn(y_targets, y_preds): 197 | ''' 198 | roc_auc func for torch tensors 199 | ''' 200 | y_true = y_targets.cpu().numpy() 201 | y_pred = y_preds.cpu().numpy() 202 | return roc_auc_score(y_true, y_pred) 203 | 204 | def argmax(lst): 205 | return lst.index(max(lst)) 206 | 207 | def get_num_params(model): 208 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 209 | params = sum([np.prod(p.size()) for p in model_parameters]) 210 | return params 211 | 212 | def reduce_mem_usage(df, verbose=True): 213 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 214 | start_mem = df.memory_usage().sum() / 1024**2 215 | for col in df.columns: 216 | col_type = df[col].dtypes 217 | if col_type in numerics: 218 | c_min = df[col].min() 219 | c_max = df[col].max() 220 | if str(col_type)[:3] == 'int': 221 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 222 | df[col] = df[col].astype(np.int8) 223 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 224 | df[col] = df[col].astype(np.int16) 225 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 226 | df[col] = df[col].astype(np.int32) 227 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 228 | df[col] = df[col].astype(np.int64) 229 | else: 230 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 231 | df[col] = df[col].astype(np.float16) 232 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 233 | df[col] = df[col].astype(np.float32) 234 | else: 235 | df[col] = df[col].astype(np.float64) 236 | end_mem = df.memory_usage().sum() / 1024**2 237 | if verbose: print(f'Mem. 
usage decreased to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)') 238 | return df 239 | 240 | def save_pickle(var, save_path): 241 | with open(save_path, 'wb') as f: 242 | pickle.dump(var, f) 243 | 244 | def load_pickle(load_path): 245 | with open(load_path, 'rb') as f: 246 | u = pickle.load(f) 247 | return u 248 | 249 | 250 | if __name__ == "__main__": 251 | get_system() 252 | get_memory() -------------------------------------------------------------------------------- /mlp/debug_embedding_1.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import os 3 | import sys 4 | 5 | import pandas as pd 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | 12 | current_path = os.path.dirname(os.path.abspath(__file__)) 13 | HOME = os.path.dirname(current_path) 14 | sys.path.append(HOME) 15 | for f in ['/home/scao/anaconda3/lib/python3.8/lib-dynload', 16 | '/home/scao/anaconda3/lib/python3.8/site-packages']: 17 | sys.path.append(f) 18 | 19 | from torchsummary import summary 20 | from utils import * 21 | from utils_js import * 22 | 23 | from mlp import * 24 | pd.set_option('display.max_rows', 100) 25 | pd.set_option('display.max_columns', 100) 26 | 27 | # %% 28 | BATCH_SIZE = 8192 29 | FINETUNE_BATCH_SIZE = 4096_00 30 | 31 | LEARNING_RATE = 1e-4 32 | WEIGHT_DECAY = 1e-5 33 | EPOCHS = 100 34 | EARLYSTOP_NUM = 20 35 | SAVE_THRESH = 3240 36 | 37 | ALPHA = 0.6 38 | 39 | _fold = 0 40 | SEED = 802 41 | get_seed(SEED+SEED*_fold) 42 | 43 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 44 | # %% 45 | with timer("Preprocessing train"): 46 | # train_parquet = os.path.join(DATA_DIR, 'train.parquet') 47 | # train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 48 | train_parquet = os.path.join(DATA_DIR, 'train_final.parquet') 49 | train = pd.read_parquet(train_parquet) 50 | # %% 51 | # feat_reg_index = [0, 17, 18, 37, 39, 40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 57, 58] 52 | # feat_reg_index += list(range(60,69)) 53 | # feat_reg_index += [89, 101, 108, 113, 119, 120, 121, 122, 124, 125, 126, 128] 54 | # feat_spike_index_temp = list(set(range(130)).difference(feat_reg_index)) 55 | # features_reg = [f'feature_{i}' for i in feat_reg_index] 56 | # features_spike = [f'feature_{i}' for i in feat_spike_index_temp] 57 | 58 | 59 | # %% 60 | # feat_spike_index = [eval(s) for s in feat_spike_index] 61 | # for f in feat_spike_index: 62 | # print(f'{f},', end=' ') 63 | # %% 64 | feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 65 | 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 66 | feat_reg_index = list(set(range(130)).difference(feat_spike_index)) 67 | features_reg = [f'feature_{i}' for i in feat_reg_index] 68 | features_spike = [f'feature_{i}' for i in feat_spike_index] 69 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4', ] 70 | 71 | feat_cols = [f'feature_{i}' for i in range(130)] 72 | # feat_cols = features_reg 73 | feat_cols += [f+'_c' for f in features_spike] 74 | print(f"Number of features: {len(feat_cols)}") 75 | # %% 76 | 77 | feat_spike_index = [] 78 | most_common_vals = [] 79 | most_common_vals = np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 80 | 81 | for i, feat in tqdm(enumerate(features_spike)): 82 | # sorted_counts = train[feat].value_counts().sort_values(ascending=False) 83 | # print(sorted_counts.head(5), '\n\n') 84 | # if 
sorted_counts.iloc[0]/sorted_counts.iloc[1] > 30 and sorted_counts.iloc[0] > 5000: 85 | # feat_spike_index.append(sorted_counts.name.split('_')[-1]) 86 | # most_common_val = sorted_counts.index[0] 87 | # most_common_vals.append(most_common_val) 88 | train[feat+'_c'] = train[feat] - most_common_vals[i] 89 | 90 | # %% 91 | train = train.query('date not in [2, 36, 270, 294]').reset_index(drop=True) 92 | train = train.query('date > 85').reset_index(drop=True) 93 | 94 | train = train[train['weight'] != 0].reset_index(drop=True) 95 | train['action'] = (train['resp'] > 0).astype('int') 96 | 97 | for c in range(1, 5): 98 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype('int') 99 | 100 | # fold_0 470, 475 101 | # fold_1 450, 455 102 | valid = train.loc[train.date >= 475].reset_index(drop=True) 103 | train = train.loc[train.date <= 470].reset_index(drop=True) 104 | # %% 105 | ''' 106 | simpler model, not very promising 107 | ''' 108 | 109 | class SpikeNetC(nn.Module): 110 | def __init__(self, hidden_size=256, 111 | output_size=len(resp_cols), 112 | input_size=len(feat_cols), 113 | dropout_rate=0.2,): 114 | super(SpikeNetC, self).__init__() 115 | 116 | self.batch_norm0 = nn.BatchNorm1d(input_size) 117 | self.dropout0 = nn.Dropout(0.2) 118 | 119 | self.dense1 = nn.Linear(input_size, hidden_size) 120 | # nn.init.kaiming_normal_(self.dense1.weight.data) 121 | self.batch_norm1 = nn.BatchNorm1d(hidden_size) 122 | self.dropout1 = nn.Dropout(dropout_rate) 123 | 124 | self.dense2 = nn.Linear(hidden_size+input_size, hidden_size) 125 | # nn.init.kaiming_normal_(self.dense2.weight.data) 126 | self.batch_norm2 = nn.BatchNorm1d(hidden_size) 127 | self.dropout2 = nn.Dropout(dropout_rate) 128 | 129 | self.dense3 = nn.Linear(hidden_size+hidden_size, hidden_size) 130 | # nn.init.kaiming_normal_(self.dense3.weight.data) 131 | self.batch_norm3 = nn.BatchNorm1d(hidden_size) 132 | self.dropout3 = nn.Dropout(dropout_rate) 133 | 134 | self.dense4 = nn.Linear(hidden_size+hidden_size, output_size) 135 | # nn.init.kaiming_normal_(self.dense4.weight.data) 136 | 137 | self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True) 138 | 139 | def forward(self, x): 140 | x = self.batch_norm0(x) 141 | x = self.dropout0(x) 142 | 143 | x1 = self.dense1(x) 144 | x1 = self.batch_norm1(x1) 145 | x1 = self.LeakyReLU(x1) 146 | x1 = self.dropout1(x1) 147 | 148 | x = torch.cat([x, x1], 1) 149 | 150 | x2 = self.dense2(x) 151 | x2 = self.batch_norm2(x2) 152 | x2 = self.LeakyReLU(x2) 153 | x2 = self.dropout2(x2) 154 | 155 | x = torch.cat([x1, x2], 1) 156 | 157 | x3 = self.dense3(x) 158 | x3 = self.batch_norm3(x3) 159 | x3 = self.LeakyReLU(x3) 160 | x3 = self.dropout3(x3) 161 | 162 | x = torch.cat([x2, x3], 1) 163 | 164 | x = self.dense4(x) 165 | 166 | return x 167 | 168 | # %% 169 | 170 | train_set = ExtendedMarketDataset(train, 171 | features=feat_cols, 172 | targets=target_cols, resp=resp_cols) 173 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 174 | 175 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, 176 | targets=target_cols, resp=resp_cols) 177 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 178 | # %% 179 | util_cols = resp_cols 180 | # util_cols = ['resp'] 181 | resp_index = [resp_cols.index(r) for r in util_cols] 182 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, 183 | normalize=None, resp_index=resp_index) 184 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 185 | 186 | model = SpikeNetC() 187 | model.to(device) 
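# Annotation (hedged smoke test, not part of the original debug script): SpikeNetC
# above is a DenseNet-style MLP -- each dense layer takes the concatenation of the two
# previous activations, which is why dense2 has hidden_size + input_size inputs and
# dense3/dense4 take 2 * hidden_size. A quick shape check on random data before the summary:
_x_check = torch.randn(4, len(feat_cols), device=device)
model.eval()
with torch.no_grad():
    assert model(_x_check).shape == (4, len(resp_cols))
model.train()
del _x_check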
188 | summary(model, (len(feat_cols),)) 189 | 190 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 191 | 192 | scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 193 | steps_per_epoch=len(train_loader), 194 | epochs=EPOCHS) 195 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 196 | # T_0=10, T_mult=2, 197 | # eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 198 | 199 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 200 | 201 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 202 | 203 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, 204 | mode="max", save_threshold=SAVE_THRESH) 205 | 206 | # %% 207 | 208 | lr = [] 209 | 210 | for epoch in range(EPOCHS): 211 | 212 | # train_loss = train_epoch( 213 | # model, optimizer, scheduler, loss_fn, train_loader, device) 214 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 215 | lr.append(optimizer.param_groups[0]['lr']) 216 | 217 | if (epoch+1) % 8 == 0: 218 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 219 | regularizer, finetune_loader, device, loss_fn=loss_fn) 220 | 221 | valid_pred = valid_epoch(model, valid_loader, device) 222 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 223 | f=median_avg, threshold=0.5, target_cols=target_cols) 224 | model_file = MODEL_DIR + \ 225 | f"/emb_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 226 | early_stop(epoch, valid_auc, model, model_path=model_file, 227 | epoch_utility_score=valid_score) 228 | 229 | if early_stop.model_saved: 230 | for g in optimizer.param_groups: 231 | g['lr'] *= 0.1 232 | lr[-1] = optimizer.param_groups[0]['lr'] 233 | tqdm.write(f"\nNew learning rate: {lr[-1]:.4e}") 234 | 235 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 236 | tqdm.write( 237 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 238 | tqdm.write( 239 | f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 240 | tqdm.write( 241 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 242 | if early_stop.early_stop: 243 | print("\nEarly stopping") 244 | break 245 | # %% debug, un-necessary 246 | # sample = next(iter(train_loader)) 247 | # cat_dims = [int(train[col].nunique()) for col in cat_cols] 248 | # emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims] 249 | # emb_layers = nn.ModuleList([nn.Embedding(x, y) 250 | # for x, y in emb_dims]) 251 | # x = [emb_layer(sample['cat_features'][0,i].long()) 252 | # for i,emb_layer in enumerate(emb_layers)] 253 | # %% 254 | -------------------------------------------------------------------------------- /mlp/debug_resnet_tf.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.losses import BinaryCrossentropy 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.callbacks import EarlyStopping, Callback 7 | from tensorflow.keras.layers.experimental.preprocessing import Normalization 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | from tensorflow.keras import backend as K 11 | 12 | import numpy 
as np 13 | import pandas as pd 14 | from tqdm.auto import tqdm 15 | from random import choices 16 | 17 | current_path = os.path.dirname(os.path.abspath(__file__)) 18 | HOME = os.path.dirname(current_path) 19 | MODEL_DIR = os.path.join(HOME, 'models') 20 | DATA_DIR = os.path.join(HOME, 'data') 21 | sys.path.append(HOME) 22 | 23 | from utils import * 24 | from mlp import * 25 | 26 | # %% 27 | ''' 28 | baseline, dropped outlier days, fillna with mean, drop weight zero trades after. Using a feature split based on Carl's notebook. "Minor" features go through a linear layer block with high dropout rate first. Epoch = 50 29 | 30 | Added a util score callback for keras fit API, epoch 80, the util score is for every 50 days after day 100. This model reaches 5k util in the last 50 days in under 50 epochs, too good to be true? 31 | ''' 32 | 33 | SEED = 1127802 34 | BETA = 0.7 # 5 preds then the middle 3 35 | 36 | # split features for a ResNet feature 2 is more important 37 | features_2_list = [0, 1, 2, 3, 4, 5, 6, 15, 16, 25, 26, 35, 38 | 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 39 | 49, 50, 51, 52, 53, 54, 59, 60, 61, 62, 63, 64, 65, 40 | 66, 67, 68, 69, 70, 71, 76, 77, 82, 83, 88, 89, 94, 41 | 95, 100, 101, 106, 107, 112, 113, 118, 119, 128, 129] 42 | 43 | features_1_list = [0] + list(set(range(130)).difference(features_2_list)) 44 | 45 | features_1 = [f'feature_{i}' for i in features_1_list] 46 | 47 | features_2 = [f'feature_{i}' for i in features_2_list] 48 | 49 | # %% 50 | all_train = pd.read_parquet(DATA_DIR+'train.parquet') 51 | all_train = all_train.query('date > 85').reset_index(drop = True) 52 | all_train = all_train.query('date not in [2, 36, 270, 294]').reset_index(drop=True) 53 | 54 | all_train.fillna(all_train.mean(), inplace=True) 55 | 56 | features = [f'feature_{i}' for i in range(130)] 57 | f_mean = np.mean(all_train[features].values,axis=0) 58 | # np.save('f_mean_after_85_include_zero_weight.npy', f_mean) 59 | 60 | all_train = all_train[all_train['weight'] != 0].reset_index(drop=True) 61 | 62 | all_train = all_train.astype({feat: np.float32 for feat in features}) 63 | #%% 64 | _fold = 0 65 | split = [('date > 450','date <= 450'), 66 | ('date <= 450 and date > 400','date <= 400 or date>450'), 67 | ('date <= 400 and date > 350','date <= 350 or date>400'), 68 | ('date <= 350 and date > 300','date <= 300 or date>350'), 69 | ('date <= 300 and date > 250','date <= 250 or date>300'), 70 | ('date <= 250 and date > 200','date <= 200 or date>250'), 71 | ('date <= 200 and date > 150','date <= 150 or date>200'), 72 | ('date <= 150 and date > 100','date <= 100 or date>150'),] 73 | 74 | valid = all_train.query(split[_fold][0]).reset_index(drop = True) 75 | train = all_train.query(split[_fold][1]).reset_index(drop = True) 76 | 77 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4'] 78 | 79 | y_train = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T 80 | y_val = np.stack([(valid[c] > 0).astype('int') for c in resp_cols]).T 81 | 82 | X_train = [train.loc[:, features_1].values, 83 | train.loc[:, features_2].values] 84 | X_val = [valid.loc[:, features_1].values, 85 | valid.loc[:, features_2].values] 86 | 87 | print(len(train), len(valid)) 88 | # %% 89 | class Mish(tf.keras.layers.Layer): 90 | 91 | def __init__(self, **kwargs): 92 | super(Mish, self).__init__(**kwargs) 93 | self.supports_masking = True 94 | 95 | def call(self, inputs): 96 | return inputs * K.tanh(K.softplus(inputs)) 97 | 98 | def get_config(self): 99 | base_config = super(Mish, self).get_config() 100 
| return dict(list(base_config.items()) + list(config.items())) 101 | 102 | def compute_output_shape(self, input_shape): 103 | return input_shape 104 | 105 | def mish(x): 106 | return tf.keras.layers.Lambda(lambda x: x*K.tanh(K.softplus(x)))(x) 107 | 108 | 109 | tf.keras.utils.get_custom_objects().update({'mish': tf.keras.layers.Activation(mish)}) 110 | 111 | def create_resnet(n_features, n_features_2, n_labels, hidden_size, 112 | learning_rate=1e-3, label_smoothing = 0.005): 113 | input_1 = tf.keras.layers.Input(shape = (n_features,), name = 'Input1') 114 | input_2 = tf.keras.layers.Input(shape = (n_features_2,), name = 'Input2') 115 | 116 | head_1 = tf.keras.Sequential([ 117 | tf.keras.layers.BatchNormalization(), 118 | tf.keras.layers.Dropout(0.4), 119 | tf.keras.layers.Dense(hidden_size, activation="mish"), 120 | tf.keras.layers.BatchNormalization(), 121 | tf.keras.layers.Dropout(0.4), 122 | tf.keras.layers.Dense(hidden_size//2, activation = "mish") 123 | ],name='Head1') 124 | 125 | input_3 = head_1(input_1) 126 | input_3_concat = tf.keras.layers.Concatenate()([input_2, input_3]) 127 | 128 | head_2 = tf.keras.Sequential([ 129 | tf.keras.layers.BatchNormalization(), 130 | tf.keras.layers.Dropout(0.2), 131 | tf.keras.layers.Dense(hidden_size, "mish"), 132 | tf.keras.layers.BatchNormalization(), 133 | tf.keras.layers.Dropout(0.2), 134 | tf.keras.layers.Dense(hidden_size, "mish"), 135 | ],name='Head2') 136 | 137 | input_4 = head_2(input_3_concat) 138 | input_4_concat = tf.keras.layers.Concatenate()([input_3, input_4]) 139 | 140 | head_3 = tf.keras.Sequential([ 141 | tf.keras.layers.BatchNormalization(), 142 | tf.keras.layers.Dense(hidden_size, kernel_initializer='lecun_normal', activation='mish'), 143 | tf.keras.layers.BatchNormalization(), 144 | tf.keras.layers.Dropout(0.3), 145 | tf.keras.layers.Dense(hidden_size//2, kernel_initializer='lecun_normal', activation='mish'), 146 | tf.keras.layers.BatchNormalization(), 147 | tf.keras.layers.Dropout(0.3), 148 | tf.keras.layers.Dense(n_labels, activation="sigmoid") 149 | ],name='Head3') 150 | 151 | output = head_3(input_4_concat) 152 | 153 | 154 | model = tf.keras.models.Model(inputs = [input_1, input_2], outputs = output) 155 | model.compile(optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate), 156 | loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing), 157 | metrics=['AUC']) 158 | 159 | return model 160 | 161 | class UtilEvaluation(Callback): 162 | def __init__(self, val_df=None, interval=3, start_day=100, end_day=500, num_days=50): 163 | super(UtilEvaluation, self).__init__() 164 | 165 | self.interval = interval 166 | self.val_df = val_df 167 | self.start_day = start_day 168 | self.end_day = end_day 169 | self.num_days = num_days 170 | 171 | def on_epoch_end(self, epoch, logs={}): 172 | if (epoch+1) % self.interval == 0: 173 | print("*"*40) 174 | print(f"Epoch [{epoch+1:d}/{EPOCHS}]:") 175 | all_score = [] 176 | all_val_pred = self.val_df[['date', 'weight', 'resp']].copy() 177 | all_val_pred['action'] = 0 178 | 179 | for day in range(self.start_day, self.end_day, self.num_days): 180 | valid = self.val_df[self.val_df.date.isin(range(day, day+self.num_days))] 181 | valid = valid[valid.weight > 0] 182 | 183 | x_tt = valid.loc[:, features].values 184 | x_tt_1 = x_tt.take(features_1_list, axis=-1) 185 | x_tt_2 = x_tt.take(features_2_list, axis=-1) 186 | val_pred = self.model([x_tt_1, x_tt_2], training = False).numpy() 187 | val_pred = median_avg(val_pred) 188 | val_pred = np.where(val_pred >= 0.5, 1, 
0).astype(int) 189 | valid_score = utility_score_bincount(date=valid.date.values, 190 | weight=valid.weight.values, 191 | resp=valid.resp.values, 192 | action=val_pred) 193 | all_score.append(valid_score) 194 | all_val_pred.loc[self.val_df.date.isin(range(day, day+self.num_days)), 'action']=val_pred 195 | all_val_pred.to_csv(f'val_pred_fold_{_fold}.csv', index=False) 196 | print(f"Day {day:3d}-{day+self.num_days-1:3d} - util score: {valid_score:.2f}") 197 | 198 | print(f"Utility score mean with {self.num_days} span: {np.mean(all_score):.2f} ") 199 | print(f"Utility score std with {self.num_days} span: {np.std(all_score):.2f}") 200 | print("*"*40, '\n') 201 | 202 | #%% 203 | tf.keras.backend.clear_session() 204 | SEED = 1127 205 | tf.random.set_seed(SEED) 206 | tf_model = create_resnet(len(features_1), len(features_2), len(resp_cols), 207 | hidden_size=300, learning_rate=1e-4, label_smoothing=5e-03) 208 | util_cb = UtilEvaluation(val_df=valid, start_day=valid.date.min(), end_day=valid.date.max()) 209 | tf_model.summary() 210 | # %% 211 | EPOCHS = 50 212 | tf_model.fit(X_train, y_train, epochs=EPOCHS, batch_size=8192, 213 | validation_data=(X_val, y_val), 214 | verbose=1, 215 | callbacks=[util_cb] 216 | ) 217 | 218 | # save model 219 | tf_model.save(f'tf_res_fold_{_fold}_ep_{EPOCHS}.h5') 220 | # %% 221 | 222 | all_val_preds = [] 223 | for i in range(6): 224 | val_preds = pd.read_csv(MODEL_DIR+f'val_pred_fold_{5-i}.csv') 225 | all_val_preds.append(val_preds) 226 | #%% 227 | all_val_preds = pd.concat(all_val_preds,axis=0) 228 | all_val_preds = all_val_preds.query('date >= 249 and date <=499') 229 | valid_score = utility_score_bincount(date=all_val_preds.date.values, 230 | weight=all_val_preds.weight.values, 231 | resp=all_val_preds.resp.values, 232 | action=all_val_preds.action.values) 233 | # %% 234 | -------------------------------------------------------------------------------- /mlp/run_train_final_2_overfit.py: -------------------------------------------------------------------------------- 1 | # %% 2 | 3 | import os 4 | import sys 5 | current_path = os.path.dirname(os.path.abspath(__file__)) 6 | HOME = os.path.dirname(current_path) 7 | MODEL_DIR = os.path.join(HOME, 'models') 8 | DATA_DIR = os.path.join(HOME, 'data') 9 | sys.path.append(HOME) 10 | 11 | from utils import * 12 | from mlp import * 13 | 14 | import torch 15 | import torch.nn.functional as F 16 | import torch.nn as nn 17 | torch.backends.cudnn.deterministic = True # for bincount 18 | 19 | 20 | from torchsummary import summary 21 | # %% 22 | ''' 23 | Training script (excluding volatile days): 24 | 1. data: after day 85, excluding (2, 294, 36, 270) 25 | 2. data: the fillna is using the past day mean (after excluding the days above) 26 | 3. data: all five resps 27 | 4. 
training: finetuning using resp columns as regularizer, every 10 iterations 28 | ''' 29 | 30 | DEBUG = False 31 | LOAD_PRETRAIN = False 32 | 33 | DROP_ZERO_WEIGHT = True 34 | 35 | TRAINING_START = 0 36 | FINETUNE_BATCH_SIZE = 4096_00 37 | BATCH_SIZE = 8192 38 | EPOCHS = 60 39 | FINETUNE_EPOCHS = 2 40 | LEARNING_RATE = 1e-4 41 | WEIGHT_DECAY = 1e-5 42 | EARLYSTOP_NUM = 5 43 | NFOLDS = 1 44 | SCALING = 12 45 | THRESHOLD = 0.5 46 | 47 | DAYS_TO_DROP = list(range(86))+[270, 294] 48 | VOLATILE_DAYS = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 49 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 50 | VOLATILE_MODEL = True 51 | 52 | 53 | SEED = 802 54 | np.random.seed(SEED) 55 | pd.core.common.random_state(SEED) 56 | torch.manual_seed(SEED) 57 | torch.cuda.manual_seed(SEED) 58 | torch.backends.cudnn.deterministic = True 59 | torch.backends.cudnn.benchmark = False 60 | if torch.cuda.is_available(): 61 | torch.cuda.manual_seed_all(SEED) 62 | 63 | splits = { 64 | 'train_days': (range(0,500), range(0,466), range(0,433)), 65 | 'valid_days': (range(467, 500), range(434, 466), range(401, 433)), 66 | } 67 | 68 | fold = 2 69 | 70 | if fold == 0: 71 | SAVE_THRESH = 2000 72 | VAL_OFFSET = 70 73 | elif fold == 1: 74 | SAVE_THRESH = 1800 75 | VAL_OFFSET = 70 76 | elif fold == 2: 77 | SAVE_THRESH = 1000 78 | VAL_OFFSET = 70 79 | EPSILON = 1e-2 80 | 81 | if VOLATILE_MODEL: 82 | resp_cols = ['resp_3','resp','resp_4'] 83 | resp_cols_all = resp_cols 84 | util_cols = ['resp_3','resp','resp_4'] 85 | # util_cols =['resp_3','resp', 'resp_4'] 86 | resp_index = [resp_cols_all.index(r) for r in util_cols] 87 | target_cols = ['action_3','action', 'action_4'] 88 | else: 89 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3','resp_4'] 90 | resp_cols_all = resp_cols 91 | util_cols =['resp_1','resp_2', 'resp_3', 'resp', 'resp_4'] 92 | # util_cols =['resp_3','resp', 'resp_4'] 93 | resp_index = [resp_cols_all.index(r) for r in util_cols] 94 | target_cols = ['action', 'action_1','action_2','action_3', 'action_4'] 95 | 96 | feat_cols = [f'feature_{i}' for i in range(130)] 97 | feat_cols += ['cross_41_42_43', 'cross_1_2'] 98 | 99 | 100 | noisy_index = [3, 4, 5, 6, 8, 10, 12, 14, 16, 37, 38, 39, 40, 72, 73, 74, 75, 76, 101 | 78, 79, 80, 81, 82, 83] 102 | negative_index = [73, 75, 76, 77, 79, 81, 82] 103 | hybrid_index = [55, 56, 57, 58, 59] 104 | running_indices = sorted([0]+noisy_index+negative_index+hybrid_index) 105 | 106 | rm_500_cols = ['feature_' + str(i) + '_rm_500' for i in running_indices] 107 | 108 | #### adding the running mean 109 | # feat_cols += rm_500_cols 110 | 111 | ###### adding weight to the features ####### 112 | # feat_cols.extend(['weight']) 113 | 114 | 115 | 116 | 117 | f = median_avg 118 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 119 | 120 | # %% 121 | with timer("Preprocessing train"): 122 | # train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 123 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 124 | train = pd.read_parquet(train_parquet) 125 | 126 | # feat_add_parquet = os.path.join(DATA_DIR, 'feat_rm_500.parquet') 127 | # feat_add_df = pd.read_parquet(feat_add_parquet) 128 | 129 | # train = pd.concat([train, feat_add_df], axis=1) 130 | 131 | if not VOLATILE_MODEL: 132 | # train = train.query(f'date not in {VOLATILE_DAYS}').reset_index(drop = True) 133 | train = train.query('date > 85').reset_index(drop=True) 134 | train.fillna(train.mean(), inplace=True) 135 | 136 | train = train[train['weight'] > 0].reset_index(drop = True) 137 | 138 | # index_zero_weight = 
(train['weight']==0) 139 | # index_zero_weight = np.where(index_zero_weight)[0] 140 | # index_zero_weight = np.random.choice(index_zero_weight, size=int(0.4*len(index_zero_weight))) 141 | # train.loc[index_zero_weight, ['weight']] = train.loc[index_zero_weight, ['weight']].clip(1e-7) 142 | # # train = train[train['weight'] > 0].reset_index(drop = True) 143 | 144 | train['action'] = (train['resp'] > 0).astype(int) 145 | for c in range(1,5): 146 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 147 | 148 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 149 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5).astype(np.float32) 150 | 151 | #### concat with moving mean features 152 | 153 | valid = train.loc[train.date.isin(splits['valid_days'][fold])].reset_index(drop=True) 154 | train = train.loc[train.date.isin(splits['train_days'][fold])].reset_index(drop=True) 155 | 156 | # %% 157 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 158 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 159 | 160 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 161 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 162 | 163 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 164 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 165 | # model = ResidualMLPLite(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 166 | model.to(device) 167 | summary(model, input_size=(len(feat_cols), )) 168 | # %% 169 | regularizer = UtilityLoss(alpha=EPSILON, scaling=SCALING, normalize=None, resp_index=resp_index) 170 | 171 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 172 | 173 | optimizer = torch.optim.Adam(model.parameters(), 174 | lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY,) 175 | 176 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 177 | T_0=50, T_mult=2, 178 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 179 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8) 180 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 181 | # steps_per_epoch=len(train_loader), epochs=EPOCHS) 182 | # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=LEARNING_RATE*1e-2, 183 | # max_lr=LEARNING_RATE, step_size_up=5, 184 | # mode="triangular2") 185 | # scheduler_add = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20,39], gamma=0.1) 186 | # scheduler_add = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8) 187 | 188 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=10) 189 | 190 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 191 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=SAVE_THRESH, util_offset=VAL_OFFSET) 192 | # %% 193 | for epoch in range(EPOCHS): 194 | 195 | # train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 196 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 197 | # scheduler_add.step() 198 | lr = optimizer.param_groups[0]['lr'] 199 | if (epoch+1) % 10 == 0: 200 | _ = train_epoch_finetune(model, finetune_optimizer, 
scheduler, 201 | regularizer, finetune_loader, device, loss_fn=loss_fn) 202 | 203 | valid_pred = valid_epoch(model, valid_loader, device) 204 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 205 | f=median_avg, threshold=0.5, target_cols=target_cols) 206 | if VOLATILE_MODEL: 207 | model_file = MODEL_DIR + f"/pt_vol_overfit_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 208 | else: 209 | model_file = MODEL_DIR + f"/pt_overfit_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 210 | early_stop(epoch, valid_auc, model, 211 | model_path=model_file, 212 | epoch_utility_score=valid_score) 213 | 214 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {fold}") 215 | tqdm.write(f"Train loss: {train_loss:.4e} \t Current learning rate: {lr:.4e}") 216 | tqdm.write(f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 217 | tqdm.write(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 218 | if early_stop.early_stop: 219 | print("\nEarly stopping") 220 | break 221 | # %% 222 | 223 | feat_cols = [f'feature_{i}' for i in range(130)] 224 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 225 | 226 | 227 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, 228 | output_size=len(target_cols)) 229 | model.to(device) 230 | try: 231 | print(f"Loading {early_stop.model_path} for cv check.\n") 232 | model_weights = early_stop.model_path 233 | model.load_state_dict(torch.load(model_weights)) 234 | model.eval(); 235 | 236 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 237 | train = preprocess_final(train_parquet, drop_zero_weight=True) 238 | 239 | CV_START_DAY = 401 240 | CV_DAYS = 32 241 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 242 | batch_size =2*8192, f=median_avg, threshold=0.5, 243 | target_cols=target_cols, 244 | feat_cols=feat_cols, 245 | resp_cols=resp_cols) 246 | except: 247 | FileNotFoundError 248 | print("Model not found") 249 | # %% 250 | -------------------------------------------------------------------------------- /mlp/run_train_final_2.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from torchsummary import summary 3 | import os 4 | import sys 5 | import torch 6 | import torch.nn.functional as F 7 | import torch.nn as nn 8 | torch.backends.cudnn.deterministic = True # for bincount 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from utils import * 17 | from mlp import * 18 | # %% 19 | ''' 20 | Training script (excluding volatile days): 21 | 1. data: after day 85, excluding (2, 294, 36, 270) 22 | 2. data: the fillna is using the past day mean (after excluding the days above) 23 | 3. data: all five resps 24 | 4. 
training: finetuning using resp columns as regularizer, every 10 iterations 25 | ''' 26 | 27 | DEBUG = False 28 | LOAD_PRETRAIN = False 29 | 30 | DROP_ZERO_WEIGHT = True 31 | 32 | FINETUNE_BATCH_SIZE = 4096_00 33 | BATCH_SIZE = 8192 34 | EPOCHS = 80 35 | FINETUNE_EPOCHS = 2 36 | LEARNING_RATE = 1e-4 37 | WEIGHT_DECAY = 1e-5 38 | EARLYSTOP_NUM = 5 39 | SCALING = 12 40 | THRESHOLD = 0.5 41 | 42 | DAYS_TO_DROP = list(range(86))+[270, 294] 43 | VOLATILE_DAYS = [1, 4, 5, 12, 16, 18, 24, 37, 38, 43, 44, 45, 47, 44 | 59, 63, 80, 85, 161, 168, 452, 459, 462] 45 | VOLATILE_MODEL = True 46 | 47 | fold = 2 48 | 49 | # s = 11 for fold 1 50 | SEED = 1127802//8+fold 51 | np.random.seed(SEED) 52 | pd.core.common.random_state(SEED) 53 | torch.manual_seed(SEED) 54 | torch.cuda.manual_seed(SEED) 55 | torch.backends.cudnn.deterministic = True 56 | torch.backends.cudnn.benchmark = False 57 | if torch.cuda.is_available(): 58 | torch.cuda.manual_seed_all(SEED) 59 | 60 | splits = { 61 | 'train_days': (range(0,457), range(0,424), range(0,391)), 62 | 'valid_days': (range(467, 500), range(434, 466), range(401, 433)), 63 | } 64 | 65 | if fold == 0: 66 | SAVE_THRESH = 1000 67 | VAL_OFFSET = 150 68 | elif fold == 1: 69 | LEARNING_RATE = 1e-3 70 | SAVE_THRESH = 1100 71 | VAL_OFFSET = 150 72 | elif fold == 2: 73 | SAVE_THRESH = 100 74 | VAL_OFFSET = 100 75 | EPOCHS = 40 76 | LEARNING_RATE = 1e-3 77 | EPSILON = 1e-2 78 | 79 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3','resp_4'] 80 | resp_cols_all = resp_cols 81 | target_cols = ['action', 'action_1','action_2','action_3', 'action_4'] 82 | feat_cols = [f'feature_{i}' for i in range(130)] 83 | feat_cols += ['cross_41_42_43', 'cross_1_2'] 84 | 85 | 86 | noisy_index = [3, 4, 5, 6, 8, 10, 12, 14, 16, 37, 38, 39, 40, 72, 73, 74, 75, 76, 87 | 78, 79, 80, 81, 82, 83] 88 | negative_index = [73, 75, 76, 77, 79, 81, 82] 89 | hybrid_index = [55, 56, 57, 58, 59] 90 | running_indices = sorted([0]+noisy_index+negative_index+hybrid_index) 91 | 92 | rm_500_cols = ['feature_' + str(i) + '_rm_500' for i in running_indices] 93 | 94 | #### adding the running mean 95 | # feat_cols += rm_500_cols 96 | 97 | ###### adding weight to the features ####### 98 | # feat_cols.extend(['weight']) 99 | 100 | util_cols =['resp_1','resp_2', 'resp_3', 'resp', 'resp_4'] 101 | # util_cols =['resp_3','resp', 'resp_4'] 102 | resp_index = [resp_cols_all.index(r) for r in util_cols] 103 | 104 | 105 | f = median_avg 106 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 107 | 108 | # %% 109 | with timer("Preprocessing train"): 110 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 111 | train = pd.read_parquet(train_parquet) 112 | 113 | # feat_add_parquet = os.path.join(DATA_DIR, 'feat_rm_500.parquet') 114 | # feat_add_df = pd.read_parquet(feat_add_parquet) 115 | 116 | # train = pd.concat([train, feat_add_df], axis=1) 117 | 118 | if not VOLATILE_MODEL: 119 | train = train.query(f'date not in {VOLATILE_DAYS}').reset_index(drop = True) 120 | train = train.query('date > 85').reset_index(drop=True) 121 | 122 | train = train[train['weight'] > 0].reset_index(drop = True) 123 | 124 | train['action'] = (train['resp'] > 0).astype(int) 125 | for c in range(1,5): 126 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 127 | 128 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 129 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5).astype(np.float32) 130 | 131 | #### concat with moving mean features 132 | 133 | 
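# Annotation (not in the original script): the split below is a plain time-based
# holdout with a purge gap between the train and validation date ranges, which limits
# leakage from temporally adjacent rows. For fold=2, train_days = range(0, 391)
# (dates 0-390) and valid_days = range(401, 433) (dates 401-432), so dates 391-400 are
# dropped entirely. A quick sanity check on the configured ranges:
assert max(splits['train_days'][fold]) < min(splits['valid_days'][fold])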
valid = train.loc[train.date.isin(splits['valid_days'][fold])].reset_index(drop=True) 134 | train = train.loc[train.date.isin(splits['train_days'][fold])].reset_index(drop=True) 135 | 136 | # %% 137 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 138 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 139 | 140 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 141 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 142 | 143 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 144 | # model = MLP(hidden_units=(None,160,160,160), input_dim=len(feat_cols), output_dim=len(target_cols)) 145 | # model = ResidualMLPLite(input_size=len(feat_cols), hidden_size=256, output_size=len(target_cols)) 146 | model.to(device) 147 | summary(model, input_size=(len(feat_cols), )) 148 | # %% 149 | regularizer = UtilityLoss(alpha=5e-2, scaling=SCALING, normalize=None, resp_index=resp_index) 150 | 151 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 152 | 153 | optimizer = torch.optim.Adam(model.parameters(), 154 | lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY,) 155 | 156 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 157 | T_0=50, T_mult=2, 158 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 159 | # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8) 160 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 161 | # steps_per_epoch=len(train_loader), epochs=EPOCHS) 162 | # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=LEARNING_RATE*1e-2, 163 | # max_lr=LEARNING_RATE, step_size_up=5, 164 | # mode="triangular2") 165 | # scheduler_add = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20,39], gamma=0.1) 166 | # scheduler_add = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8) 167 | 168 | finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=10) 169 | 170 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 171 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=SAVE_THRESH, util_offset=VAL_OFFSET) 172 | # %% 173 | for epoch in range(EPOCHS): 174 | 175 | # train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 176 | train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 177 | # scheduler_add.step() 178 | lr = optimizer.param_groups[0]['lr'] 179 | if (epoch+1) % 10 == 0: 180 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 181 | regularizer, finetune_loader, device, loss_fn=loss_fn) 182 | 183 | valid_pred = valid_epoch(model, valid_loader, device) 184 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 185 | f=median_avg, threshold=0.5, target_cols=target_cols) 186 | if VOLATILE_MODEL: 187 | model_file = os.path.join(MODEL_DIR, 188 | f"pt_volatile_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth") 189 | else: 190 | model_file = os.path.join(MODEL_DIR, 191 | f"pt_{fold}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth") 192 | early_stop(epoch, valid_auc, model, 193 | model_path=model_file, 194 | epoch_utility_score=valid_score) 195 | 196 | # if early_stop.model_saved: 197 | # for g in optimizer.param_groups: 198 | # g['lr'] *= 0.1 199 | # lr = 
optimizer.param_groups[0]['lr'] 200 | # print(f"\nNew learning rate: {lr:.4e}") 201 | 202 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {fold}") 203 | tqdm.write(f"Train loss: {train_loss:.4e} \t Current learning rate: {lr:.4e}") 204 | tqdm.write(f"Best util: {early_stop.best_utility_score:.2f} at epoch {early_stop.best_epoch} \t {early_stop.message} ") 205 | tqdm.write(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 206 | if early_stop.early_stop: 207 | print("\nEarly stopping") 208 | break 209 | 210 | #%% 211 | # for epoch in range(FINETUNE_EPOCHS): 212 | # util_loss, train_loss = train_epoch_finetune(model, finetune_optimizer, scheduler, 213 | # regularizer, finetune_loader, device, loss_fn=loss_fn) 214 | 215 | # valid_pred = valid_epoch(model, valid_loader, device) 216 | # valid_auc, valid_score = get_valid_score(valid_pred, valid, 217 | # f=median_avg, threshold=0.5, target_cols=target_cols) 218 | 219 | # print(f"\n[Finetune epoch {epoch+1}/{FINETUNE_EPOCHS}] \t Fold {_fold}") 220 | # print(f"Train loss: {train_loss:.4e} \t Util loss: {util_loss:.2f}") 221 | # print(f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 222 | 223 | # if DEBUG: 224 | # torch.save(model.state_dict(), MODEL_DIR + f"/model_{_fold}.pth") 225 | # %% 226 | 227 | 228 | feat_cols = [f'feature_{i}' for i in range(130)] 229 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 230 | 231 | 232 | model = ResidualMLP(input_size=len(feat_cols), hidden_size=256, 233 | output_size=len(target_cols)) 234 | model.to(device) 235 | try: 236 | print(f"Loading {early_stop.model_path} for cv check.\n") 237 | model_weights = early_stop.model_path 238 | # model_weights = os.path.join(MODEL_DIR, 'final_1_util_865_auc_0.5450.pth') 239 | model.load_state_dict(torch.load(model_weights)) 240 | model.eval(); 241 | 242 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 243 | train = preprocess_final(train_parquet, drop_zero_weight=True) 244 | 245 | CV_START_DAY = 401 246 | CV_DAYS = 32 247 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 248 | batch_size =2*8192, f=median_avg, threshold=0.5, 249 | target_cols=target_cols, 250 | feat_cols=feat_cols, 251 | resp_cols=resp_cols) 252 | except: 253 | FileNotFoundError 254 | print("Model not found") 255 | 256 | # %% 257 | -------------------------------------------------------------------------------- /mlp/debug_embedding_tag.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os, sys 3 | import pandas as pd 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchsummary import summary 8 | # from fastai.tabular.all import TabularPandas, RandomSplitter, CategoryBlock, MultiCategoryBlock, range_of, accuracy, tabular_learner, TabularDataLoaders 9 | 10 | current_path = os.path.dirname(os.path.abspath(__file__)) 11 | HOME = os.path.dirname(current_path) 12 | MODEL_DIR = os.path.join(HOME, 'models') 13 | DATA_DIR = os.path.join(HOME, 'data') 14 | sys.path.append(HOME) 15 | 16 | from mlp import * 17 | from utils import * 18 | from utils_js import * 19 | #%% 20 | 21 | HIDDEN_LAYERS = [400, 400, 400] # hidden layer size for the embedding model 22 | N_FEAT_TAGS = 29 23 | N_TARGETS = 6 24 | 25 | BATCH_SIZE = 8196 26 | EARLYSTOP_NUM = 5 27 | FINETUNE_BATCH_SIZE = 51200 28 | 29 | EPOCHS = 100 30 | 31 | N_DENOISED_TARGET = 1 32 | LEARNING_RATE = 1e-4 33 | WEIGHT_DECAY = 1e-4 34 | 35 | N_FEATURES = 130 36 | N_FEAT_TAGS = 29 37 | 38 | device = torch.device('cuda' if 
torch.cuda.is_available() else 'cpu') 39 | 40 | dtype = { 41 | 'feature' : 'str', 42 | 'tag_0' : 'int8' 43 | } 44 | for i in range (1, 29): 45 | k = 'tag_' + str (i) 46 | dtype[k] = 'int8' 47 | 48 | features_df = pd.read_csv (os.path.join(DATA_DIR, 'features.csv'), usecols=range(1,30), dtype=dtype) 49 | # N_FEATURES = features_df.shape[0] # the features.csv has 130 features (1st row) = no of features in train.csv (feature_0 to feature_129) 50 | # N_FEAT_TAGS = features_df.shape[1] # the features.csv has 29 tags 51 | 52 | resp_cols = ['resp_1', 'resp_2', 'resp_3','resp_4', 'resp'] 53 | feat_cols = [f'feature_{i}' for i in range(130)] 54 | resp_cols = ['resp', 'resp_dn_0', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 55 | target_cols = ['action', 'action_dn_0', 'action_1', 'action_2', 'action_3', 'action_4'] 56 | # %% 57 | with timer("Preprocessing train"): 58 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 59 | train = pd.read_parquet(train_parquet) 60 | train = train.query ('date > 85').reset_index (drop = True) 61 | # df = df[df['weight'] != 0].reset_index (drop = True) 62 | 63 | train.fillna(train.mean(),inplace=True) 64 | train = add_denoised_target(train, num_dn_target=N_DENOISED_TARGET) 65 | y = np.stack ([(train[c] > 0).astype ('int') for c in resp_cols]).T 66 | # train.drop (columns=['weight', 'date', 'ts_id']+resp_cols, inplace=True) 67 | train['action'] = (train['resp'] > 0).astype('int') 68 | for c in range(1,5): 69 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype('int') 70 | valid = train.loc[train.date > 450].reset_index(drop=True) 71 | # %% 72 | class FeatureFFN (nn.Module): 73 | 74 | def __init__(self, inputCount=130, 75 | outputCount=5, 76 | hiddenLayerCounts=[150, 150, 150], 77 | drop_prob=0.2, 78 | activation=nn.SiLU() # this is swish activation 79 | ): 80 | ''' 81 | Feature generation embedding net, no output 82 | ''' 83 | super(FeatureFFN, self).__init__() 84 | 85 | self.activation = activation 86 | self.dropout = nn.Dropout (drop_prob) 87 | self.batchnorm0 = nn.BatchNorm1d (inputCount) 88 | self.dense1 = nn.Linear (inputCount, hiddenLayerCounts[0]) 89 | self.batchnorm1 = nn.BatchNorm1d (hiddenLayerCounts[0]) 90 | self.dense2 = nn.Linear(hiddenLayerCounts[0], hiddenLayerCounts[1]) 91 | self.batchnorm2 = nn.BatchNorm1d (hiddenLayerCounts[1]) 92 | self.dense3 = nn.Linear(hiddenLayerCounts[1], hiddenLayerCounts[2]) 93 | self.batchnorm3 = nn.BatchNorm1d (hiddenLayerCounts[2]) 94 | self.outDense = None 95 | if outputCount > 0: 96 | self.outDense = nn.Linear(hiddenLayerCounts[-1], outputCount) 97 | 98 | def forward (self, x): 99 | 100 | # x = self.dropout (self.batchnorm0 (x)) 101 | x = self.batchnorm0(x) 102 | x = self.dropout (self.activation (self.batchnorm1 (self.dense1 (x)))) 103 | x = self.dropout (self.activation (self.batchnorm2 (self.dense2 (x)))) 104 | x = self.dropout (self.activation (self.batchnorm3 (self.dense3 (x)))) 105 | # x = self.outDense (x) 106 | return x 107 | # %% 108 | class EmbedFNN (nn.Module): 109 | 110 | def __init__(self, hidden_layers=HIDDEN_LAYERS, 111 | embed_dim=N_FEAT_TAGS, 112 | features_tag_matrix=features_df): 113 | 114 | super(EmbedFNN, self).__init__() 115 | global N_FEAT_TAGS 116 | N_FEAT_TAGS = 29 117 | 118 | # store the features to tags mapping as a datframe tdf, feature_i mapping is in tdf[i, :] 119 | # dtype = {'tag_0' : 'int8'} 120 | # for i in range (1, 29): 121 | # k = 'tag_' + str (i) 122 | # dtype[k] = 'int8' 123 | # t_df = pd.read_csv ('features.csv', usecols=range (1,N_FEAT_TAGS+1), dtype=dtype) 124 | # 
tag_29 is for feature_0 125 | features_tag_matrix['tag_29'] = np.array ([1] + ([0]*(N_FEATURES-1)) ).astype ('int8') 126 | self.features_tag_matrix = torch.tensor(features_tag_matrix.values, dtype=torch.float32) 127 | # torch.tensor(t_df.to_numpy()) 128 | N_FEAT_TAGS += 1 129 | 130 | 131 | # embeddings for the tags. Each feature is taken a an embedding which is an avg. of its' tag embeddings 132 | self.embed_dim = embed_dim 133 | self.tag_embedding = nn.Embedding(N_FEAT_TAGS+1, embed_dim) # create a special tag if not known tag for any feature 134 | self.tag_weights = nn.Linear(N_FEAT_TAGS, 1) 135 | 136 | drop_prob = 0.5 137 | self.ffn = FeatureFFN(inputCount=(N_FEATURES+embed_dim), 138 | outputCount=0, 139 | hiddenLayerCounts=[(hidden_layers[0]+embed_dim), 140 | (hidden_layers[1]+embed_dim), 141 | (hidden_layers[2]+embed_dim)], 142 | drop_prob=drop_prob) 143 | self.outDense = nn.Linear (hidden_layers[2]+embed_dim, N_TARGETS) 144 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 145 | return 146 | 147 | def features2emb (self): 148 | """ 149 | idx : int feature index 0 to N_FEATURES-1 (129) 150 | """ 151 | 152 | all_tag_idxs = torch.LongTensor(np.arange(N_FEAT_TAGS)) # (29,) 153 | tag_bools = self.features_tag_matrix.to(self.device) # (130, 29) 154 | # print ('tag_bools.shape =', tag_bools.size()) 155 | all_tag_idxs = all_tag_idxs.to(self.device) 156 | f_emb = self.tag_embedding(all_tag_idxs).repeat(N_FEATURES, 1, 1) 157 | #;print ('1. f_emb =', f_emb) # (29, 7) * (130, 1, 1) = (130, 29, 7) 158 | # print ('f_emb.shape =', f_emb.size()) 159 | f_emb = f_emb * tag_bools[:, :, None] 160 | #;print ('2. f_emb =', f_emb) # (130, 29, 7) * (130, 29, 1) = (130, 29, 7) 161 | # print ('f_emb.shape =', f_emb.size()) 162 | 163 | # Take avg. of all the present tag's embeddings to get the embedding for a feature 164 | s = torch.sum (tag_bools, dim=1) # (130,) 165 | f_emb = torch.sum (f_emb, dim=-2) / s[:, None] 166 | # (130, 7) 167 | # print ('f_emb =', f_emb) 168 | # print ('f_emb.shape =', f_emb.shape) 169 | 170 | # take a linear combination of the present tag's embeddings 171 | # f_emb = f_emb.permute (0, 2, 1) # (130, 7, 29) 172 | # f_emb = self.tag_weights (f_emb) 173 | # #;print ('3. f_emb =', f_emb) # (130, 7, 1) 174 | # f_emb = torch.squeeze (f_emb, dim=-1) 175 | # #;print ('4. 
f_emb =', f_emb) # (130, 7) 176 | return f_emb.detach().to(self.device) 177 | 178 | def forward (self, features, cat_featrs=None): 179 | """ 180 | when you call `model (x ,y, z, ...)` then this method is invoked 181 | """ 182 | 183 | # cat_featrs = None 184 | features = features.view (-1, N_FEATURES) 185 | f_emb = self.features2emb() 186 | features_2 = torch.matmul (features, f_emb) 187 | 188 | # Concatenate the two features (features + their embeddings) 189 | features = torch.hstack ((features, features_2)) 190 | 191 | x = self.ffn(features) 192 | out = self.outDense(x) 193 | return out 194 | 195 | # %% 196 | train_set = ExtendedMarketDataset(train, features=feat_cols, targets=target_cols, resp=resp_cols) 197 | train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 198 | 199 | valid_set = ExtendedMarketDataset(valid, features=feat_cols, targets=target_cols, resp=resp_cols) 200 | valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 201 | 202 | # %% 203 | util_cols = resp_cols 204 | resp_index = [resp_cols.index(r) for r in util_cols] 205 | 206 | regularizer = UtilityLoss(alpha=5e-2, scaling=12, normalize=None, resp_index=resp_index) 207 | 208 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 209 | 210 | model = EmbedFNN() 211 | # model.to(device); 212 | # optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 213 | optimizer = RAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 214 | 215 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LEARNING_RATE, 216 | # steps_per_epoch=len(train_loader), 217 | # epochs=EPOCHS) 218 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 219 | T_0=10, T_mult=1, 220 | eta_min=LEARNING_RATE*1e-3, last_epoch=-1) 221 | 222 | finetune_loader = DataLoader( 223 | train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 224 | 225 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 226 | 227 | early_stop = EarlyStopping(patience=EARLYSTOP_NUM, mode="max", save_threshold=5900) 228 | 229 | # %% 230 | _fold = 7 231 | SEED = 802 232 | get_seed(SEED+SEED*_fold) 233 | lr = [] 234 | 235 | for epoch in range(EPOCHS): 236 | 237 | train_loss = train_epoch(model, optimizer, scheduler,loss_fn, train_loader, device) 238 | # train_loss = train_epoch_weighted(model, optimizer, scheduler, loss_fn, train_loader, device) 239 | lr.append(optimizer.param_groups[0]['lr']) 240 | if (epoch+1) % 10 == 0: 241 | _ = train_epoch_finetune(model, finetune_optimizer, scheduler, 242 | regularizer, finetune_loader, device, loss_fn=loss_fn) 243 | 244 | valid_pred = valid_epoch(model, valid_loader, device) 245 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 246 | f=median_avg, threshold=0.5, target_cols=target_cols) 247 | model_file = MODEL_DIR + \ 248 | f"/emb_fold_{_fold}_ep_{epoch}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 249 | early_stop(valid_auc, model, model_path=model_file, 250 | epoch_utility_score=valid_score) 251 | tqdm.write(f"\n[Epoch {epoch+1}/{EPOCHS}] \t Fold {_fold}") 252 | tqdm.write( 253 | f"Train loss: {train_loss:.4f} \t Current learning rate: {lr[-1]:.4e}") 254 | tqdm.write( 255 | f"Best util: {early_stop.best_utility_score:.2f} \t {early_stop.message} ") 256 | tqdm.write( 257 | f"Valid utility: {valid_score:.2f} \t Valid AUC: {valid_auc:.4f}\n") 258 | if early_stop.early_stop: 259 | print("\nEarly stopping") 260 | break 261 | # %% 262 | 
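# A minimal, self-contained sketch of the tag-embedding averaging that EmbedFNN.features2emb
# performs above, using toy sizes instead of the real 130 features / 30 tags. The names
# (toy_tag_matrix, batch) and the values are illustrative assumptions only, not part of the
# training pipeline.
import torch
import torch.nn as nn

N_FEAT, N_TAG, EMB_DIM = 4, 3, 5
toy_tag_matrix = torch.tensor([[1., 0., 1.],   # feature 0 carries tags 0 and 2
                               [0., 1., 0.],   # feature 1 carries tag 1
                               [1., 1., 0.],   # feature 2 carries tags 0 and 1
                               [0., 0., 1.]])  # feature 3 carries tag 2
tag_embedding = nn.Embedding(N_TAG, EMB_DIM)
emb = tag_embedding(torch.arange(N_TAG)).repeat(N_FEAT, 1, 1)   # (N_FEAT, N_TAG, EMB_DIM)
emb = emb * toy_tag_matrix[:, :, None]                          # zero out absent tags
f_emb = emb.sum(dim=-2) / toy_tag_matrix.sum(dim=1)[:, None]    # average the present tags
batch = torch.randn(2, N_FEAT)                                  # two toy rows of "features"
print(f_emb.shape, (batch @ f_emb).shape)                       # (4, 5) and (2, 5), as in forward()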
-------------------------------------------------------------------------------- /mlp/debug_train_utility_finetune.py: -------------------------------------------------------------------------------- 1 | # %% 2 | 3 | from torchsummary import summary 4 | import os 5 | import sys 6 | import torch 7 | import torch.nn.functional as F 8 | import torch.nn as nn 9 | torch.backends.cudnn.deterministic = True # for bincount 10 | 11 | 12 | current_path = os.path.dirname(os.path.abspath(__file__)) 13 | HOME = os.path.dirname(current_path) 14 | MODEL_DIR = os.path.join(HOME, 'models') 15 | DATA_DIR = os.path.join(HOME, 'data') 16 | sys.path.append(HOME) 17 | 18 | from mlp import * 19 | from utils_js import * 20 | from utils import * 21 | # %% 22 | 23 | ''' 24 | Training script finetuning using a utility regularizer 25 | ''' 26 | 27 | DEBUG = True 28 | FINETUNE = True 29 | BATCH_SIZE = 4096 30 | 31 | FINETUNE_BATCH_SIZE = 1024_00 32 | EPOCHS = 50 33 | FINETUNE_EPOCHS = 20 34 | LEARNING_RATE = 1e-3 35 | WEIGHT_DECAY = 1e-5 36 | EARLYSTOP_NUM = 10 37 | NFOLDS = 1 38 | SCALING = 10 39 | THRESHOLD = 0.5 40 | SEED = 802 41 | get_seed(SEED) 42 | 43 | # f = np.median 44 | # f = np.mean 45 | f = median_avg 46 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 47 | 48 | # %% 49 | with timer("Preprocessing train"): 50 | train_parquet = os.path.join(DATA_DIR, 'train.parquet') 51 | train, valid = preprocess_pt(train_parquet, drop_weight=True) 52 | 53 | for c in range(1, 5): 54 | print(f'action based on resp_{c} mean: ', ' '*10, 55 | train['action_'+str(c)].astype(int).mean()) 56 | 57 | resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 58 | resp_cols_all = resp_cols 59 | target_cols = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4'] 60 | feat_cols = [f'feature_{i}' for i in range(130)] 61 | # f_mean = np.mean(train[feat_cols[1:]].values, axis=0) 62 | feat_cols.extend(['cross_41_42_43', 'cross_1_2']) 63 | # %% 64 | train_set = ExtendedMarketDataset( 65 | train, features=feat_cols, targets=target_cols, resp=resp_cols) 66 | train_loader = DataLoader( 67 | train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=8) 68 | 69 | valid_set = ExtendedMarketDataset( 70 | valid, features=feat_cols, targets=target_cols, resp=resp_cols) 71 | valid_loader = DataLoader( 72 | valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) 73 | 74 | # sanity check 75 | # item = next(iter(train_loader)) 76 | # print(item) 77 | # %% 78 | model = ResidualMLP(output_size=len(target_cols)) 79 | model.to(device) 80 | summary(model, input_size=(len(feat_cols), )) 81 | 82 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 83 | # optimizer = Lookahead(optimizer=optimizer, k=10, alpha=0.5) 84 | scheduler = None 85 | # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 86 | # max_lr=1e-2, epochs=EPOCHS, 87 | # steps_per_epoch=len(train_loader)) 88 | loss_fn = SmoothBCEwLogits(smoothing=0.005) 89 | 90 | es = EarlyStopping(patience=EARLYSTOP_NUM, mode="max") 91 | 92 | # %% 93 | 94 | 95 | class UtilityLoss(nn.Module): 96 | def __init__(self, weight=None, alpha=None, scaling=None, normalize='mean', resp_index=None): 97 | super(UtilityLoss, self).__init__() 98 | self.alpha = alpha if normalize == 'mean' else alpha * \ 99 | 1e-3 # the strength of this regularization 100 | self.normalize = normalize 101 | self.scaling = scaling 102 | self.weight = weight 103 | self.resp_index = resp_index 104 | self.device = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') 105 | 106 | def forward(self, inputs, targets, weights=None, date=None): 107 | ''' 108 | inputs: prediction of the model (without sigmoid, processed with a scaling) 109 | targets: resp columns 110 | negative of the utility for minimization 111 | ''' 112 | if (self.resp_index is not None) and (len(self.resp_index) < 5): 113 | inputs = inputs[..., self.resp_index] 114 | targets = targets[..., self.resp_index] 115 | 116 | inputs = F.sigmoid(self.scaling*inputs) 117 | n_targets = inputs.size(-1) 118 | if n_targets > 1: 119 | weights = weights.repeat((n_targets, 1)) 120 | date = date.repeat((n_targets, 1)) 121 | 122 | # flatten label and prediction tensors 123 | inputs = inputs.view(-1) 124 | targets = targets.view(-1) 125 | weights = weights.view(-1) 126 | date = date.view(-1) 127 | 128 | dates = date.unique().detach() 129 | ndays = len(dates) 130 | 131 | Pi = torch.zeros((ndays, 1), device=self.device, dtype=torch.float32) 132 | for i, day in enumerate(dates): 133 | mask = (date == day) 134 | Pi[i] = (weights[mask]*targets[mask]*inputs[mask]).sum() 135 | 136 | # a single day 137 | # DEBUG notes: bincount is not differentiable for autograd 138 | # Pi = torch.bincount(date, weight * targets * inputs) 139 | # loss = Pi.sum()*(Pi.sum().clamp(min=0))/(Pi.square().sum()) 140 | # loss = (Pi.sum()).square()/(Pi.square().sum()) 141 | 142 | sumPi = Pi.sum() 143 | if self.normalize == 'mean': 144 | loss = -self.alpha*sumPi * \ 145 | (sumPi.clamp(min=0))/(Pi.square().sum())/ndays 146 | else: 147 | loss = -self.alpha*sumPi*(sumPi.clamp(min=0))/ndays 148 | 149 | return loss 150 | 151 | 152 | # %% 153 | _fold = 0 154 | if FINETUNE: 155 | model_weights = os.path.join(MODEL_DIR, f"resmlp_{_fold}.pth") 156 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_ft_old_fold_{_fold}.pth") 157 | # model_weights = os.path.join(MODEL_DIR, f"resmlp_finetune_fold_{_fold}.pth") 158 | try: 159 | model.load_state_dict(torch.load(model_weights)) 160 | except: 161 | model.load_state_dict(torch.load( 162 | model_weights, map_location=torch.device('cpu'))) 163 | model.eval() 164 | valid_pred = valid_epoch(model, valid_loader, device) 165 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 166 | f=median_avg, threshold=0.5, target_cols=target_cols) 167 | 168 | print(f"valid_utility:{valid_score:.2f} \t valid_auc:{valid_auc:.4f}") 169 | # %% 170 | ''' 171 | fine-tuning the trained model utility score 172 | max batch_size: 173 | 3 resps: 409600 174 | 5 resps: 204800 175 | 176 | current best setting: 177 | fold 0, batch_size = 409600, lr *= 1e-3, alpha=5e-2, 1 epoch with loss 178 | fold 1, batch_size = 102400, lr *= 1e-3, 2 epochs 179 | fold 2, batch_size = 102400, lr *= 1e-2, 2 epochs 180 | fold 3, batch_size = 409600, lr *= 1e-3, alpha=1e-1, 1 epoch without loss 181 | fold 4, batch_size = 12800, lr *= 1e-2, alpha=1, 1 epoch without loss 182 | to-do: using the least square loss to model w_{ij} res[ij] 183 | ''' 184 | get_seed(1127) 185 | # resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'] 186 | 187 | # resp_cols = ['resp', 'resp_1', 'resp_2'] 188 | resp_cols = ['resp', 'resp_4'] 189 | resp_index = [resp_cols_all.index(r) for r in resp_cols] # resp_1, resp_2 190 | 191 | regularizer = UtilityLoss(alpha=1e-1, scaling=12, normalize=None, resp_index=resp_index) 192 | finetune_loader = DataLoader( 193 | train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 194 | train_loader = DataLoader(train_set, batch_size=400_000, 195 | shuffle=True, 
num_workers=8) 196 | finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 197 | 198 | # %% 199 | FINETUNE_EPOCHS = 1 200 | for epoch in range(FINETUNE_EPOCHS): 201 | tqdm.write(f"\nFine tuning epoch {epoch+1} for model {_fold}") 202 | # train_loss = train_epoch(model, finetune_optimizer, scheduler, 203 | # loss_fn, train_loader, device) 204 | _ = train_epoch_utility(model, finetune_optimizer, scheduler, 205 | regularizer, finetune_loader, device, loss_fn=loss_fn) 206 | valid_pred = valid_epoch(model, valid_loader, device) 207 | valid_auc, valid_score = get_valid_score(valid_pred, valid, 208 | f=median_avg, threshold=0.5, target_cols=target_cols) 209 | 210 | tqdm.write(f"\nval_utility:{valid_score:.2f} valid_auc:{valid_auc:.4f}") 211 | # %% 212 | # regularizer = UtilityLoss(alpha=1e-4, scaling=12) 213 | 214 | # finetune_loader = DataLoader(train_set, batch_size=FINETUNE_BATCH_SIZE, shuffle=True, num_workers=8) 215 | # finetune_optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*1e-3) 216 | 217 | 218 | # for epoch in range(EPOCHS): 219 | 220 | # start_time = time() 221 | # train_loss = train_epoch(model, optimizer, scheduler, loss_fn, train_loader, device) 222 | 223 | # train_loss = train_epoch_utility(model, finetune_optimizer, scheduler, 224 | # loss_fn, regularizer, finetune_loader, device) 225 | 226 | # valid_pred = valid_epoch(model, valid_loader, device) 227 | # valid_auc, valid_score = get_valid_score(valid_pred, valid, 228 | # f=median_avg, threshold=0.5, target_cols=target_cols) 229 | # model_file = MODEL_DIR+f"/resmlp_seed_{SEED}_util_{int(valid_score)}_auc_{valid_auc:.4f}.pth" 230 | # es(valid_auc, model, model_path=model_file, epoch_utility_score=valid_score) 231 | 232 | # print(f"\nEPOCH:{epoch:2d} tr_loss:{train_loss:.2f} " 233 | # f"val_utility:{valid_score:.2f} valid_auc:{valid_auc:.4f} " 234 | # f"epoch time: {time() - start_time:.1f}sec " 235 | # f"early stop counter: {es.counter}\n") 236 | 237 | # if es.early_stop: 238 | # print("\nEarly stopping") 239 | # break 240 | 241 | # torch.save(model.state_dict(), MODEL_DIR+f"/resmlp_finetune_fold_{_fold}.pth") 242 | # %% 243 | if DEBUG: 244 | resp_cols = ['resp', 'resp_4'] 245 | resp_index = [resp_cols_all.index(r) for r in resp_cols] 246 | regularizer = UtilityLoss(alpha=1e-1, scaling=12, 247 | normalize=None, resp_index=resp_index) 248 | data = next(iter(finetune_loader)) 249 | optimizer.zero_grad() 250 | features = data['features'].to(device) 251 | label = data['label'].to(device) 252 | weights = data['weight'].to(device) 253 | resp = data['resp'].to(device) 254 | date = data['date'].to(device) 255 | model.eval() 256 | outputs = model(features) 257 | loss = loss_fn(outputs, label) 258 | # reg = regularizer(outputs, resp, weights=weight, date=date) 259 | 260 | targets = resp 261 | inputs = outputs 262 | alpha = 1e-3 263 | if resp_index is not None and len(resp_index) < 5: 264 | inputs = outputs[..., resp_index] 265 | targets = targets[..., resp_index] 266 | 267 | inputs = F.sigmoid(10*inputs) 268 | n_targets = inputs.size(-1) 269 | if n_targets > 1: 270 | weights = weights.repeat((n_targets, 1)) 271 | date = date.repeat((n_targets, 1)) 272 | 273 | # flatten label and prediction tensors 274 | inputs = inputs.view(-1) 275 | targets = targets.view(-1) 276 | weights = weights.view(-1) 277 | date = date.view(-1) 278 | 279 | dates = date.unique().detach() 280 | ndays = len(dates) 281 | 282 | Pi = torch.zeros((ndays, 1), device=device, dtype=torch.float32) 283 | for i, day in enumerate(dates): 
284 | mask = (date == day) 285 | Pi[i] = (weights[mask]*targets[mask]*inputs[mask]).sum() 286 | 287 | sumPi = Pi.sum() 288 | loss = -alpha*sumPi*(sumPi.clamp(min=0))/ndays 289 | 290 | # loss.backward() 291 | # %% 292 | # %% 293 | if DEBUG: 294 | model.train() 295 | final_loss = 0 296 | data = next(iter(train_loader)) 297 | optimizer.zero_grad() 298 | _features = data['features'].to(device) 299 | _label = data['label'].to(device) 300 | _weights = torch.log(1+data['weight']).to(device) 301 | _outputs = model(_features) 302 | 303 | targets = SmoothBCEwLogits._smooth(_label, _outputs.size(-1), 0.005) 304 | _loss = F.binary_cross_entropy_with_logits(_outputs, _label, weight=_weights) -------------------------------------------------------------------------------- /lgb/v01_explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 26 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", 27 | "# For example, here's several helpful packages to load\n", 28 | "\n", 29 | "import numpy as np # linear algebra\n", 30 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 31 | "\n", 32 | "# Input data files are available in the read-only \"../input/\" directory\n", 33 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 34 | "\n", 35 | "import os\n", 36 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 37 | " for filename in filenames:\n", 38 | " print(os.path.join(dirname, filename))\n", 39 | " \n", 40 | "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", 41 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "\n" 70 | ], 71 | "text/plain": [ 72 | "" 73 | ] 74 | }, 75 | "metadata": {}, 76 | "output_type": "display_data" 77 | } 78 | ], 79 | "source": [ 80 | "import sys, inspect\n", 81 | "import os, gc\n", 82 | "import numpy as np\n", 83 | "from numba import njit\n", 84 | "import datatable as dtable\n", 85 | "import pandas as pd\n", 86 | "import xgboost as xgb\n", 87 | "from hyperopt import hp, fmin, tpe, Trials\n", 88 | "from hyperopt.pyll.base import scope\n", 89 | "from sklearn.metrics import roc_auc_score, roc_curve\n", 90 | "from sklearn.model_selection import GroupKFold\n", 91 | "import matplotlib.pyplot as plt\n", 92 | "from matplotlib.colors import ListedColormap\n", 93 | "from tqdm.notebook import tqdm\n", 94 | "from joblib import dump, load" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "TRAINING = True\n", 104 | "ENSEMBLE = False\n", 105 | "FOLDS = 4\n", 
106 | "SEED = 42\n", 107 | "DATA_ROOT = '/storage1/lu/Active/tianyang/Workspace/janestreet/'\n", 108 | "DATA_FILE = DATA_ROOT + 'train.feather' # feather is faster than csv\n", 109 | "\n", 110 | "# janestreet\n", 111 | "currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))\n", 112 | "libdir = os.path.join(currentdir, '..', 'data')\n", 113 | "sys.path.insert(0, libdir) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "# Train" 121 | ] 122 | }, 123 | { 124 | "cell_type": "raw", 125 | "metadata": {}, 126 | "source": [ 127 | "train = pd.read_feather(DATA_FILE)\n", 128 | "train = train.query('date > 85').reset_index(drop = True) \n", 129 | "train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use\n", 130 | "train.fillna(train.mean(),inplace=True)\n", 131 | "train = train.query('weight > 0').reset_index(drop = True)\n", 132 | "#train['action'] = (train['resp'] > 0).astype('int')\n", 133 | "train['action'] = ((train['resp_1'] > 0 ) & (train['resp_2'] > 0 ) & \\\n", 134 | " (train['resp_3'] > 0 ) & (train['resp_4'] > 0 ) & \\\n", 135 | " (train['resp'] > 0 )).astype('int')\n", 136 | "features = [c for c in train.columns if 'feature' in c]\n", 137 | "\n", 138 | "resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']\n", 139 | "\n", 140 | "X = train[features].values\n", 141 | "y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget\n", 142 | "\n", 143 | "f_mean = np.mean(train[features[1:]].values,axis=0)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Loading...\n", 156 | "Filling...\n", 157 | "Finish.\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "print('Loading...')\n", 163 | "train = pd.read_feather(DATA_FILE)\n", 164 | "features = [c for c in train.columns if 'feature' in c]\n", 165 | "\n", 166 | "print('Filling...')\n", 167 | "train = train.query('weight > 0').reset_index(drop = True)\n", 168 | "train[features] = train[features].fillna(method = 'ffill').fillna(0)\n", 169 | "train['action'] = (train['resp'] > 0).astype('int')\n", 170 | "\n", 171 | "print('Finish.')\n", 172 | "\n", 173 | "X_tr, y_tr = train.loc[train['date'] > 85, features].values, train.loc[train['date'] > 85, 'action'].values\n", 174 | "d_tr = xgb.DMatrix(X_tr, y_tr)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "CPU times: user 5h 47min 35s, sys: 11min 22s, total: 5h 58min 58s\n", 187 | "Wall time: 22min 37s\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "%%time\n", 193 | "from sklearn.model_selection import cross_validate\n", 194 | "from sklearn.model_selection import TimeSeriesSplit\n", 195 | "from xgboost import XGBClassifier\n", 196 | "\n", 197 | "# Seed Blending\n", 198 | "models = []\n", 199 | "\n", 200 | "p_best = {\n", 201 | " 'learning_rate': 0.014106988708201764,\n", 202 | " 'max_depth': 8, \n", 203 | " 'gamma': 9.800749651802157, \n", 204 | " 'min_child_weight': 0.3032862674190433, \n", 205 | " 'subsample': 0.4648851101943981, \n", 206 | " 'colsample_bytree': 0.994909039539885, \n", 207 | " 'objective': 'binary:logistic',\n", 208 | " 'eval_metric': 'auc', \n", 209 | " 'tree_method': 'hist', \n", 210 | " }\n", 211 | " 
\n", 212 | "if TRAINING:\n", 213 | " # scores = cross_validate(clf, X, yy, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=10)\n", 214 | " \n", 215 | " if ENSEMBLE:\n", 216 | " for seed in range(5):\n", 217 | " p_best['random_state'] = seed\n", 218 | " clf = xgb.train(p_best, d_tr, 950)\n", 219 | " models.append(clf)\n", 220 | "\n", 221 | " rubbish = gc.collect()\n", 222 | " else:\n", 223 | " clf = xgb.train(p_best, d_tr, 950)\n", 224 | " models = [clf]\n", 225 | "\n", 226 | " dump(clf, 'xgb.joblib')\n", 227 | " \n", 228 | "else:\n", 229 | " clf = load('xgb.joblib')\n", 230 | " models = [clf]\n", 231 | " " 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "# Predict" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 8, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "@njit\n", 248 | "def fast_fillna(array, values):\n", 249 | " if np.isnan(array.sum()):\n", 250 | " array = np.where(np.isnan(array), values, array)\n", 251 | " return array" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 9, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "application/vnd.jupyter.widget-view+json": { 262 | "model_id": "15004d286cef44d0ac949661c7771428", 263 | "version_major": 2, 264 | "version_minor": 0 265 | }, 266 | "text/plain": [ 267 | "| | 0/? [00:00 0:\n", 283 | " x_tt = test_df.loc[:, features].values\n", 284 | " x_tt[0, :] = fast_fillna(x_tt[0, :], tmp)\n", 285 | " tmp = x_tt[0, :]\n", 286 | " d_tt = xgb.DMatrix(x_tt)\n", 287 | " pred = 0.\n", 288 | " for clf in models:\n", 289 | " pred += clf.predict(d_tt) / len(models)\n", 290 | " pred_df.action = np.where(pred >= opt_th, 1, 0).astype(int)\n", 291 | " else:\n", 292 | " pred_df.action = 0\n", 293 | " env.predict(pred_df)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 10, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "6143" 305 | ] 306 | }, 307 | "execution_count": 10, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "pred = pd.read_csv('submission.csv')\n", 314 | "pred['action'].sum()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 11, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "15219" 326 | ] 327 | }, 328 | "execution_count": 11, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "len(pred['action'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.7.9" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 4 366 | } 367 | -------------------------------------------------------------------------------- /cv_final.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import datetime 3 | import gc 4 | import os 5 | HOME = os.path.dirname(os.path.abspath(__file__)) 6 | 
MODEL_DIR = HOME+'/models/' 7 | DATA_DIR = HOME+'/data/' 8 | # from mlp.mlp import * 9 | from utils import * 10 | from utils_js import * 11 | from mlp.tf_models import * 12 | from mlp.mlp import * 13 | 14 | import random 15 | import sys 16 | 17 | import datatable as dt 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import pandas as pd 21 | import torch 22 | from numba import njit 23 | from sklearn.metrics import roc_auc_score 24 | from tqdm import tqdm 25 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 26 | tf.config.optimizer.set_jit(True) 27 | 28 | device = torch.device('cpu') 29 | # %% 30 | 31 | ''' 32 | Various setup for different models 33 | ''' 34 | CV_START_DAY = 401 35 | CV_DAYS = 32 36 | 37 | features = [f'feature_{i}' for i in range(130)] 38 | 39 | features_t = features+ ['cross_41_42_43', 'cross_1_2'] 40 | 41 | resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4'] 42 | target_cols = ['action_1', 'action_2', 'action_3', 'action', 'action_4'] 43 | 44 | resp_cols_vol = ['resp_3', 'resp', 'resp_4'] 45 | target_cols_vol = ['action_3', 'action', 'action_4'] 46 | # split features for a ResNet feature 2 is more important 47 | features_2_index = [0, 1, 2, 3, 4, 5, 6, 15, 16, 25, 26, 35, 48 | 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49 | 49, 50, 51, 52, 53, 54, 59, 60, 61, 62, 63, 64, 65, 50 | 66, 67, 68, 69, 70, 71, 76, 77, 82, 83, 88, 89, 94, 51 | 95, 100, 101, 106, 107, 112, 113, 118, 119, 128, 129] 52 | 53 | features_1_index = [0] + list(set(range(130)).difference(features_2_index)) 54 | 55 | features_1 = [f'feature_{i}' for i in features_1_index] 56 | 57 | features_2 = [f'feature_{i}' for i in features_2_index] 58 | 59 | 60 | # split features for a ResNet feature 2 is more important 61 | features_1_index_v = [0, 62 | 7, 8, 17, 18, 27, 28, 55, 72, 78, 84, 90, 96, 102, 108, 114, 120, 121, 63 | 11, 12, 21, 22, 31, 32, 57, 74, 80, 86, 92, 98, 104, 110, 116, 124, 125] 64 | # resp_1 resp_2 feat 65 | 66 | features_2_index_v = [0] + list(set(range(130)).difference(features_1_index_v)) 67 | 68 | features_1_v = [f'feature_{i}' for i in features_1_index_v] 69 | 70 | features_2_v = [f'feature_{i}' for i in features_2_index_v] 71 | 72 | 73 | feat_spike_index = [1, 2, 3, 4, 5, 6, 10, 14, 16, 69, 70, 71, 73, 74, 75, 76, 79, 80, 81, 82, 85, 74 | 86, 87, 88, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 109, 111, 112, 115, 117, 118] 75 | features_spike = [f'feature_{i}' for i in feat_spike_index] 76 | 77 | cat_cols = [f+'_c' for f in features_spike] 78 | 79 | #%% 80 | ''' 81 | Loading model trained in tf and verify their utility scores 82 | ''' 83 | train_parquet = os.path.join(DATA_DIR, 'train_pdm.parquet') 84 | train = pd.read_parquet(train_parquet) 85 | train['action'] = (train['resp'] > 0).astype(int) 86 | for c in range(1,5): 87 | train['action_'+str(c)] = (train['resp_'+str(c)] > 0).astype(int) 88 | 89 | train['cross_41_42_43'] = train['feature_41'] + train['feature_42'] + train['feature_43'] 90 | train['cross_1_2'] = train['feature_1'] / (train['feature_2'] + 1e-5).astype(np.float32) 91 | 92 | most_common_vals = np.load(DATA_DIR+'spike_common_vals_42.npy').reshape(-1) 93 | for i, feat in tqdm(enumerate(features_spike)): 94 | train[feat+'_c'] = (train[feat] - most_common_vals[i]).astype(np.int32) 95 | #%% 96 | ''' 97 | Final model resnet 98 | ''' 99 | # model_files = ['resnet_reg_fold_0_seed_1127802.h5', 100 | # 'resnet_reg_fold_1_seed_1127802.h5', 101 | # 'resnet_reg_fold_2_seed_1127802.h5'] 102 | model_files = ['resnet_reg_fold_1_res_seed_792734.h5', 
103 | 'resnet_reg_fold_2_res_seed_97275.h5'] 104 | 105 | # model_files = ['resnet_reg_fold_0_seed_157157.h5', 106 | # 'resnet_reg_fold_1_seed_157157.h5', 107 | # 'resnet_reg_fold_2_seed_157157.h5'] 108 | for _fold, model_file in enumerate(model_files): 109 | print(f"Model {model_file}") 110 | tf.keras.backend.clear_session() 111 | tf_model = create_resnet_reg(len(features_1), len(features_2), len(resp_cols), 112 | hidden_size=256, label_smoothing=5e-03) 113 | 114 | tf_model.load_weights(os.path.join(MODEL_DIR, model_file)) 115 | # tf_model.call = tf.function(tf_model.call, experimental_relax_shapes=True) 116 | 117 | print_valid_score_tf(train, tf_model, start_day=400, num_days=33, 118 | f=median_avg, threshold=0.5, 119 | feature_indices=(features, features_1_index, features_2_index)) 120 | 121 | #%% 122 | ''' 123 | Final model resnet 124 | ''' 125 | # model_files = ['resnet_volatile_fold_0_seed_1127802.h5', 126 | # 'resnet_volatile_fold_1_seed_1127802.h5', 127 | # 'resnet_volatile_fold_2_seed_1127802.h5'] 128 | # model_files = ['resnet_volatile_fold_0_seed_157157.h5', 129 | # 'resnet_volatile_fold_1_seed_157157.h5', 130 | # 'resnet_volatile_fold_2_seed_157157.h5'] 131 | 132 | # model_files = ['resnet_volatile_fold_0_seed_745273.h5', 133 | # 'resnet_volatile_fold_2_seed_962656.h5'] 134 | model_files = ['resnet_volatile_fold_0_seed_5567273.h5', 135 | 'resnet_volatile_fold_1_seed_123835.h5', 136 | 'resnet_volatile_fold_2_seed_676656.h5'] 137 | 138 | for _fold, model_file in enumerate(model_files): 139 | print(f"Model {model_file}") 140 | tf.keras.backend.clear_session() 141 | tf_model = create_resnet(len(features_1_v), len(features_2_v), len(resp_cols_vol), 142 | hidden_size=256, label_smoothing=5e-03) 143 | 144 | tf_model.load_weights(os.path.join(MODEL_DIR, model_file)) 145 | # tf_model.call = tf.function(tf_model.call, experimental_relax_shapes=True) 146 | 147 | print_valid_score_tf(train, tf_model, start_day=400, num_days=33, 148 | f=median_avg, threshold=0.5, 149 | feature_indices=(features, features_1_index_v, features_2_index_v)) 150 | # %% 151 | ''' 152 | Final model ae+mlp, 5 targets 153 | ''' 154 | # encoder_file = 'encoder_reg.hdf5' 155 | # model_files = ['ae_reg_fold_0.hdf5', 156 | # 'ae_reg_fold_1.hdf5', 157 | # 'ae_reg_fold_2.hdf5'] 158 | # hp_file = 'hp_ae_reg.pkl' 159 | 160 | 161 | # encoder_file = 'encoder_692874.hdf5' 162 | # model_files = ['model_692874_0.hdf5', 163 | # 'model_692874_1.hdf5', 164 | # 'model_692874_2.hdf5'] 165 | # hp_file = 'best_hp_692874.pkl' 166 | 167 | encoder_file = 'ae_encoder_157157.hdf5' 168 | model_files = ['ae_157157_0.hdf5', 169 | 'ae_157157_1.hdf5', 170 | 'ae_157157_2.hdf5'] 171 | hp_file = 'ae_hp_157157.pkl' 172 | 173 | _, encoder = create_autoencoder(len(features), len(resp_cols), noise=0.1) 174 | 175 | encoder.load_weights(os.path.join(MODEL_DIR, encoder_file)) 176 | encoder.trainable = False 177 | 178 | model_fn = lambda hp: create_model(hp, len(features), len(resp_cols), encoder) 179 | 180 | hp = pd.read_pickle(os.path.join(MODEL_DIR, hp_file)) 181 | for _fold, model_file in enumerate(model_files): 182 | tf.keras.backend.clear_session() 183 | print(f"Model {model_file}") 184 | model = model_fn(hp) 185 | model.load_weights(os.path.join(MODEL_DIR, model_files[_fold])) 186 | 187 | print_valid_score_tf(train, model, start_day=400, num_days=33, 188 | f=median_avg, threshold=0.5, 189 | feature_indices=[features]) 190 | # %% 191 | ''' 192 | Final model ae+mlp 193 | ''' 194 | # volatile models, 3 targets 195 | # encoder_file = 
'encoder_volatile.hdf5' 196 | # model_files = ['ae_volatile_fold_0.hdf5', 197 | # 'ae_volatile_fold_1.hdf5', 198 | # 'ae_volatile_fold_2.hdf5'] 199 | # hp_file = 'hp_ae_volatile.pkl' 200 | 201 | 202 | # encoder_file = 'v_encoder_969725.hdf5' 203 | # model_files = ['v_model_969725_0.hdf5', 204 | # 'v_model_969725_1.hdf5', 205 | # 'v_model_969725_2.hdf5'] 206 | # hp_file = 'v_best_hp_969725.pkl' 207 | 208 | # encoder_file = 'v_encoder_618734.hdf5' 209 | # model_files = ['v_model_618734_0.hdf5', 210 | # 'v_model_618734_1.hdf5', 211 | # 'v_model_618734_2.hdf5'] 212 | # hp_file = 'v_best_hp_618734.pkl' 213 | 214 | encoder_file = 'ae_vol_encoder_283467.hdf5' 215 | model_files = ['ae_vol_283467_0.hdf5', 216 | 'ae_vol_283467_1.hdf5', 217 | 'ae_vol_283467_2.hdf5'] 218 | hp_file = 'ae_vol_hp_283467.pkl' 219 | 220 | _, encoder = create_autoencoder(len(features), len(resp_cols_vol), noise=0.1) 221 | 222 | encoder.load_weights(os.path.join(MODEL_DIR, encoder_file)) 223 | encoder.trainable = False 224 | 225 | model_fn = lambda hp: create_model(hp, len(features), len(resp_cols_vol), encoder) 226 | 227 | hp = pd.read_pickle(os.path.join(MODEL_DIR, hp_file)) 228 | for _fold, model_file in enumerate(model_files): 229 | tf.keras.backend.clear_session() 230 | print(f"Model {model_file}") 231 | model = model_fn(hp) 232 | model.load_weights(os.path.join(MODEL_DIR, model_files[_fold])) 233 | 234 | print_valid_score_tf(train, model, start_day=400, num_days=33, 235 | f=median_avg, threshold=0.5, 236 | feature_indices=[features]) 237 | 238 | #%% 239 | model_files = ['tf_spike_reg_seed_1127802_fold_0.h5', 240 | 'tf_spike_reg_seed_1127802_fold_1.h5', 241 | # 'tf_spike_reg_seed_1127802_fold_2.h5', 242 | 'tf_spike_reg_seed_802_fold_2.h5' 243 | ] 244 | 245 | for _fold, model_file in enumerate(model_files): 246 | print(f"Model {model_file}") 247 | tf.keras.backend.clear_session() 248 | tf_model = create_spikenet(len(features_1), len(features_2), len(cat_cols), len(resp_cols), 249 | hidden_size=256, label_smoothing=5e-03) 250 | 251 | tf_model.load_weights(os.path.join(MODEL_DIR, model_file)) 252 | # tf_model.call = tf.function(tf_model.call, experimental_relax_shapes=True) 253 | 254 | print_valid_score_tf(train, tf_model, start_day=400, num_days=33, 255 | f=median_avg, threshold=0.5, 256 | feature_indices=(features, features_1_index, features_2_index, feat_spike_index)) 257 | # %% 258 | 259 | model_files = ['emb_volatile_fold_0_util_1445_auc_0.5550.pth', 260 | 'emb_volatile_fold_1_util_1225_auc_0.5557.pth', 261 | 'emb_volatile_fold_2_util_240_auc_0.5455.pth'] 262 | 263 | 264 | for _fold, model_file in enumerate(model_files): 265 | model = SpikeNet() 266 | model.to(device) 267 | model_weights = os.path.join(MODEL_DIR, model_file) 268 | model.load_state_dict(torch.load(model_weights, map_location='cpu')) 269 | model.eval(); 270 | print(f"\n\nModel {model_file}") 271 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 272 | batch_size =8192, f=median_avg, threshold=0.5, 273 | target_cols=target_cols, 274 | feat_cols=features, 275 | resp_cols=resp_cols, 276 | cat_input=cat_cols) 277 | # %% 278 | model_files = ['pt_volatile_0_util_1424_auc_0.5520.pth', 279 | 'pt_volatile_1_util_1137_auc_0.5470.pth', 280 | 'pt_volatile_2_util_322_auc_0.5444.pth'] 281 | 282 | 283 | for _fold, model_file in enumerate(model_files): 284 | model = ResidualMLP(input_size=len(features_t), hidden_size=256, output_size=len(target_cols)) 285 | model.to(device) 286 | model_weights = os.path.join(MODEL_DIR, model_file) 287 | 
model.load_state_dict(torch.load(model_weights, map_location='cpu')) 288 | model.eval(); 289 | print(f"\nModel {model_file}") 290 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 291 | batch_size = 8192, f=median_avg, threshold=0.5, 292 | target_cols=target_cols, 293 | feat_cols=features_t, 294 | resp_cols=resp_cols) 295 | # %% 296 | # %% 297 | model_files = ['final_0_util_1372_auc_0.5483.pth', 298 | 'final_1_util_865_auc_0.5450.pth', 299 | 'final_2_util_507_auc_0.5428.pth'] 300 | 301 | 302 | for _fold, model_file in enumerate(model_files): 303 | model = ResidualMLP(input_size=len(features_t), hidden_size=256, output_size=len(target_cols)) 304 | model.to(device) 305 | model_weights = os.path.join(MODEL_DIR, model_file) 306 | model.load_state_dict(torch.load(model_weights, map_location='cpu')) 307 | model.eval(); 308 | print(f"\nModel {model_file}") 309 | print_all_valid_score(train, model, start_day=CV_START_DAY, num_days=CV_DAYS, 310 | batch_size = 8192, f=median_avg, threshold=0.5, 311 | target_cols=target_cols, 312 | feat_cols=features_t, 313 | resp_cols=resp_cols) 314 | # %% 315 | --------------------------------------------------------------------------------
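The validation helpers used above (print_valid_score_tf, print_all_valid_score) report the competition utility of each candidate model on the held-out validation days. A minimal NumPy sketch of that utility, as it is usually defined for this competition, is given below for reference; the function name utility_score and the toy data are assumptions for illustration only.

import numpy as np
import pandas as pd

def utility_score(date, weight, resp, action):
    # daily PnL: p_i = sum_j w_ij * resp_ij * a_ij
    pnl = pd.DataFrame({'date': date, 'p': weight * resp * action})
    p = pnl.groupby('date')['p'].sum().values
    # annualised Sharpe-like factor t, clipped to [0, 6]
    t = p.sum() / np.sqrt((p ** 2).sum()) * np.sqrt(250 / len(p))
    return min(max(t, 0.0), 6.0) * p.sum()

# toy sanity check with oracle actions (act exactly when resp > 0)
rng = np.random.default_rng(0)
n = 1000
date = rng.integers(0, 10, n)
weight = rng.random(n)
resp = rng.normal(0.0, 1e-2, n)
print(utility_score(date, weight, resp, (resp > 0).astype(int)))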