├── save
│   └── .gitkeep
├── features
│   └── .gitkeep
├── submission
│   └── .gitkeep
├── _figure
│   └── overview.png
├── extended_abstract
│   ├── report_task_3.pdf
│   └── report_task_1_2.pdf
├── src
│   ├── dataset.py
│   ├── create_folds.py
│   ├── utils.py
│   ├── model.py
│   ├── train_cat.py
│   ├── utils_model.py
│   ├── train_lgbm.py
│   ├── train_xgb.py
│   ├── feature_extraction_preds_meta.py
│   ├── utils_feature.py
│   ├── stacking.py
│   └── train_mlp_multitask.py
├── task3
│   └── README.md
├── README.md
├── .gitignore
├── run
│   └── solution.sh
└── exp
    ├── task2_lgbm
    │   └── config.py
    ├── task1_cat
    │   └── config.py
    ├── task1_lgbm
    │   └── config.py
    ├── task1_lgbm_2
    │   └── config.py
    └── task2_lgbm_fs100
        └── config.py

/save/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/features/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/submission/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/_figure/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haradai1262/NeurIPS-Education-Challenge-2020/HEAD/_figure/overview.png
--------------------------------------------------------------------------------
/extended_abstract/report_task_3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haradai1262/NeurIPS-Education-Challenge-2020/HEAD/extended_abstract/report_task_3.pdf
--------------------------------------------------------------------------------
/extended_abstract/report_task_1_2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haradai1262/NeurIPS-Education-Challenge-2020/HEAD/extended_abstract/report_task_1_2.pdf
--------------------------------------------------------------------------------
/src/dataset.py:
--------------------------------------------------------------------------------

import torch


class SimpleDataLoader:
    def __init__(self, dataset, batch_size=1, shuffle=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        assert all([dataset[i].size(0) == dataset[0].size(0) for i in range(len(dataset))]), 'all the elements must have the same length'
        self.data_size = dataset[0].size(0)

    def __iter__(self):
        self._i = 0

        if self.shuffle:
            index_shuffle = torch.randperm(self.data_size)
            self.dataset = [v[index_shuffle] for v in self.dataset]

        return self

    def __next__(self):

        i1 = self.batch_size * self._i
        i2 = min(self.batch_size * (self._i + 1), self.data_size)

        if i1 >= self.data_size:
            raise StopIteration()

        value = [v[i1:i2] for v in self.dataset]
        self._i += 1
        return value
--------------------------------------------------------------------------------
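`SimpleDataLoader` batches a list of parallel tensors without copying them into a `torch.utils.data.Dataset`. A minimal usage sketch (the tensor shapes are illustrative, and it assumes running from `src/`):

```python
import torch

from dataset import SimpleDataLoader

# two parallel tensors (e.g. features and labels) with the same first dimension
x = torch.arange(10).float().unsqueeze(1)   # shape (10, 1)
y = (torch.rand(10) > 0.5).long()           # shape (10,)

loader = SimpleDataLoader([x, y], batch_size=4, shuffle=True)
for batch_x, batch_y in loader:
    # the last batch may be smaller than batch_size (here: 4, 4, 2 rows)
    print(batch_x.shape, batch_y.shape)
```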
/src/create_folds.py:
--------------------------------------------------------------------------------
import os
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

INPUT_DIR = os.environ.get('INPUT_DIR')
FOLD_DIR = os.environ.get('FOLD_DIR')

# environment variables are strings; n_splits and random_state need ints
RANDOM_STATE = int(os.environ.get('RANDOM_STATE'))

FOLD_NAME = os.environ.get('FOLD_NAME')
FOLD_NUM = int(os.environ.get('FOLD_NUM'))

if __name__ == "__main__":

    data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv'
    X = pd.read_csv(data_path)
    y = X['IsCorrect']

    X.loc[:, 'kfold'] = 0
    if FOLD_NAME == 'mskf_user':
        mskf = MultilabelStratifiedKFold(n_splits=FOLD_NUM, random_state=RANDOM_STATE, shuffle=True)
        labels = X[['IsCorrect', 'UserId']].values
        splits = mskf.split(X.values, labels)

        for fold, (trn, val) in enumerate(splits):
            X.loc[val, 'kfold'] = fold

    print(X.kfold.value_counts())
    X[['kfold']].to_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv', index=False)
--------------------------------------------------------------------------------
/task3/README.md:
--------------------------------------------------------------------------------
# Task 3: Global Question Quality Assessment

## Our Solution
To devise suitable metrics for measuring the quality of the diagnostic questions, we formed a hypothesis that an appropriate diagnostic question strikes
- a balance between the choice of answers
- an appropriate level of difficulty
- readability

Based on this hypothesis, we compute the features below (a minimal sketch of the two entropy features follows the list).
1. Selection entropy
   - Utilizes the variation of `AnswerValue`
2. Correct/wrong entropy
   - Utilizes the variation of `IsCorrect`
3. Difficulty
   - Computes the difference between the mean correctness rate of a student who answered a question and whether the student's answer to that question is correct or wrong
4. Readability
   - Extracts text regions from a question image and then computes the proportion of text area to the whole area of the image using [CRAFT](https://arxiv.org/pdf/1904.01941.pdf)
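A minimal sketch of features 1 and 2; the CSV path and column handling are illustrative assumptions, not the notebook's exact code:

```python
import numpy as np
import pandas as pd


def entropy(counts: pd.Series) -> float:
    # Shannon entropy of a discrete distribution given raw counts
    p = counts / counts.sum()
    return float(-(p * np.log2(p + 1e-12)).sum())


train = pd.read_csv('../data/train_data/train_task_1_2.csv')

# 1. selection entropy: how evenly the four answer options are chosen
selection_entropy = train.groupby('QuestionId')['AnswerValue'].apply(
    lambda s: entropy(s.value_counts()))

# 2. correct/wrong entropy: how balanced correct and wrong responses are
correct_wrong_entropy = train.groupby('QuestionId')['IsCorrect'].apply(
    lambda s: entropy(s.value_counts()))
```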
## How to Use
1. Apply CRAFT to each question image
   - See the details at https://github.com/clovaai/CRAFT-pytorch
2. Put the result files into `../data/images_text-segmentation`
3. Run `task3-solution.ipynb`
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Solution of NeurIPS-Education-Challenge-2020

This repository contains the code for the task 1 and task 2 solutions of The NeurIPS 2020 Education Challenge.
The task 3 code is located [here](https://github.com/haradai1262/NeurIPS-Education-Challenge-2020/tree/main/task3).

## Competition results

- Task 1 (Predict Student Responses – Right or Wrong) - 1st place
- Task 2 (Predict Student Responses – Answer Prediction) - 2nd place
- Task 3 (Global Question Quality Assessment) - 1st place (in this task there was a four-way tie)
- Combined - 1st place

Competition website: https://eedi.com/projects/neurips-education-challenge

## Overview

![overview](./_figure/overview.png)

## Usage

### Download dataset

- Register at CodaLab and join "Diagnostic Questions - The NeurIPS 2020 Education Challenge"
  - https://competitions.codalab.org/competitions/25449
- Download the data (https://competitions.codalab.org/competitions/25449#participate)

### Run

```
$ cd ./run
$ sh solution.sh
```

## Folder structure

```
NeurIPS-Education-Challenge-2020

├── exp                  # experiment configs
│   └── (experiment name)
│       └── config.py

├── features             # extracted feature files
│   └── (feature_name).feather

├── folds                # cross-validation fold tables
│   └── (folds_name).csv

├── data
│   ├── train_data
│   │   └── train_task_1_2.csv
│   └── metadata
│       ├── answer_metadata_task_1_2.csv
│       ├── question_metadata_task_1_2.csv
│       └── student_metadata_task_1_2.csv

├── run
│   └── solution.sh

├── save                 # experiment result files
│   └── (experiment name)/
│       ├── model_log/
│       ├── model_weight/
│       ├── preds_val_task1_{run_id}.csv
│       └── preds_test_task1_{run_id}.csv

├── src
│   ├── create_folds.py
│   ├── dataset.py
│   ├── feature_extraction.py
│   ├── model.py
│   ├── train_5fold.py
│   ├── utils.py
│   ├── utils_feature.py
│   └── utils_model.py

└── submission           # submission files
    └── (experiment name)/
        ├── submission_task1__auc{local auc}__acc{local acc}__th{selected threshold value}.zip
        └── submission_task2__acc{local acc}.zip
```

## Contact

- Email: daichi.takehara0730@gmail.com (Daichi Takehara)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
.ipynb_checkpoints
tmp
_tmp
notebook_output
__pycache__/
.vscode/
secrets/
secret/
mlruns/

data/
starter_kit/
submissions/
submission_templates/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import random
import os
import time
from contextlib import contextmanager
import torch
import logging
import cloudpickle


def load_from_pkl(load_path):
    frb = open(load_path, 'rb')
    obj = cloudpickle.loads(frb.read())
    return obj


def save_as_pkl(obj, save_path):
    fwb = open(save_path, 'wb')
    fwb.write(cloudpickle.dumps(obj))
    return


def seed_everything(seed=46):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for i, col in enumerate(df.columns):
        try:
            col_type = df[col].dtype

            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    # float16 is deliberately skipped to avoid precision loss
                    df[col] = df[col].astype(np.float32)
        except ValueError:
            continue

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


class Timer:
    def __init__(self):
        self.processing_time = 0

    @contextmanager
    def timer(self, name):
        logging.info(f'[{name}] start')
        t0 = time.time()
        yield
        t1 = time.time()
        processing_time = t1 - t0
        self.processing_time += round(processing_time, 2)
        if self.processing_time < 60:
            logging.info(f'[{name}] done in {processing_time:.0f} s (Total: {self.processing_time:.2f} sec)')
        elif self.processing_time < 3600:
            logging.info(f'[{name}] done in {processing_time:.0f} s (Total: {self.processing_time / 60:.2f} min)')
        else:
            logging.info(f'[{name}] done in {processing_time:.0f} s (Total: {self.processing_time / 3600:.2f} hour)')

    def get_processing_time(self):
        return round(self.processing_time, 2)
--------------------------------------------------------------------------------
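A short usage sketch for the helpers above; the dataframe is a stand-in, and the comments describe what `reduce_mem_usage` would do to it:

```python
import logging

import numpy as np
import pandas as pd

from utils import Timer, reduce_mem_usage, seed_everything

logging.basicConfig(level=logging.INFO)  # makes the Timer logs visible
seed_everything(46)

t = Timer()
with t.timer('downcast dataframe'):
    df = pd.DataFrame({'a': np.arange(1000, dtype=np.int64),
                       'b': np.random.rand(1000)})
    df = reduce_mem_usage(df)   # 'a' fits int16, 'b' becomes float32
print(t.get_processing_time(), 'sec in total')
```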
/run/solution.sh:
--------------------------------------------------------------------------------

export RANDOM_STATE='46'
export INPUT_DIR='../data'
export FEATURE_DIR='../features'
export FOLD_DIR='../folds'
export SUB_DIR='../submission'
export EXP_DIR='../exp'
export DEVICE='cuda:1'  # required by train_xgb.py, which parses the gpu id from it

# - create folds
#   - `create_folds.py`
export FOLD_NAME='mskf_user'
export FOLD_NUM='5'
python ../src/create_folds.py

# - Feature extraction
#   - `feature_extraction.py`
#   - extract features from the original data for training the models
python ../src/feature_extraction.py

# - Training the models using all features
#   - EXP_NAME: `task{1 or 2}_{model}`
#   - script: `train_{model}.py`
#   - hyperparameters and evaluation scores of each model are saved in mlflow
#   - refer to `config.py` to set the features and hyperparameters used to train each model

export EXP_NAME='task1_lgbm'    # lgb, all features, te_smooth5
python ../src/train_lgbm.py

export EXP_NAME='task1_lgbm_2'  # lgb, all features, te_smooth2
python ../src/train_lgbm.py

export EXP_NAME='task1_xgb_2'   # xgb, all features, te_smooth5
python ../src/train_xgb.py

export EXP_NAME='task1_cat'     # cat, all features, te_smooth5
python ../src/train_cat.py

export EXP_NAME='task2_lgbm'    # lgb, all features, te_smooth5
python ../src/train_lgbm.py

export EXP_NAME='task2_xgb_2'   # xgb, all features, te_smooth5
python ../src/train_xgb.py


# - Training the models using selected features
#   - EXP_NAME: `task{1 or 2 or 12}_{model}_{feature_select}`
#   - script: `train_{model}.py`

export EXP_NAME='task1_lgbm_fs100'   # lgb, fs100, te_smooth5
python ../src/train_lgbm.py

export EXP_NAME='task1_xgb_fs100'    # xgb, fs100, te_smooth5
python ../src/train_xgb.py

export EXP_NAME='task1_cat_fs100'    # cat depth 8, fs100, te_smooth5
python ../src/train_cat.py

export EXP_NAME='task1_cat_2_fs100'  # cat depth 10, fs100, te_smooth5
python ../src/train_cat.py

export EXP_NAME='task1_mlp_fs100'    # mlp, fs100, te_smooth5
python ../src/train_mlp.py

export EXP_NAME='task2_lgbm_fs100'   # lgb, fs100, te_smooth5
python ../src/train_lgbm.py

export EXP_NAME='task2_xgb_fs100'    # xgb, fs100, te_smooth5
python ../src/train_xgb.py

export EXP_NAME='task2_cat_fs100'    # cat depth 8, fs100, te_smooth5
python ../src/train_cat.py

export EXP_NAME='task2_mlp_fs100'    # mlp, fs100, te_smooth5
python ../src/train_mlp.py

export EXP_NAME='task12_multitask_mlp_fs100'  # mlp multi, fs100, te_smooth5
python ../src/train_mlp_multitask.py

# ...

# - Feature extraction from prediction results
python ../src/feature_extraction_preds_meta.py

# - Training the models using prediction-result features, 2nd stage

export EXP_NAME='task1_xgb_fs100_meta'
python ../src/train_xgb.py

export EXP_NAME='task1_xgb_fs50_meta'
python ../src/train_xgb.py

export EXP_NAME='task1_cat_fs50_meta'
python ../src/train_cat.py

export EXP_NAME='task2_xgb_fs100_meta'
python ../src/train_xgb.py

export EXP_NAME='task2_xgb_fs50_meta'
python ../src/train_xgb.py

export EXP_NAME='task12_multitask_mlp_fs50_meta'
python ../src/train_mlp_multitask.py

# - Ridge stacking
python ../src/stacking.py
--------------------------------------------------------------------------------
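The `exp/*/config.py` files consumed by the training scripts (via `sys.path.append` + `import config`) are not included in this dump. A minimal sketch of what a config for `train_lgbm.py` needs to define; every concrete value and feature name here is an illustrative assumption:

```python
# exp/task1_lgbm/config.py (hypothetical sketch)
FOLD_NAME = 'mskf_user'
FOLD_NUM = 5
RANDOM_STATE = 46

TARGET_TASK = '1'  # '1' = IsCorrect (binary), '2' = AnswerValue (4-class)

# note: the attribute names below (including the 'MDOEL' spelling)
# must match what the training scripts import
LGB_MDOEL_PARAMS = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 127,
    'seed': RANDOM_STATE,
}
LGB_TRAIN_PARAMS = {
    'num_boost_round': 10000,
    'early_stopping_rounds': 100,
    'verbose_eval': 100,
}

# feature file prefixes, resolved to {FEATURE_DIR}/{name}_train.feather etc.
dense_features = ['te_UserId_smooth5', 'te_QuestionId_smooth5']
sparse_features = ['UserId', 'QuestionId']
varlen_sparse_features = []
```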
/src/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import numpy as np

from dataset import SimpleDataLoader
from utils_model import DNN, PredictionLayer, get_varlen_pooling_list


class DNN_multitask_v2(nn.Module):
    def __init__(
        self,
        dnn_input, dnn_hidden_units, dnn_hidden_units_task, dnn_dropout,
        activation='relu', use_bn=True, l2_reg=1e-4, init_std=1e-4,
        device='cpu',
        feature_index={},
        embedding_dict={},
        dense_features=[],
        sparse_features=[],
        varlen_sparse_features=[],
    ):
        super().__init__()
        self.device = device
        self.feature_index = feature_index
        self.embedding_dict = embedding_dict
        self.dense_features = dense_features
        self.sparse_features = sparse_features
        self.varlen_sparse_features = varlen_sparse_features

        self.dnn = DNN(
            dnn_input, dnn_hidden_units,
            activation='relu', l2_reg=l2_reg, dropout_rate=dnn_dropout, use_bn=use_bn,
            init_std=init_std, device=device
        )
        self.dnn_t1 = DNN(
            dnn_hidden_units[-1], dnn_hidden_units_task,
            activation='relu', l2_reg=l2_reg, dropout_rate=dnn_dropout, use_bn=use_bn,
            init_std=init_std, device=device
        )
        self.dnn_t2 = DNN(
            dnn_hidden_units[-1], dnn_hidden_units_task,
            activation='relu', l2_reg=l2_reg, dropout_rate=dnn_dropout, use_bn=use_bn,
            init_std=init_std, device=device
        )
        self.dnn_linear_t1 = nn.Linear(dnn_hidden_units_task[-1], 1, bias=False).to(device)
        self.dnn_linear_t2 = nn.Linear(dnn_hidden_units_task[-1], 4, bias=False).to(device)

        task = 'binary'
        self.out_t1 = PredictionLayer(task,)
        self.out_t2 = nn.Softmax(dim=1)
        self.to(device)

    def forward(self, X):

        dense_value_list = [
            X[:, self.feature_index[feat]: self.feature_index[feat] + 1]
            for feat in self.dense_features
        ]
        sparse_embedding_list = [
            self.embedding_dict[feat](
                X[:, self.feature_index[feat]].long()
            ) for feat in self.sparse_features
        ]
        varlen_sparse_embedding_list = get_varlen_pooling_list(
            self.embedding_dict, X, self.feature_index, self.varlen_sparse_features, self.device
        )
        sparse_embedding_list = sparse_embedding_list + varlen_sparse_embedding_list

        sparse_dnn_input = torch.flatten(torch.cat(sparse_embedding_list, dim=-1), start_dim=1)
        dense_dnn_input = torch.flatten(torch.cat(dense_value_list, dim=-1), start_dim=1)

        dnn_input = torch.cat([sparse_dnn_input, dense_dnn_input], dim=-1)
        dnn_hidden = self.dnn(dnn_input)

        dnn_output_t1 = self.dnn_t1(dnn_hidden)
        dnn_logit_t1 = self.dnn_linear_t1(dnn_output_t1)
        y_pred_t1 = self.out_t1(dnn_logit_t1)
        y_pred_t1 = torch.clamp(y_pred_t1, min=1e-8, max=1.0 - 1e-8)

        dnn_output_t2 = self.dnn_t2(dnn_hidden)
        dnn_logit_t2 = self.dnn_linear_t2(dnn_output_t2)
        y_pred_t2 = self.out_t2(dnn_logit_t2)

        return y_pred_t1, y_pred_t2

    def predict(self, x, batch_size=256):

        model = self.eval()
        test_loader = SimpleDataLoader(
            [torch.from_numpy(x.values)],
            batch_size=batch_size,
            shuffle=False
        )

        pred_ans = []
        pred_t2_ans = []
        with torch.no_grad():
            for index, x_test in enumerate(test_loader):
                x = x_test[0].to(self.device).float()
                y_pred, y_pred_t2 = model(x)
                y_pred = y_pred.squeeze()
                y_pred_t2 = y_pred_t2.squeeze()
                pred_ans.append(y_pred.cpu().detach().numpy())
                pred_t2_ans.append(y_pred_t2.cpu().detach().numpy())

        return np.concatenate(pred_ans), np.concatenate(pred_t2_ans)
--------------------------------------------------------------------------------
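How the multitask model is wired: `feature_index` maps each feature name to its column position in the flat input matrix, and `embedding_dict` holds one embedding table per sparse feature. A minimal instantiation sketch; every shape, name, and hyperparameter below is illustrative, not taken from the configs:

```python
import torch
import torch.nn as nn

from model import DNN_multitask_v2

# input layout: [dense0, dense1, user_id] -> column indices 0, 1, 2
feature_index = {'dense0': 0, 'dense1': 1, 'user_id': 2}
embedding_dict = nn.ModuleDict({'user_id': nn.Embedding(1000, 8)})

model = DNN_multitask_v2(
    dnn_input=2 + 8,                 # dense dims + embedding dims
    dnn_hidden_units=(64, 32),       # shared trunk
    dnn_hidden_units_task=(16,),     # per-task heads
    dnn_dropout=0.1,
    feature_index=feature_index,
    embedding_dict=embedding_dict,
    dense_features=['dense0', 'dense1'],
    sparse_features=['user_id'],
    varlen_sparse_features=[],
)

X = torch.rand(4, 3)
X[:, 2] = torch.randint(0, 1000, (4,)).float()  # sparse ids in the last column
y_t1, y_t2 = model(X)  # (4, 1) correctness prob, (4, 4) answer distribution
```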
/src/train_cat.py:
--------------------------------------------------------------------------------
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import zipfile
import shutil
import logging

import mlflow
from sklearn import metrics

from catboost import CatBoostClassifier

from utils import (
    seed_everything,
    Timer,
    reduce_mem_usage,
    load_from_pkl, save_as_pkl
)

logging.basicConfig(level=logging.INFO)

DEVICE = os.environ.get('DEVICE')

EXP_NAME = os.environ.get('EXP_NAME')
EXP_DIR = os.environ.get('EXP_DIR')

INPUT_DIR = os.environ.get('INPUT_DIR')
FEATURE_DIR = os.environ.get('FEATURE_DIR')
FOLD_DIR = os.environ.get('FOLD_DIR')
SUB_DIR = os.environ.get('SUB_DIR')

sys.path.append(f'{EXP_DIR}/{EXP_NAME}')
import config

FOLD_NAME = config.FOLD_NAME
FOLD_NUM = config.FOLD_NUM
RANDOM_STATE = config.RANDOM_STATE

TARGET_TASK = config.TARGET_TASK

CAT_PARAMS = config.CAT_PARAMS

dense_features = config.dense_features
sparse_features = config.sparse_features
varlen_sparse_features = config.varlen_sparse_features


def save_mlflow(run_id, cv, acc=None, th=None):

    mlflow.log_param("fold_name", FOLD_NAME)
    mlflow.log_param("fold_num", FOLD_NUM)

    for feat in dense_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__dense__{feat}', 1)
    for feat in sparse_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__sparse__{feat}', 1)
    for feat in varlen_sparse_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__varspa__{feat}', 1)

    mlflow.log_metric("cv", cv)
    if acc is not None:
        mlflow.log_metric("acc", acc)
    if th is not None:
        mlflow.log_metric("th", th)
    return


def train_1fold(preds, preds_test, trn_idx, val_idx, X_train, y_train, x_test, cat_features_index):

    x_trn = X_train.iloc[trn_idx]
    y_trn = y_train[trn_idx]
    x_val = X_train.iloc[val_idx]
    y_val = y_train[val_idx]

    model = CatBoostClassifier(
        **CAT_PARAMS
    )
    model.fit(
        x_trn, y_trn,
        cat_features=cat_features_index,
        eval_set=(x_val, y_val),
        use_best_model=True,
        verbose=100
    )

    if TARGET_TASK == '1':
        preds[val_idx] = model.predict_proba(x_val)[:, 1]
        preds_test += model.predict_proba(x_test)[:, 1]
    elif TARGET_TASK == '2':
        preds[val_idx] = model.predict_proba(x_val)
        preds_test += model.predict_proba(x_test)

    return preds, preds_test


if __name__ == "__main__":

    t = Timer()
    with t.timer(f'fix seed RANDOM_STATE:{RANDOM_STATE}'):
        seed_everything(RANDOM_STATE)

    with t.timer('read label'):
        data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv'
        train = pd.read_csv(data_path)
        if TARGET_TASK == '1':
            y_train = train['IsCorrect'].values
        elif TARGET_TASK == '2':
            y_train = (train['AnswerValue'] - 1).values

    skip_fr = False
    if skip_fr is False:
        with t.timer('read features'):

            X_train = pd.DataFrame()
            X_test = pd.DataFrame()
            cat_features_index = []
            fidx = 0
            for feat in dense_features:
                logging.info(f'[{feat}] read feature ...')
                X_train = pd.concat([
                    X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ], axis=1)
                X_test = pd.concat([
                    X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ], axis=1)
                fidx += 1
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            for feat in sparse_features:
                logging.info(f'[{feat}] read feature ...')
                X_train = pd.concat([
                    X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ], axis=1)
                X_test = pd.concat([
                    X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ], axis=1)
                cat_features_index.append(fidx)
                fidx += 1
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')
    elif skip_fr is True:
        X_train = load_from_pkl('X_train_task1_lgbm_fs100.pkl')
        X_test = load_from_pkl('X_test_task1_lgbm_fs100.pkl')
        cat_features_index = []

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv')

    with t.timer('train CatBoost'):

        logging.info(f'Num. of Samples: {len(X_train)}')
        logging.info(f'Num. of Features: {X_train.shape[1]}')

        if TARGET_TASK == '1':
            preds = np.zeros(len(X_train))
            preds_test = np.zeros(len(X_test))
        elif TARGET_TASK == '2':
            preds = np.zeros((len(X_train), 4))
            preds_test = np.zeros((len(X_test), 4))

        for fold_idx in range(FOLD_NUM):

            logging.info(f'FOLD:{fold_idx}')

            trn_idx = folds[folds.kfold != fold_idx].index.tolist()
            val_idx = folds[folds.kfold == fold_idx].index.tolist()

            preds, preds_test = train_1fold(
                preds, preds_test,
                trn_idx, val_idx, X_train, y_train, X_test,
                cat_features_index
            )

        preds_test /= FOLD_NUM

    if not os.path.exists(f'../save/{EXP_NAME}'):
        os.mkdir(f'../save/{EXP_NAME}')

    pd.DataFrame(preds).to_csv(f'../save/{EXP_NAME}/preds_val_task{TARGET_TASK}_{run_id}.csv', index=False)
    pd.DataFrame(preds_test).to_csv(f'../save/{EXP_NAME}/preds_test_task{TARGET_TASK}_{run_id}.csv', index=False)

    if TARGET_TASK == '1':
        with t.timer('postprocess, threshold'):
            rows = []
            for th in tqdm(range(40, 60, 1)):
                th = th * 0.01
                preds_th = []
                for i in preds:
                    if i > th:
                        preds_th.append(1)
                    else:
                        preds_th.append(0)
                acc = metrics.accuracy_score(y_train, preds_th)
                rows.append([th, acc])
            acc_th = pd.DataFrame(rows, columns=['th', 'acc'])

            cv = metrics.roc_auc_score(y_train, preds)
            scores = acc_th.sort_values('acc', ascending=False).head(1).values[0]
            best_th, best_acc = scores[0], scores[1]
            logging.info(f'Val AUC: {cv}')
            logging.info(f'Val Best Acc: {best_acc} (threshold: {best_th})')

        with t.timer('make submission'):
            preds_test_th = []
            for i in preds_test:
                if i > best_th:
                    preds_test_th.append(1)
                else:
                    preds_test_th.append(0)

            test_data_path = '../submission_templates/submission_task_1_2.csv'
            sub = pd.read_csv(test_data_path)
            sub['IsCorrect'] = preds_test_th

            if not os.path.exists(f'../submission/{EXP_NAME}'):
                os.mkdir(f'../submission/{EXP_NAME}')

            sub_name = f'submission_task1__auc{cv}__acc{best_acc}__th{best_th}'
            valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}'
            if not os.path.exists(valid_sub_dir):
                os.mkdir(valid_sub_dir)

            sub.to_csv(f'{valid_sub_dir}/submission_task_1.csv', index=False)
            with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                new_zip.write(f'{valid_sub_dir}/submission_task_1.csv', arcname='submission_task_1.csv')
            shutil.rmtree(valid_sub_dir)
            mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip')

            submission_path = f'../submission/{EXP_NAME}/submission_task1__auc{cv}__acc{best_acc}__th{best_th}.csv'
            sub.to_csv(submission_path, index=False)

    elif TARGET_TASK == '2':

        preds_label = np.argmax(preds, axis=1) + 1
        preds_test_label = np.argmax(preds_test, axis=1) + 1
        cv = metrics.accuracy_score(y_train + 1, preds_label)
        logging.info(f'CV Multiclass Acc: {cv}')

        with t.timer('make submission'):
            test_data_path = '../submission_templates/submission_task_1_2.csv'
            sub = pd.read_csv(test_data_path)
            sub['AnswerValue'] = preds_test_label

            if not os.path.exists(f'{SUB_DIR}/{EXP_NAME}'):
                os.mkdir(f'{SUB_DIR}/{EXP_NAME}')

            sub_name = f'submission_task2__acc{cv}'
            valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}'
            if not os.path.exists(valid_sub_dir):
                os.mkdir(valid_sub_dir)

            sub.to_csv(f'{valid_sub_dir}/submission_task_2.csv', index=False)
            with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                new_zip.write(f'{valid_sub_dir}/submission_task_2.csv', arcname='submission_task_2.csv')
            shutil.rmtree(valid_sub_dir)
            mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip')

    if TARGET_TASK == '1':
        save_mlflow(run_id, cv, best_acc, best_th)
    elif TARGET_TASK == '2':
        save_mlflow(run_id, cv)
    mlflow.end_run()
--------------------------------------------------------------------------------
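The threshold search above loops over the predictions in pure Python for each of the 20 candidate thresholds. An equivalent, more compact form of the same 0.40–0.59 grid search (a sketch, assuming numpy arrays for labels and predictions):

```python
import numpy as np


def best_threshold(y_true: np.ndarray, preds: np.ndarray):
    # evaluate accuracy on a grid of thresholds and keep the best one
    ths = np.arange(40, 60) * 0.01
    accs = [((preds > th).astype(int) == y_true).mean() for th in ths]
    best = int(np.argmax(accs))
    return ths[best], accs[best]
```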
/src/utils_model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class DNN(nn.Module):
    """The Multi Layer Perceptron
    Input shape
        - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``.
    Output shape
        - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``.
    Arguments
        - **inputs_dim**: input feature dimension.
        - **hidden_units**: list of positive integers, the layer number and units in each layer.
        - **activation**: Activation function to use.
        - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix.
        - **dropout_rate**: float in [0,1). Fraction of the units to dropout.
        - **use_bn**: bool. Whether to use BatchNormalization before activation or not.
        - **seed**: A Python integer to use as random seed.
    """

    def __init__(self, inputs_dim, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False,
                 init_std=0.0001, dice_dim=3, seed=1024, device='cpu'):
        super(DNN, self).__init__()
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(dropout_rate)
        self.seed = seed
        self.l2_reg = l2_reg
        self.use_bn = use_bn
        if len(hidden_units) == 0:
            raise ValueError("hidden_units is empty!!")
        hidden_units = [inputs_dim] + list(hidden_units)

        print(hidden_units)

        self.linears = nn.ModuleList(
            [nn.Linear(hidden_units[i], hidden_units[i + 1]) for i in range(len(hidden_units) - 1)])

        if self.use_bn:
            self.bn = nn.ModuleList(
                [nn.BatchNorm1d(hidden_units[i + 1]) for i in range(len(hidden_units) - 1)])

        self.activation_layers = nn.ModuleList(
            [activation_layer(activation, hidden_units[i + 1], dice_dim) for i in range(len(hidden_units) - 1)])

        for name, tensor in self.linears.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=init_std)

        self.to(device)

    def forward(self, inputs):
        deep_input = inputs

        for i in range(len(self.linears)):

            fc = self.linears[i](deep_input)

            if self.use_bn:
                fc = self.bn[i](fc)

            fc = self.activation_layers[i](fc)

            fc = self.dropout(fc)
            deep_input = fc
        return deep_input


class SequencePoolingLayer(nn.Module):
    """The SequencePoolingLayer is used to apply a pooling operation (sum, mean, max) on a variable-length sequence feature/multi-value feature.

    Input shape
        - A list of two tensors [seq_value, seq_len]

        - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``

        - seq_len is a 2D tensor with shape: ``(batch_size, 1)``, indicating the valid length of each sequence.

    Output shape
        - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.

    Arguments
        - **mode**: str. Pooling operation to be used; can be sum, mean or max.

    """

    def __init__(self, mode='mean', supports_masking=False, device='cpu'):

        super(SequencePoolingLayer, self).__init__()
        if mode not in ['sum', 'mean', 'max']:
            raise ValueError('parameter mode should in [sum, mean, max]')
        self.supports_masking = supports_masking
        self.mode = mode
        self.device = device
        self.eps = torch.FloatTensor([1e-8]).to(device)
        self.to(device)

    def _sequence_mask(self, lengths, maxlen=None, dtype=torch.bool):
        # Returns a mask tensor representing the first N positions of each cell.
        if maxlen is None:
            maxlen = lengths.max()
        row_vector = torch.arange(0, maxlen, 1).to(self.device)
        matrix = torch.unsqueeze(lengths, dim=-1)
        mask = row_vector < matrix

        mask = mask.type(dtype)  # .type() is not in-place; the result must be assigned
        return mask

    def forward(self, seq_value_len_list):
        if self.supports_masking:
            uiseq_embed_list, mask = seq_value_len_list  # [B, T, E], [B, 1]
            mask = mask.float()
            user_behavior_length = torch.sum(mask, dim=-1, keepdim=True)
            mask = mask.unsqueeze(2)
        else:
            uiseq_embed_list, user_behavior_length = seq_value_len_list  # [B, T, E], [B, 1]
            mask = self._sequence_mask(user_behavior_length, maxlen=uiseq_embed_list.shape[1],
                                       dtype=torch.float32)  # [B, 1, maxlen]
            mask = torch.transpose(mask, 1, 2)  # [B, maxlen, 1]

        embedding_size = uiseq_embed_list.shape[-1]

        mask = torch.repeat_interleave(mask, embedding_size, dim=2)  # [B, maxlen, E]

        if self.mode == 'max':
            hist = uiseq_embed_list - (1 - mask) * 1e9
            hist = torch.max(hist, dim=1, keepdim=True)[0]
            return hist
        hist = uiseq_embed_list * mask.float()
        hist = torch.sum(hist, dim=1, keepdim=False)

        if self.mode == 'mean':
            hist = torch.div(hist, user_behavior_length.type(torch.float32) + self.eps)

        hist = torch.unsqueeze(hist, dim=1)
        return hist


def get_varlen_pooling_list(embedding_dict, features, feature_index, varlen_sparse_feature_columns, device):
    varlen_sparse_embedding_list = []
    for feat in varlen_sparse_feature_columns:
        seq_emb = embedding_dict[feat](
            features[:, feature_index[feat][0]:feature_index[feat][1]].long()
        )
        seq_mask = features[:, feature_index[feat][0]:feature_index[feat][1]].long() != 0
        emb = SequencePoolingLayer(
            mode='sum',
            supports_masking=True,
            device=device
        )([seq_emb, seq_mask])
        b = len(features)
        emb = emb.view(b, -1)
        varlen_sparse_embedding_list.append(emb)
    return varlen_sparse_embedding_list
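# ---------------------------------------------------------------------------
# Usage sketch for SequencePoolingLayer (shapes are illustrative, not taken
# from the training pipeline):
#
#   seq_emb = torch.rand(2, 3, 4)        # batch 2, max length 3, embedding 4
#   seq_len = torch.tensor([[2], [3]])   # valid length of each sequence
#   pool = SequencePoolingLayer(mode='mean')
#   out = pool([seq_emb, seq_len])       # -> (2, 1, 4); padded steps are masked out
# ---------------------------------------------------------------------------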
class Dice(nn.Module):
    """The Data Adaptive Activation Function in DIN, which can be viewed as a generalization of PReLU and can adaptively adjust the rectified point according to the distribution of input data.
    Input shape:
        - 2 dims: [batch_size, embedding_size(features)]
        - 3 dims: [batch_size, num_features, embedding_size(features)]
    Output shape:
        - Same shape as input.
    References
        - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf)
        - https://github.com/zhougr1993/DeepInterestNetwork, https://github.com/fanoping/DIN-pytorch
    """
    def __init__(self, emb_size, dim=2, epsilon=1e-8, device='cpu'):
        super(Dice, self).__init__()
        assert dim == 2 or dim == 3

        self.bn = nn.BatchNorm1d(emb_size, eps=epsilon)
        self.sigmoid = nn.Sigmoid()
        self.dim = dim

        # note: alpha is a plain tensor here (not an nn.Parameter), so it is
        # not trained and stays at zero
        if self.dim == 2:
            self.alpha = torch.zeros((emb_size,)).to(device)
        else:
            self.alpha = torch.zeros((emb_size, 1)).to(device)

    def forward(self, x):
        assert x.dim() == self.dim
        if self.dim == 2:
            x_p = self.sigmoid(self.bn(x))
            out = self.alpha * (1 - x_p) * x + x_p * x
        else:
            x = torch.transpose(x, 1, 2)
            x_p = self.sigmoid(self.bn(x))
            out = self.alpha * (1 - x_p) * x + x_p * x
            out = torch.transpose(out, 1, 2)
        return out


class Identity(nn.Module):
    def __init__(self, **kwargs):
        super(Identity, self).__init__()

    def forward(self, X):
        return X


def activation_layer(act_name, hidden_size=None, dice_dim=2):
    """Construct activation layers
    Args:
        act_name: str or nn.Module, name of activation function
        hidden_size: int, used for Dice activation
        dice_dim: int, used for Dice activation
    Return:
        act_layer: activation layer
    """
    if isinstance(act_name, str):
        if act_name.lower() == 'sigmoid':
            act_layer = nn.Sigmoid()
        elif act_name.lower() == 'linear':
            act_layer = Identity()
        elif act_name.lower() == 'relu':
            act_layer = nn.ReLU(inplace=True)
        elif act_name.lower() == 'dice':
            assert dice_dim
            act_layer = Dice(hidden_size, dice_dim)
        elif act_name.lower() == 'prelu':
            act_layer = nn.PReLU()
    elif issubclass(act_name, nn.Module):
        act_layer = act_name()
    else:
        raise NotImplementedError
    return act_layer


class PredictionLayer(nn.Module):
    """
    Arguments
        - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
        - **use_bias**: bool. Whether to add a bias term or not.
    """

    def __init__(self, task='binary', use_bias=True, **kwargs):
        if task not in ["binary", "multiclass", "regression"]:
            raise ValueError("task must be binary, multiclass or regression")

        super(PredictionLayer, self).__init__()
        self.use_bias = use_bias
        self.task = task
        if self.use_bias:
            self.bias = nn.Parameter(torch.zeros((1,)))

    def forward(self, X):
        output = X
        if self.use_bias:
            output = output + self.bias
        if self.task == "binary":
            output = torch.sigmoid(output)
        return output
--------------------------------------------------------------------------------
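How these factory pieces compose: `activation_layer` builds the activation by name, and `PredictionLayer` turns a logit into a probability via a learned bias plus sigmoid. A small sketch (shapes are illustrative):

```python
import torch
import torch.nn as nn

from utils_model import PredictionLayer, activation_layer

act = activation_layer('dice', hidden_size=8, dice_dim=2)  # Dice over 8 features
head = nn.Linear(8, 1)
out = PredictionLayer(task='binary')

x = torch.rand(4, 8)
prob = out(head(act(x)))  # (4, 1) probabilities via sigmoid + learned bias
```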
/src/train_lgbm.py:
--------------------------------------------------------------------------------
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import zipfile
import shutil
import logging

import mlflow
from sklearn import metrics

import lightgbm as lgb
print('LGBM Version', lgb.__version__)

from utils import (
    seed_everything,
    Timer,
    reduce_mem_usage,
    load_from_pkl, save_as_pkl
)

logging.basicConfig(level=logging.INFO)

DEVICE = os.environ.get('DEVICE')

EXP_NAME = os.environ.get('EXP_NAME')
EXP_DIR = os.environ.get('EXP_DIR')

INPUT_DIR = os.environ.get('INPUT_DIR')
FEATURE_DIR = os.environ.get('FEATURE_DIR')
FOLD_DIR = os.environ.get('FOLD_DIR')
SUB_DIR = os.environ.get('SUB_DIR')

sys.path.append(f'{EXP_DIR}/{EXP_NAME}')
import config

FOLD_NAME = config.FOLD_NAME
FOLD_NUM = config.FOLD_NUM
RANDOM_STATE = config.RANDOM_STATE

TARGET_TASK = config.TARGET_TASK

LGB_MDOEL_PARAMS = config.LGB_MDOEL_PARAMS
LGB_TRAIN_PARAMS = config.LGB_TRAIN_PARAMS


dense_features = config.dense_features
sparse_features = config.sparse_features
varlen_sparse_features = config.varlen_sparse_features


def save_mlflow(run_id, cv, acc=None, th=None):

    mlflow.log_param("fold_name", FOLD_NAME)
    mlflow.log_param("fold_num", FOLD_NUM)

    for feat in dense_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__dense__{feat}', 1)
    for feat in sparse_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__sparse__{feat}', 1)
    for feat in varlen_sparse_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__varspa__{feat}', 1)

    mlflow.log_metric("cv", cv)
    if acc is not None:
        mlflow.log_metric("acc", acc)
    if th is not None:
        mlflow.log_metric("th", th)
    return


def train_1fold(preds, preds_test, importance, trn_idx, val_idx, X_train, y_train, X_test):

    x_trn = X_train.iloc[trn_idx]
    y_trn = y_train[trn_idx]
    x_val = X_train.iloc[val_idx]
    y_val = y_train[val_idx]

    d_train = lgb.Dataset(x_trn, label=y_trn)
    d_val = lgb.Dataset(x_val, label=y_val, reference=d_train)

    model = lgb.train(
        params=LGB_MDOEL_PARAMS,
        train_set=d_train,
        valid_sets=[d_train, d_val],
        valid_names=['train', 'valid'],
        **LGB_TRAIN_PARAMS
    )

    preds[val_idx] = model.predict(x_val)
    preds_test += model.predict(X_test)

    # accumulate gain importance across folds in the dict passed in
    fnames = model.feature_name()
    importance_fold = model.feature_importance(importance_type='gain')
    for i, f in enumerate(fnames):
        if f not in importance:
            importance[f] = 0
        importance[f] += importance_fold[i]
    return preds, preds_test, importance


if __name__ == "__main__":

    t = Timer()
    with t.timer(f'fix seed RANDOM_STATE:{RANDOM_STATE}'):
        seed_everything(RANDOM_STATE)

    with t.timer('read label'):
        data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv'
        train = pd.read_csv(data_path)
        if TARGET_TASK == '1':
            y_train = train['IsCorrect'].values
        elif TARGET_TASK == '2':
            y_train = (train['AnswerValue'] - 1).values

    skip_fr = True
    if skip_fr is False:
        with t.timer('read features'):

            X_train = pd.DataFrame()
            X_test = pd.DataFrame()
            for feat in dense_features:
                logging.info(f'[{feat}] read feature ...')
                X_train = pd.concat([
                    X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ], axis=1)
                X_test = pd.concat([
                    X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ], axis=1)
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            for feat in sparse_features:
                logging.info(f'[{feat}] read feature ...')
                X_train = pd.concat([
                    X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ], axis=1)
                X_test = pd.concat([
                    X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ], axis=1)
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)

            save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')

    elif skip_fr is True:
        X_train = load_from_pkl('X_train_task1_lgbm_t2.pkl')
        X_test = load_from_pkl('X_test_task1_lgbm_t2.pkl')

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv')

    with t.timer('train LGBM'):

        logging.info(f'Num. of Samples: {len(X_train)}')
        logging.info(f'Num. of Features: {X_train.shape[1]}')

        if TARGET_TASK == '1':
            preds = np.zeros(len(X_train))
            preds_test = np.zeros(len(X_test))
        elif TARGET_TASK == '2':
            preds = np.zeros((len(X_train), LGB_MDOEL_PARAMS['num_class']))
            preds_test = np.zeros((len(X_test), LGB_MDOEL_PARAMS['num_class']))

        importance = {}
        for fold_idx in range(FOLD_NUM):

            logging.info(f'FOLD:{fold_idx}')

            trn_idx = folds[folds.kfold != fold_idx].index.tolist()
            val_idx = folds[folds.kfold == fold_idx].index.tolist()

            preds, preds_test, importance = train_1fold(
                preds, preds_test, importance,
                trn_idx, val_idx, X_train, y_train, X_test
            )

        preds_test /= FOLD_NUM
        df_score = pd.DataFrame(
            [(i, j) for i, j in importance.items()], columns=['fname', 'importance']
        ).sort_values('importance', ascending=False)

    if not os.path.exists(f'../save/{EXP_NAME}'):
        os.mkdir(f'../save/{EXP_NAME}')

    pd.DataFrame(preds).to_csv(f'../save/{EXP_NAME}/preds_val_task{TARGET_TASK}_{run_id}.csv', index=False)
    pd.DataFrame(preds_test).to_csv(f'../save/{EXP_NAME}/preds_test_task{TARGET_TASK}_{run_id}.csv', index=False)
    df_score.to_csv(f'../save/{EXP_NAME}/lgbm_feature_importance_task{TARGET_TASK}_{run_id}.csv', index=False)

    if TARGET_TASK == '1':
        with t.timer('postprocess, threshold'):
            rows = []
            for th in tqdm(range(40, 60, 1)):
                th = th * 0.01
                preds_th = []
                for i in preds:
                    if i > th:
                        preds_th.append(1)
                    else:
                        preds_th.append(0)
                acc = metrics.accuracy_score(y_train, preds_th)
                rows.append([th, acc])
            acc_th = pd.DataFrame(rows, columns=['th', 'acc'])

            cv = metrics.roc_auc_score(y_train, preds)
            scores = acc_th.sort_values('acc', ascending=False).head(1).values[0]
            best_th, best_acc = scores[0], scores[1]
            logging.info(f'Val AUC: {cv}')
            logging.info(f'Val Best Acc: {best_acc} (threshold: {best_th})')

        with t.timer('make submission'):
            preds_test_th = []
            for i in preds_test:
                if i > best_th:
                    preds_test_th.append(1)
                else:
                    preds_test_th.append(0)

            test_data_path = '../submission_templates/submission_task_1_2.csv'
            sub = pd.read_csv(test_data_path)
            sub['IsCorrect'] = preds_test_th

            if not os.path.exists(f'../submission/{EXP_NAME}'):
                os.mkdir(f'../submission/{EXP_NAME}')

            sub_name = f'submission_task1__auc{cv}__acc{best_acc}__th{best_th}'
            valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}'
            if not os.path.exists(valid_sub_dir):
                os.mkdir(valid_sub_dir)

            sub.to_csv(f'{valid_sub_dir}/submission_task_1.csv', index=False)
            with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                new_zip.write(f'{valid_sub_dir}/submission_task_1.csv', arcname='submission_task_1.csv')
            shutil.rmtree(valid_sub_dir)
            mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip')

            submission_path = f'../submission/{EXP_NAME}/submission_task1__auc{cv}__acc{best_acc}__th{best_th}.csv'
            sub.to_csv(submission_path, index=False)

    elif TARGET_TASK == '2':

        preds_label = np.argmax(preds, axis=1)
        preds_test_label = np.argmax(preds_test, axis=1)
        cv = metrics.accuracy_score(y_train, preds_label)
        logging.info(f'CV Multiclass Acc: {cv}')
        preds_label = preds_label + 1
        preds_test_label = preds_test_label + 1

        with t.timer('make submission'):
            test_data_path = '../submission_templates/submission_task_1_2.csv'
            sub = pd.read_csv(test_data_path)
            sub['AnswerValue'] = preds_test_label

            if not os.path.exists(f'{SUB_DIR}/{EXP_NAME}'):
                os.mkdir(f'{SUB_DIR}/{EXP_NAME}')

            sub_name = f'submission_task2__acc{cv}'
            valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}'
            if not os.path.exists(valid_sub_dir):
                os.mkdir(valid_sub_dir)

            sub.to_csv(f'{valid_sub_dir}/submission_task_2.csv', index=False)
            with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                new_zip.write(f'{valid_sub_dir}/submission_task_2.csv', arcname='submission_task_2.csv')
            shutil.rmtree(valid_sub_dir)
            mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip')

    if TARGET_TASK == '1':
        save_mlflow(run_id, cv, best_acc, best_th)
    elif TARGET_TASK == '2':
        save_mlflow(run_id, cv)
    mlflow.end_run()
--------------------------------------------------------------------------------
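All training scripts read features via the same file contract: one single-column dataframe per feature, saved as `{FEATURE_DIR}/{name}_train.feather` and `{name}_test.feather`, row-aligned with the train table and the submission template. A sketch of producing one such pair; the feature itself is illustrative, and it assumes the submission template carries a `UserId` column:

```python
import os

import pandas as pd

FEATURE_DIR = os.environ.get('FEATURE_DIR', '../features')
INPUT_DIR = os.environ.get('INPUT_DIR', '../data')

train = pd.read_csv(f'{INPUT_DIR}/train_data/train_task_1_2.csv')
test = pd.read_csv('../submission_templates/submission_task_1_2.csv')

# example feature: per-user answer count, mapped onto both tables
counts = train['UserId'].value_counts()
feat = 'user_answer_count'
for df, split in [(train, 'train'), (test, 'test')]:
    out = df[['UserId']].copy()
    out[feat] = out['UserId'].map(counts).fillna(0).astype('int32')
    out[[feat]].reset_index(drop=True).to_feather(f'{FEATURE_DIR}/{feat}_{split}.feather')
```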
/src/train_xgb.py:
--------------------------------------------------------------------------------
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import zipfile
import shutil
import logging

import mlflow
from sklearn import metrics

import xgboost as xgb
print('XGB Version', xgb.__version__)

from utils import (
    seed_everything,
    Timer,
    reduce_mem_usage,
    load_from_pkl, save_as_pkl
)

logging.basicConfig(level=logging.INFO)

DEVICE = os.environ.get('DEVICE')

EXP_NAME = os.environ.get('EXP_NAME')
EXP_DIR = os.environ.get('EXP_DIR')

INPUT_DIR = os.environ.get('INPUT_DIR')
FEATURE_DIR = os.environ.get('FEATURE_DIR')
FOLD_DIR = os.environ.get('FOLD_DIR')
SUB_DIR = os.environ.get('SUB_DIR')

sys.path.append(f'{EXP_DIR}/{EXP_NAME}')
import config

FOLD_NAME = config.FOLD_NAME
FOLD_NUM = config.FOLD_NUM
RANDOM_STATE = config.RANDOM_STATE

TARGET_TASK = config.TARGET_TASK

XGB_MDOEL_PARAMS = config.XGB_MDOEL_PARAMS
# DEVICE must look like 'cuda:<id>' (set in run/solution.sh)
XGB_MDOEL_PARAMS['gpu_id'] = int(DEVICE.split(':')[1])
XGB_TRAIN_PARAMS = config.XGB_TRAIN_PARAMS

dense_features = config.dense_features
sparse_features = config.sparse_features
varlen_sparse_features = config.varlen_sparse_features


def save_mlflow(run_id, cv, acc=None, th=None):

    mlflow.log_param("fold_name", FOLD_NAME)
    mlflow.log_param("fold_num", FOLD_NUM)

    for feat in dense_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__dense__{feat}', 1)
    for feat in sparse_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__sparse__{feat}', 1)
    for feat in varlen_sparse_features:
        feat = feat.replace('#', '')
        mlflow.log_param(f'f__varspa__{feat}', 1)

    mlflow.log_metric("cv", cv)
    if acc is not None:
        mlflow.log_metric("acc", acc)
    if th is not None:
        mlflow.log_metric("th", th)
    return


def train_1fold(preds, preds_test, importance, trn_idx, val_idx, X_train, y_train, d_test):

    x_trn = X_train.iloc[trn_idx]
    y_trn = y_train[trn_idx]
    x_val = X_train.iloc[val_idx]
    y_val = y_train[val_idx]

    d_train = xgb.DMatrix(data=x_trn, label=y_trn)
    d_val = xgb.DMatrix(data=x_val, label=y_val)

    model = xgb.train(
        XGB_MDOEL_PARAMS,
        dtrain=d_train,
        evals=[(d_train, "train"), (d_val, "val")],
        **XGB_TRAIN_PARAMS
    )
    preds[val_idx] = model.predict(d_val)
    preds_test += model.predict(d_test)

    # accumulate normalized weight importance across folds
    importance_fold = model.get_score(importance_type='weight')
    sum_imp = sum(importance_fold.values())
    for f, s in importance_fold.items():
        if f not in importance:
            importance[f] = 0
        importance[f] += s / sum_imp
    return preds, preds_test, importance


if __name__ == "__main__":

    t = Timer()
    with t.timer(f'fix seed RANDOM_STATE:{RANDOM_STATE}'):
        seed_everything(RANDOM_STATE)

    with t.timer('read label'):
        data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv'
        train = pd.read_csv(data_path)
        if TARGET_TASK == '1':
            y_train = train['IsCorrect'].values
        elif TARGET_TASK == '2':
            y_train = (train['AnswerValue'] - 1).values

    skip_fr = False
    if skip_fr is False:
        with t.timer('read features'):

            X_train = pd.DataFrame()
            X_test = pd.DataFrame()
            for feat in dense_features:
                logging.info(f'[{feat}] read feature ...')
                X_train = pd.concat([
                    X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ], axis=1)
                X_test = pd.concat([
                    X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ], axis=1)
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            for feat in sparse_features:
                logging.info(f'[{feat}] read feature ...')
                X_train = pd.concat([
                    X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ], axis=1)
                X_test = pd.concat([
                    X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ], axis=1)
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')
    elif skip_fr is True:
        X_train = load_from_pkl('X_train_task1_xgb_fs100_meta_.pkl')
        X_test = load_from_pkl('X_test_task1_xgb_fs100_meta_.pkl')

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv')

    with t.timer('train XGB'):

        logging.info(f'Num. of Samples: {len(X_train)}')
        logging.info(f'Num. of Features: {X_train.shape[1]}')

        if TARGET_TASK == '1':
            preds = np.zeros(len(X_train))
            preds_test = np.zeros(len(X_test))
        elif TARGET_TASK == '2':
            preds = np.zeros((len(X_train), XGB_MDOEL_PARAMS['num_class']))
            preds_test = np.zeros((len(X_test), XGB_MDOEL_PARAMS['num_class']))

        d_test = xgb.DMatrix(data=X_test)

        importance = {}
        for fold_idx in range(FOLD_NUM):

            logging.info(f'FOLD:{fold_idx}')

            trn_idx = folds[folds.kfold != fold_idx].index.tolist()
            val_idx = folds[folds.kfold == fold_idx].index.tolist()

            preds, preds_test, importance = train_1fold(
                preds, preds_test, importance,
                trn_idx, val_idx, X_train, y_train, d_test
            )

        preds_test /= FOLD_NUM
        df_score = pd.DataFrame(
            [(i, j) for i, j in importance.items()], columns=['fname', 'importance']
        ).sort_values('importance', ascending=False)

    if not os.path.exists(f'../save/{EXP_NAME}'):
        os.mkdir(f'../save/{EXP_NAME}')

    pd.DataFrame(preds).to_csv(f'../save/{EXP_NAME}/preds_val_task{TARGET_TASK}_{run_id}.csv', index=False)
    pd.DataFrame(preds_test).to_csv(f'../save/{EXP_NAME}/preds_test_task{TARGET_TASK}_{run_id}.csv', index=False)
    df_score.to_csv(f'../save/{EXP_NAME}/xgb_feature_importance_task{TARGET_TASK}_{run_id}.csv', index=False)

    if TARGET_TASK == '1':
        with t.timer('postprocess, threshold'):
            rows = []
            for th in tqdm(range(490, 510, 1)):
                th = th * 0.001
                preds_th = []
                for i in preds:
                    if i > th:
                        preds_th.append(1)
                    else:
                        preds_th.append(0)
                acc = metrics.accuracy_score(y_train, preds_th)
                rows.append([th, acc])
            acc_th = pd.DataFrame(rows, columns=['th', 'acc'])

            cv = metrics.roc_auc_score(y_train, preds)
            scores = acc_th.sort_values('acc', ascending=False).head(1).values[0]
            best_th, best_acc = scores[0], scores[1]
            logging.info(f'Val AUC: {cv}')
            logging.info(f'Val Best Acc: {best_acc} (threshold: {best_th})')

        with t.timer('make submission'):
            preds_test_th = []
            for i in preds_test:
                if i > best_th:
                    preds_test_th.append(1)
                else:
                    preds_test_th.append(0)

            test_data_path = '../submission_templates/submission_task_1_2.csv'
            sub = pd.read_csv(test_data_path)
            sub['IsCorrect'] = preds_test_th

            if not os.path.exists(f'../submission/{EXP_NAME}'):
                os.mkdir(f'../submission/{EXP_NAME}')

            sub_name = f'submission_task1__auc{cv}__acc{best_acc}__th{best_th}'
            valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}'
            if not os.path.exists(valid_sub_dir):
                os.mkdir(valid_sub_dir)

            sub.to_csv(f'{valid_sub_dir}/submission_task_1.csv', index=False)
            with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                new_zip.write(f'{valid_sub_dir}/submission_task_1.csv', arcname='submission_task_1.csv')
            shutil.rmtree(valid_sub_dir)
            mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip')

            submission_path = f'../submission/{EXP_NAME}/submission_task1__auc{cv}__acc{best_acc}__th{best_th}.csv'
            sub.to_csv(submission_path, index=False)

    elif TARGET_TASK == '2':

        preds_label = np.argmax(preds, axis=1) + 1
        preds_test_label = np.argmax(preds_test, axis=1) + 1
        cv = metrics.accuracy_score(y_train + 1, preds_label)
        logging.info(f'CV Multiclass Acc: {cv}')

        with t.timer('make submission'):
            test_data_path = '../submission_templates/submission_task_1_2.csv'
            sub = pd.read_csv(test_data_path)
            sub['AnswerValue'] = preds_test_label

            if not os.path.exists(f'{SUB_DIR}/{EXP_NAME}'):
                os.mkdir(f'{SUB_DIR}/{EXP_NAME}')

            sub_name = f'submission_task2__acc{cv}'
            valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}'
            if not os.path.exists(valid_sub_dir):
                os.mkdir(valid_sub_dir)

            sub.to_csv(f'{valid_sub_dir}/submission_task_2.csv', index=False)
            with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                new_zip.write(f'{valid_sub_dir}/submission_task_2.csv', arcname='submission_task_2.csv')
            shutil.rmtree(valid_sub_dir)
            mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip')

    if TARGET_TASK == '1':
        save_mlflow(run_id, cv, best_acc, best_th)
    elif TARGET_TASK == '2':
        save_mlflow(run_id, cv)
    mlflow.end_run()
--------------------------------------------------------------------------------
('task2_cat_fs100', '5f4c75b620ef4eccbb14581589340db6'), # cat depth 8, fs100, te_smooth5 56 | ] 57 | 58 | meta_pred_features = [] 59 | for ename, run_id in exp_list_t1: 60 | meta_pred_features.append(f't1_{ename}_{run_id}') 61 | for ename, run_id in exp_list_t2: 62 | for idx in range(4): 63 | meta_pred_features.append(f't2_{ename}_{run_id}_{idx}') 64 | 65 | meta_agg_t1 = [ 66 | 'preds_t1_mean_0', 67 | 'preds_t1_std_0', 68 | 'preds_t1_max_0', 69 | 'preds_t1_min_0', 70 | 'preds_t1_diff_0', 71 | ] 72 | 73 | meta_agg_t2 = [ 74 | 'preds_t2_mean_0', 75 | 'preds_t2_mean_1', 76 | 'preds_t2_mean_2', 77 | 'preds_t2_mean_3', 78 | 'preds_t2_mean_max_0', 79 | 'preds_t2_mean_min_0', 80 | 'preds_t2_std_0', 81 | 'preds_t2_std_1', 82 | 'preds_t2_std_2', 83 | 'preds_t2_std_3', 84 | 'preds_t2_std_sum_0', 85 | 'preds_t2_max_0', 86 | 'preds_t2_max_1', 87 | 'preds_t2_max_2', 88 | 'preds_t2_max_3', 89 | 'preds_t2_min_0', 90 | 'preds_t2_min_1', 91 | 'preds_t2_min_2', 92 | 'preds_t2_min_3', 93 | ] 94 | 95 | prediction_lag_target = [ 96 | 'preds_t1_mean_0', 97 | 'preds_t1_std_0', 98 | 'preds_t2_mean_max_0', 99 | 'preds_t2_mean_min_0', 100 | 'preds_t2_std_sum_0', 101 | ] 102 | 103 | meta_mul = [] 104 | for i in range(4): 105 | meta_mul.append(f'preds_t1_mean_mul_t2_mean_{i}') 106 | meta_mul.append(f'preds_t1_std_mul_t2_std_{i}') 107 | meta_mul.append(f'preds_t1_std_mul_t2_mean_{i}') 108 | 109 | pred_lag_features = [] 110 | for i in prediction_lag_target: 111 | for j in ['prev', 'diff', 'cumsum']: 112 | pred_lag_features.append(f'{i}_{j}') 113 | 114 | pred_eval_metric_features = [] 115 | for cat in ['UserId', 'QuestionId']: 116 | pred_eval_metric_features.append(f'pred_{cat}_t1_auc') 117 | pred_eval_metric_features.append(f'pred_{cat}_t2_acc') 118 | 119 | dense_features = [] 120 | dense_features += meta_pred_features 121 | dense_features += meta_agg_t1 122 | dense_features += meta_agg_t2 123 | dense_features += meta_mul 124 | 125 | 126 | def add_preds(results, TARGET_TASK, EXP_NAME, run_id): 127 | oof = pd.read_csv(f'../save/{EXP_NAME}/preds_val_task{TARGET_TASK}_{run_id}.csv') 128 | test_preds = pd.read_csv(f'../save/{EXP_NAME}/preds_test_task{TARGET_TASK}_{run_id}.csv') 129 | results[f't{TARGET_TASK}_{EXP_NAME}_{run_id}'] = {} 130 | results[f't{TARGET_TASK}_{EXP_NAME}_{run_id}']['oof'] = oof 131 | results[f't{TARGET_TASK}_{EXP_NAME}_{run_id}']['test_preds'] = test_preds 132 | return results 133 | 134 | 135 | def save_as_feather(feat, X_train, X_test): 136 | X_train[[feat]].reset_index(drop=True).to_feather(f'{FEATURE_DIR}/{feat}_train.feather') 137 | X_test[[feat]].reset_index(drop=True).to_feather(f'{FEATURE_DIR}/{feat}_test.feather') 138 | return 139 | 140 | 141 | if __name__ == "__main__": 142 | 143 | t = Timer() 144 | with t.timer(f'fix seed RANDOM_STATE:{RANDOM_STATE}'): 145 | seed_everything(RANDOM_STATE) 146 | 147 | with t.timer(f'read data'): 148 | data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv' 149 | answer_path = f'{INPUT_DIR}/metadata/answer_metadata_task_1_2.csv' 150 | test_data_path = f'../submission_templates/submission_task_1_2.csv' 151 | 152 | train = pd.read_csv(data_path) 153 | test = pd.read_csv(test_data_path) 154 | answer = pd.read_csv(answer_path) 155 | 156 | with t.timer(f'preprocess'): 157 | 158 | answer['AnswerId'] = answer['AnswerId'].fillna(-1).astype('int32') 159 | answer['Confidence'] = answer['Confidence'].fillna(-1).astype('int8') 160 | answer['SchemeOfWorkId'] = answer['SchemeOfWorkId'].fillna(-1).astype('int16') 161 | answer = factorize_dataAnswered(answer) 162 | 163 | 
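# Join the answer metadata onto the train/test rows, then turn each base model's OOF/test predictions into meta-features: raw per-model columns plus mean/std/max/min and max-min (diff) aggregates across models.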
with t.timer(f'merge dataframe'): 164 | 165 | df = pd.merge(train, answer, on='AnswerId', how='left') 166 | df_test = pd.merge(test, answer, on='AnswerId', how='left') 167 | 168 | with t.timer(f'read prediction'): 169 | 170 | results_t1 = {} 171 | for ename, run_id in exp_list_t1: 172 | results_t1 = add_preds(results_t1, TARGET_TASK='1', EXP_NAME=ename, run_id=run_id) 173 | 174 | results_t2 = {} 175 | for ename, run_id in exp_list_t2: 176 | results_t2 = add_preds(results_t2, TARGET_TASK='2', EXP_NAME=ename, run_id=run_id) 177 | 178 | with t.timer(f'prediction agg'): 179 | 180 | for ename, run_id in exp_list_t1: 181 | df[f't1_{ename}_{run_id}'] = results_t1[f't1_{ename}_{run_id}']['oof'] 182 | df_test[f't1_{ename}_{run_id}'] = results_t1[f't1_{ename}_{run_id}']['test_preds'] 183 | 184 | rdf_oof = pd.DataFrame() 185 | rdf_test = pd.DataFrame() 186 | for ename, run_id in exp_list_t1: 187 | rdf_oof = pd.concat([rdf_oof, results_t1[f't1_{ename}_{run_id}']['oof']], axis=1) 188 | rdf_test = pd.concat([rdf_test, results_t1[f't1_{ename}_{run_id}']['test_preds']], axis=1) 189 | 190 | meta_oofs_t1 = pd.DataFrame() 191 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.mean(axis=1)).add_prefix('preds_t1_mean_')], axis=1) 192 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.std(axis=1)).add_prefix('preds_t1_std_')], axis=1) 193 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.max(axis=1)).add_prefix('preds_t1_max_')], axis=1) 194 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.min(axis=1)).add_prefix('preds_t1_min_')], axis=1) 195 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.max(axis=1) - rdf_oof.min(axis=1)).add_prefix('preds_t1_diff_')], axis=1) 196 | 197 | meta_tests_t1 = pd.DataFrame() 198 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.mean(axis=1)).add_prefix('preds_t1_mean_')], axis=1) 199 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.std(axis=1)).add_prefix('preds_t1_std_')], axis=1) 200 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.max(axis=1)).add_prefix('preds_t1_max_')], axis=1) 201 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.min(axis=1)).add_prefix('preds_t1_min_')], axis=1) 202 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.max(axis=1) - rdf_test.min(axis=1)).add_prefix('preds_t1_diff_')], axis=1) 203 | 204 | df = pd.concat([df, meta_oofs_t1], axis=1) 205 | df_test = pd.concat([df_test, meta_tests_t1], axis=1) 206 | 207 | oofs = np.zeros((len(rdf_oof), 4, len(exp_list_t2))) 208 | tests = np.zeros((len(rdf_test), 4, len(exp_list_t2))) 209 | for eidx, (ename, run_id) in enumerate(exp_list_t2): 210 | oofs[:, :, eidx] = results_t2[f't2_{ename}_{run_id}']['oof'].values 211 | tests[:, :, eidx] = results_t2[f't2_{ename}_{run_id}']['test_preds'].values 212 | 213 | for eidx, (ename, run_id) in enumerate(exp_list_t2): 214 | df = pd.concat([df, pd.DataFrame(oofs[:, :, eidx]).add_prefix(f't2_{ename}_{run_id}_')], axis=1) 215 | df_test = pd.concat([df_test, pd.DataFrame(tests[:, :, eidx]).add_prefix(f't2_{ename}_{run_id}_')], axis=1) 216 | 217 | meta_oofs_t2 = pd.DataFrame() 218 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.mean(axis=2)).add_prefix('preds_t2_mean_')], axis=1) 219 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.mean(axis=2).max(axis=1)).add_prefix('preds_t2_mean_max_')], axis=1) 220 | meta_oofs_t2 = pd.concat([meta_oofs_t2, 
pd.DataFrame(oofs.mean(axis=2).min(axis=1)).add_prefix('preds_t2_mean_min_')], axis=1) 221 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.std(axis=2)).add_prefix('preds_t2_std_')], axis=1) 222 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.std(axis=2).sum(axis=1)).add_prefix('preds_t2_std_sum_')], axis=1) 223 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.max(axis=2)).add_prefix('preds_t2_max_')], axis=1) 224 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.min(axis=2)).add_prefix('preds_t2_min_')], axis=1) 225 | 226 | meta_tests_t2 = pd.DataFrame() 227 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.mean(axis=2)).add_prefix('preds_t2_mean_')], axis=1) 228 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.mean(axis=2).max(axis=1)).add_prefix('preds_t2_mean_max_')], axis=1) 229 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.mean(axis=2).min(axis=1)).add_prefix('preds_t2_mean_min_')], axis=1) 230 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.std(axis=2)).add_prefix('preds_t2_std_')], axis=1) 231 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.std(axis=2).sum(axis=1)).add_prefix('preds_t2_std_sum_')], axis=1) 232 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.max(axis=2)).add_prefix('preds_t2_max_')], axis=1) 233 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.min(axis=2)).add_prefix('preds_t2_min_')], axis=1) 234 | 235 | df = pd.concat([df, meta_oofs_t2], axis=1) 236 | df_test = pd.concat([df_test, meta_tests_t2], axis=1) 237 | 238 | with t.timer(f'prediction mul'): 239 | for i in range(4): 240 | df[f'preds_t1_mean_mul_t2_mean_{i}'] = df['preds_t1_mean_0'].mul(df[f'preds_t2_mean_{i}']) 241 | df[f'preds_t1_std_mul_t2_std_{i}'] = df['preds_t1_std_0'].mul(df[f'preds_t2_std_{i}']) 242 | df[f'preds_t1_std_mul_t2_mean_{i}'] = df['preds_t1_std_0'].mul(df[f'preds_t2_mean_{i}']) 243 | df_test[f'preds_t1_mean_mul_t2_mean_{i}'] = df_test['preds_t1_mean_0'].mul(df_test[f'preds_t2_mean_{i}']) 244 | df_test[f'preds_t1_std_mul_t2_std_{i}'] = df_test['preds_t1_std_0'].mul(df_test[f'preds_t2_std_{i}']) 245 | df_test[f'preds_t1_std_mul_t2_mean_{i}'] = df_test['preds_t1_std_0'].mul(df_test[f'preds_t2_mean_{i}']) 246 | 247 | 248 | with t.timer('save dense features'): 249 | for feat in dense_features: 250 | logging.info(f'save {feat}') 251 | save_as_feather(feat, df, df_test) -------------------------------------------------------------------------------- /src/utils_feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.preprocessing import LabelEncoder 4 | from math import ceil 5 | from tqdm import tqdm 6 | import datetime 7 | import cupy 8 | import cudf 9 | from sklearn.model_selection import KFold 10 | from sklearn.decomposition import TruncatedSVD 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | 13 | 14 | def reduce_svd(features, n_components): 15 | dr_model = TruncatedSVD(n_components=n_components, random_state=46) 16 | features_dr = dr_model.fit_transform(features) 17 | return features_dr 18 | 19 | 20 | def tfidf_reduce_svd(sentences, n_gram, n_components): 21 | tfv = TfidfVectorizer(min_df=1, max_features=None, 22 | strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b', 23 | ngram_range=(1, n_gram), use_idf=1, smooth_idf=1, sublinear_tf=1) 24 | dr_model = TruncatedSVD(n_components=n_components, random_state=46) 25 
| 26 | tfidf_col = tfv.fit_transform(sentences) 27 | dr_col = dr_model.fit_transform(tfidf_col) 28 | return dr_col 29 | 30 | 31 | def question_subject_svd(question, subject_id_list, n_conponents): 32 | subject_oht = np.zeros((len(question), len(subject_id_list)), dtype='int8') 33 | for i, slist in tqdm(enumerate(question['SubjectId'].values), total=len(question)): 34 | for sid in slist: 35 | j = subject_id_list.index(sid) 36 | subject_oht[i, j] += 1 37 | 38 | ques_subj_svd = reduce_svd(subject_oht, n_conponents) 39 | ques_subj_svd = pd.DataFrame(ques_subj_svd).add_prefix('ques_subj_svd_') 40 | ques_subj_svd['QuestionId'] = question['QuestionId'].values 41 | return ques_subj_svd 42 | 43 | 44 | def user_subject_tfidfsvd(df, n_conponents): 45 | uids = [] 46 | u_subjects = [] 47 | for uid, udf in tqdm(df[['UserId', 'SubjectId']].groupby('UserId'), total=df['UserId'].nunique()): 48 | u_subject = [] 49 | for x in udf['SubjectId'].values: 50 | u_subject.extend(x) 51 | uids.append(uid) 52 | u_subjects.append(u_subject) 53 | u_subjects = [' '.join(map(str, x)) for x in u_subjects] 54 | user_subj_svd = tfidf_reduce_svd(u_subjects, n_gram=1, n_components=n_conponents) 55 | user_subj_svd = pd.DataFrame(user_subj_svd).add_prefix('user_subj_svd_') 56 | user_subj_svd['UserId'] = uids 57 | return user_subj_svd 58 | 59 | 60 | 61 | def target_encode_cudf_v3(train, valid, col, tar, n_folds=5, min_ct=0, smooth=20, 62 | seed=42, shuffle=False, t2=None, v2=None, x=-1): 63 | ''' 64 | ref: https://github.com/rapidsai/deeplearning/blob/main/RecSys2020/02_ModelsCompetition/XGBoost3/XGBoost3.ipynb 65 | ''' 66 | # 67 | # col = column to target encode (or, if a list of columns, a multi-column groupby) 68 | # tar = target column to encode against 69 | # if min_ct > 0, all classes with <= min_ct occurrences are merged into a new class "other" 70 | # smooth = Bayesian smoothing parameter 71 | # seed = random seed for the KFold split when shuffle==True 72 | # if x == -1, the result is appended to train and valid 73 | # if x >= 0, the result is written into column x of t2 and v2 74 | # 75 | # SINGLE OR MULTIPLE COLUMN 76 | if not isinstance(col, list): 77 | col = [col] 78 | if (min_ct > 0) & (len(col) > 1): 79 | print('WARNING: Setting min_ct=0 with multiple columns. Not implemented') 80 | min_ct = 0 81 | name = "_".join(col) 82 | 83 | # FIT ALL TRAIN 84 | gf = cudf.from_pandas(train[col + [tar]]).reset_index(drop=True) 85 | gf['idx'] = gf.index # needed because cuDF merge returns out of order 86 | if min_ct > 0: # USE MIN_CT? 
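# bucket categories that appear <= min_ct times into a single "other" class (encoded as -1); the original column values are restored once the encoding is computed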
87 | other = gf.groupby(col[0]).size() 88 | other = other[other <= min_ct].index 89 | save = gf[col[0]].values.copy() 90 | gf.loc[gf[col[0]].isin(other), col[0]] = -1 91 | te = gf.groupby(col)[[tar]].agg(['mean', 'count']).reset_index() 92 | te.columns = col + ['m', 'c'] 93 | mn = gf[tar].mean().astype('float32') 94 | te['smooth'] = ((te['m'] * te['c']) + (mn * smooth)) / (te['c'] + smooth) 95 | if min_ct > 0: 96 | gf[col[0]] = save.copy() 97 | 98 | # PREDICT VALID 99 | gf2 = cudf.from_pandas(valid[col]).reset_index(drop=True) 100 | gf2['idx'] = gf2.index 101 | if min_ct > 0: 102 | gf2.loc[gf2[col[0]].isin(other), col[0]] = -1 103 | gf2 = gf2.merge(te[col + ['smooth']], on=col, how='left', sort=False).sort_values('idx') 104 | if x == -1: 105 | valid[f'TE_s{smooth}_{name}_{tar}'] = gf2['smooth'].fillna(mn).astype('float32').to_array() 106 | elif x >= 0: 107 | v2[:, x] = gf2['smooth'].fillna(mn).astype('float32').to_array() 108 | 109 | # KFOLD ON TRAIN 110 | tmp = cupy.zeros((train.shape[0]), dtype='float32') 111 | gf['fold'] = 0 112 | if shuffle: # shuffling is 2x slower 113 | kf = KFold(n_folds, random_state=seed, shuffle=shuffle) 114 | for k, (idxT, idxV) in enumerate(kf.split(train)): 115 | gf.loc[idxV, 'fold'] = k 116 | else: 117 | fsize = train.shape[0] // n_folds 118 | gf['fold'] = cupy.clip(gf.idx.values // fsize, 0, n_folds - 1) 119 | for k in range(n_folds): 120 | if min_ct > 0: # USE MIN CT? 121 | if k < n_folds - 1: 122 | save = gf[col[0]].values.copy() 123 | other = gf.loc[gf.fold != k].groupby(col[0]).size() 124 | other = other[other <= min_ct].index 125 | gf.loc[gf[col[0]].isin(other), col[0]] = -1 126 | te = gf.loc[gf.fold != k].groupby(col)[[tar]].agg(['mean', 'count']).reset_index() 127 | te.columns = col + ['m', 'c'] 128 | mn = gf.loc[gf.fold != k, tar].mean().astype('float32') 129 | te['smooth'] = ((te['m'] * te['c']) + (mn * smooth)) / (te['c'] + smooth) 130 | gf = gf.merge(te[col + ['smooth']], on=col, how='left', sort=False).sort_values('idx') 131 | tmp[(gf.fold.values == k)] = gf.loc[gf.fold == k, 'smooth'].fillna(mn).astype('float32').values 132 | gf.drop_column('smooth') 133 | if (min_ct > 0) & (k < n_folds - 1): 134 | gf[col[0]] = save.copy() 135 | if x == -1: 136 | train[f'TE_s{smooth}_{name}_{tar}'] = cupy.asnumpy(tmp.astype('float32')) 137 | elif x >= 0: 138 | t2[:, x] = cupy.asnumpy(tmp.astype('float32')) 139 | return train, valid 140 | 141 | 142 | def extract_age(df): 143 | # Diagnostic Questions: The NeurIPS 2020 Education Challenge 144 | # roughly between 7 and 18 years old [3] 145 | age_days = [] 146 | age_years = [] 147 | pri_to_high_stu = [] 148 | for i, j, k in tqdm(df[['DateAnswered_dt', 'DateOfBirth_dt', 'DateOfBirth_NaN']].values, total=len(df)): 149 | if j.year <= 2011 and j.year >= 2000: 150 | age = i - j 151 | days = age.days 152 | years = int(days / 365) 153 | if years >= 7 and years <= 18: 154 | age_years.append(years) 155 | age_days.append(days) 156 | pri_to_high_stu.append(True) 157 | elif years < 7: 158 | age_years.append(7) 159 | age_days.append(7 * 365) 160 | pri_to_high_stu.append(False) 161 | elif years > 18: 162 | age_years.append(18) 163 | age_days.append(18 * 365) 164 | pri_to_high_stu.append(False) 165 | elif j.year > 2011: 166 | age = i - datetime.datetime(2011, 1, 1, 0, 0) 167 | days = age.days 168 | years = int(days / 365) 169 | age_days.append(days) 170 | age_years.append(years) 171 | pri_to_high_stu.append(False) 172 | elif j.year < 2000: 173 | age = i - datetime.datetime(1999, 1, 1, 0, 0) 174 | days = age.days 175 | years = 
int(days / 365) 176 | age_days.append(days) 177 | age_years.append(years) 178 | pri_to_high_stu.append(False) 179 | df['age_days'] = age_days 180 | df['age_years'] = age_years 181 | df['pri_to_high_stu'] = pri_to_high_stu 182 | return df 183 | 184 | 185 | def preprocess_subject(subject): 186 | subject['ParentId'] = subject['ParentId'].fillna(0.0).astype(int) 187 | 188 | def children_num(x): 189 | return len(children[x]) 190 | 191 | children = {} 192 | for i, j in subject[['SubjectId', 'ParentId']].values: 193 | if j not in children: 194 | children[j] = [] 195 | if i not in children: 196 | children[i] = [] 197 | children[j].append(i) 198 | 199 | parent_dict = {} 200 | for i, j in subject[['SubjectId', 'ParentId']].values: 201 | parent_dict[i] = j 202 | 203 | subject['SubjectId_cnum'] = subject['SubjectId'].apply(children_num) 204 | subject['Level__SubjectId_cnum'] = subject['Level'].astype(str) + '_' + subject['SubjectId_cnum'].astype(str) 205 | 206 | sid_w_parents = [] 207 | for i in subject['SubjectId'].values: 208 | x = i 209 | parents = [x] 210 | while parent_dict[x] != 0: 211 | parents.append(parent_dict[x]) 212 | x = parent_dict[x] 213 | sid_w_parents.append(parents) 214 | subject['SubjectId_with_parents'] = sid_w_parents 215 | 216 | return subject 217 | 218 | 219 | def preprocess_DateOfBirth(student): 220 | 221 | # compute the median birth date over the non-null entries 222 | timestamp = [] 223 | for x in tqdm(student['DateOfBirth'].values, total=len(student)): 224 | if not isinstance(x, str): 225 | continue 226 | else: 227 | timestamp.append(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.000').timestamp()) 228 | median_dt = pd.Series(timestamp).median() 229 | median_dt = datetime.datetime.fromtimestamp(median_dt) 230 | 231 | # fill missing birth dates with the median 232 | timestamp = [] 233 | for x in tqdm(student['DateOfBirth'].values, total=len(student)): 234 | if not isinstance(x, str): 235 | timestamp.append(median_dt) 236 | else: 237 | timestamp.append(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.000')) 238 | 239 | student['DateOfBirth_dt'] = timestamp 240 | return student 241 | 242 | 243 | def week_of_month(dt): 244 | """ Returns the week of the month for the specified date. 
245 | """ 246 | first_day = dt.replace(day=1) 247 | 248 | dom = dt.day 249 | adjusted_dom = dom + first_day.weekday() 250 | 251 | return int(ceil(adjusted_dom / 7.0)) 252 | 253 | 254 | def factorize_dataAnswered(answer): 255 | timestamp = [] 256 | for x in tqdm(answer['DateAnswered'].values, total=len(answer)): 257 | timestamp.append(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.000')) 258 | 259 | weekday = [ts.weekday() for ts in timestamp] 260 | hour = [ts.hour for ts in timestamp] 261 | day = [ts.day for ts in timestamp] 262 | wom = [week_of_month(ts) for ts in timestamp] 263 | 264 | answer['DateAnswered_dt'] = timestamp 265 | answer['DateAnswered_weekday'] = weekday 266 | answer['DateAnswered_hour'] = hour 267 | answer['DateAnswered_day'] = day 268 | answer['DateAnswered_wom'] = wom 269 | return answer 270 | 271 | 272 | def label_encording(colname, X_train, X_test): 273 | lbe = LabelEncoder() 274 | tmp = pd.concat([ 275 | X_train[colname], X_test[colname]]) 276 | lbe.fit(tmp) 277 | X_train[colname] = lbe.transform(X_train[colname]) 278 | X_test[colname] = lbe.transform(X_test[colname]) 279 | return X_train, X_test, len(lbe.classes_) 280 | 281 | 282 | def subject2level(subject, df, df_test): 283 | 284 | subject2level_dic = {i: j for i, j in subject[['SubjectId', 'Level']].values} 285 | subject_levels = [] 286 | for i in df['SubjectId'].values: 287 | row = [] 288 | for s in i: 289 | row.append(subject2level_dic[s]) 290 | subject_levels.append(row) 291 | df['SubjectId_level'] = subject_levels 292 | 293 | subject_levels = [] 294 | for i in df_test['SubjectId'].values: 295 | row = [] 296 | for s in i: 297 | row.append(subject2level_dic[s]) 298 | subject_levels.append(row) 299 | df_test['SubjectId_level'] = subject_levels 300 | 301 | return df, df_test 302 | 303 | 304 | def map_varlen(x, dict): 305 | return [dict[i] for i in x] 306 | 307 | 308 | def varlen_label_encording(colname, X_train, X_test): 309 | 310 | entities = [] 311 | for x in X_train[colname].values: 312 | entities.extend(x) 313 | for x in X_test[colname].values: 314 | entities.extend(x) 315 | entities = pd.Series(entities) 316 | 317 | count = entities.value_counts() 318 | 319 | key2index = {} # null: 0, ... 320 | idx = 1 321 | for i in count.index: 322 | key2index[i] = idx 323 | idx += 1 324 | unique_num = idx 325 | 326 | X_train[colname] = X_train[colname].apply(map_varlen, dict=key2index) 327 | X_test[colname] = X_test[colname].apply(map_varlen, dict=key2index) 328 | 329 | print(f'{colname} - Num. of unique category: {unique_num}') 330 | 331 | return X_train, X_test, key2index, unique_num 332 | 333 | 334 | def varlen_label_encording_threshold(colname, X_train, X_test, low_freq_th): 335 | 336 | entities = [] 337 | for x in X_train[colname].values: 338 | entities.extend(x) 339 | for x in X_test[colname].values: 340 | entities.extend(x) 341 | entities = pd.Series(entities) 342 | 343 | count = entities.value_counts() 344 | low_freq_list = count[count <= low_freq_th].index.tolist() 345 | count = count[count > low_freq_th] 346 | 347 | key2index = {i: 1 for i in low_freq_list} # null: 0, low_freq: 1, ... 348 | idx = 2 349 | for i in count.index: 350 | key2index[i] = idx 351 | idx += 1 352 | unique_num = idx + 1 353 | 354 | X_train[colname] = X_train[colname].apply(map_varlen, dict=key2index) 355 | X_test[colname] = X_test[colname].apply(map_varlen, dict=key2index) 356 | 357 | print(f'{colname} - Num. of use as low frequency category: {len(low_freq_list)}') 358 | print(f'{colname} - Num. 
of unique category: {unique_num}') 359 | 360 | return X_train, X_test, key2index, unique_num 361 | -------------------------------------------------------------------------------- /exp/task2_lgbm/config.py: -------------------------------------------------------------------------------- 1 | FOLD_NAME = 'mskf_user' 2 | FOLD_NUM = 5 3 | RANDOM_STATE = 46 4 | 5 | TARGET_TASK = '2' 6 | 7 | LGB_MDOEL_PARAMS = { 8 | "boosting_type": "gbdt", 9 | "learning_rate": 0.1, 10 | "max_depth": 10, 11 | "num_leaves": 256, 12 | "colsample_bytree": 0.8, 13 | "min_child_weight": 0, 14 | "random_state": RANDOM_STATE, 15 | "verbose": -1, 16 | "n_jobs": 20, 17 | } 18 | LGB_TRAIN_PARAMS = { 19 | "num_boost_round": 2000, 20 | "early_stopping_rounds": 50, 21 | "verbose_eval": 100, 22 | } 23 | 24 | if TARGET_TASK == '1': 25 | LGB_MDOEL_PARAMS['metric'] = ['auc', 'binary_logloss'] 26 | LGB_MDOEL_PARAMS['objective'] = 'binary' 27 | if TARGET_TASK == '2': 28 | LGB_MDOEL_PARAMS['metric'] = ['multi_logloss', 'multi_error'] 29 | LGB_MDOEL_PARAMS['objective'] = 'multiclass' 30 | LGB_MDOEL_PARAMS['num_class'] = 4 31 | 32 | subject_id_list = [ 33 | 3, 34 | 32, 35 | 33, 36 | 34, 37 | 35, 38 | 36, 39 | 37, 40 | 38, 41 | 39, 42 | 40, 43 | 41, 44 | 42, 45 | 44, 46 | 45, 47 | 46, 48 | 47, 49 | 48, 50 | 49, 51 | 50, 52 | 51, 53 | 52, 54 | 53, 55 | 54, 56 | 55, 57 | 56, 58 | 57, 59 | 58, 60 | 59, 61 | 60, 62 | 61, 63 | 62, 64 | 63, 65 | 64, 66 | 65, 67 | 66, 68 | 67, 69 | 68, 70 | 69, 71 | 70, 72 | 71, 73 | 72, 74 | 73, 75 | 74, 76 | 75, 77 | 76, 78 | 77, 79 | 78, 80 | 79, 81 | 80, 82 | 81, 83 | 83, 84 | 84, 85 | 85, 86 | 86, 87 | 87, 88 | 88, 89 | 89, 90 | 90, 91 | 91, 92 | 92, 93 | 93, 94 | 94, 95 | 95, 96 | 96, 97 | 97, 98 | 98, 99 | 99, 100 | 100, 101 | 101, 102 | 102, 103 | 103, 104 | 104, 105 | 105, 106 | 106, 107 | 107, 108 | 108, 109 | 109, 110 | 110, 111 | 111, 112 | 112, 113 | 113, 114 | 114, 115 | 115, 116 | 116, 117 | 117, 118 | 118, 119 | 119, 120 | 120, 121 | 126, 122 | 128, 123 | 129, 124 | 130, 125 | 131, 126 | 137, 127 | 139, 128 | 140, 129 | 141, 130 | 142, 131 | 144, 132 | 146, 133 | 149, 134 | 151, 135 | 152, 136 | 153, 137 | 154, 138 | 156, 139 | 157, 140 | 158, 141 | 159, 142 | 160, 143 | 163, 144 | 164, 145 | 165, 146 | 166, 147 | 167, 148 | 168, 149 | 171, 150 | 172, 151 | 173, 152 | 174, 153 | 175, 154 | 176, 155 | 177, 156 | 178, 157 | 179, 158 | 180, 159 | 181, 160 | 182, 161 | 183, 162 | 184, 163 | 185, 164 | 186, 165 | 187, 166 | 188, 167 | 189, 168 | 190, 169 | 191, 170 | 192, 171 | 193, 172 | 195, 173 | 196, 174 | 197, 175 | 198, 176 | 199, 177 | 200, 178 | 202, 179 | 203, 180 | 204, 181 | 205, 182 | 206, 183 | 207, 184 | 208, 185 | 209, 186 | 210, 187 | 211, 188 | 212, 189 | 213, 190 | 214, 191 | 215, 192 | 216, 193 | 217, 194 | 218, 195 | 219, 196 | 220, 197 | 221, 198 | 222, 199 | 223, 200 | 224, 201 | 225, 202 | 226, 203 | 227, 204 | 228, 205 | 229, 206 | 230, 207 | 231, 208 | 232, 209 | 233, 210 | 234, 211 | 235, 212 | 236, 213 | 237, 214 | 238, 215 | 239, 216 | 240, 217 | 241, 218 | 242, 219 | 243, 220 | 244, 221 | 245, 222 | 246, 223 | 247, 224 | 248, 225 | 249, 226 | 250, 227 | 251, 228 | 252, 229 | 253, 230 | 254, 231 | 255, 232 | 256, 233 | 257, 234 | 258, 235 | 259, 236 | 260, 237 | 261, 238 | 262, 239 | 263, 240 | 264, 241 | 265, 242 | 266, 243 | 267, 244 | 268, 245 | 269, 246 | 270, 247 | 271, 248 | 272, 249 | 273, 250 | 274, 251 | 275, 252 | 276, 253 | 277, 254 | 278, 255 | 279, 256 | 280, 257 | 281, 258 | 282, 259 | 283, 260 | 284, 261 | 298, 262 | 313, 263 | 315, 264 | 317, 265 | 331, 
266 | 332, 267 | 334, 268 | 335, 269 | 336, 270 | 337, 271 | 338, 272 | 339, 273 | 340, 274 | 341, 275 | 342, 276 | 343, 277 | 344, 278 | 348, 279 | 349, 280 | 350, 281 | 351, 282 | 352, 283 | 353, 284 | 354, 285 | 355, 286 | 361, 287 | 365, 288 | 366, 289 | 367, 290 | 369, 291 | 370, 292 | 371, 293 | 372, 294 | 374, 295 | 375, 296 | 376, 297 | 377, 298 | 388, 299 | 406, 300 | 407, 301 | 408, 302 | 409, 303 | 410, 304 | 411, 305 | 412, 306 | 416, 307 | 417, 308 | 418, 309 | 430, 310 | 431, 311 | 432, 312 | 434, 313 | 435, 314 | 436, 315 | 437, 316 | 439, 317 | 441, 318 | 442, 319 | 446, 320 | 447, 321 | 448, 322 | 451, 323 | 453, 324 | 462, 325 | 474, 326 | 480, 327 | 487, 328 | 539, 329 | 540, 330 | 649, 331 | 654, 332 | 655, 333 | 656, 334 | 657, 335 | 692, 336 | 698, 337 | 700, 338 | 1059, 339 | 1076, 340 | 1077, 341 | 1078, 342 | 1079, 343 | 1080, 344 | 1081, 345 | 1082, 346 | 1156, 347 | 1157, 348 | 1158, 349 | 1159, 350 | 1160, 351 | 1161, 352 | 1162, 353 | 1163, 354 | 1164, 355 | 1165, 356 | 1166, 357 | 1167, 358 | 1168, 359 | 1169, 360 | 1170, 361 | 1171, 362 | 1172, 363 | 1173, 364 | 1174, 365 | 1175, 366 | 1176, 367 | 1177, 368 | 1178, 369 | 1179, 370 | 1180, 371 | 1181, 372 | 1182, 373 | 1183, 374 | 1184, 375 | 1185, 376 | 1186, 377 | 1187, 378 | 1188, 379 | 1189, 380 | 1200, 381 | 1201, 382 | 1202, 383 | 1203, 384 | 1204, 385 | 1207, 386 | 1208, 387 | 1209, 388 | 1210, 389 | 1211, 390 | 1212, 391 | 1213, 392 | 1214, 393 | 1215, 394 | 1216, 395 | 1217, 396 | 1218, 397 | 1219, 398 | 1263, 399 | 1264, 400 | 1265, 401 | 1266, 402 | 1636, 403 | 1642, 404 | 1647, 405 | 1648, 406 | 1649, 407 | 1650, 408 | 1651, 409 | 1675, 410 | 1676, 411 | 1750, 412 | 1975, 413 | 1976, 414 | 1977, 415 | 1980, 416 | 1982, 417 | 1983, 418 | 1985, 419 | 1987, 420 | 1988 421 | ] 422 | 423 | level_cnum_list = [ 424 | '0_8', 425 | '1_16', 426 | '3_0', 427 | '2_3', 428 | '2_4', 429 | '2_5', 430 | '2_10', 431 | '2_7', 432 | '1_14', 433 | '2_11', 434 | '2_6', 435 | '2_9', 436 | '2_8', 437 | '1_5', 438 | '1_12', 439 | '2_1', 440 | '2_13', 441 | '1_1', 442 | '1_4', 443 | '2_2', 444 | '2_14', 445 | '1_0', 446 | '2_0', 447 | '0_1' 448 | ] 449 | 450 | 451 | subject_meta_cols = [ 452 | 'num', 453 | 'max_level', 454 | 'sum_level', 455 | 'max_cnum', 456 | 'sum_cnum', 457 | ] 458 | 459 | subject_features = [f'subj_{f}' for f in subject_id_list] 460 | level_cnum_features = [f'subj_{f}' for f in level_cnum_list] 461 | subject_meta_features = [f'subj_{f}' for f in subject_meta_cols] 462 | 463 | user_lag_num_features = [ 464 | 'DateAnswered_dt_diff', 465 | 'DateAnswered_dt_diff_cumsum', 466 | 'DateAnswered_dt_diff_shift', 467 | 'DateAnswered_dt_diff_cumsum_shift', 468 | 'answer_num', 469 | 'answer_num_norm', 470 | 'quiz_answer_num', 471 | 'quiz_answer_num_norm', 472 | 'quiz_unique_num', 473 | 'subj_unique_num', 474 | 'group_unique_num', 475 | 'subjcat_unique_num', 476 | ] 477 | user_lag_cat_features = [ 478 | 'answer_num_div5', 479 | 'quiz_answer_num_div5', 480 | 'change_subjcat', 481 | 'answered_subjcat', 482 | 'prev_question', 483 | 'prev_subjcat', 484 | ] 485 | user_lag_multicat_features = [ 486 | 'prev10_question', 487 | 'prev10_subjcat', 488 | ] 489 | user_lag_features = user_lag_num_features + user_lag_cat_features + user_lag_multicat_features 490 | 491 | answer_date_features = [ 492 | 'DateAnswered_weekday', 493 | 'DateAnswered_hour', 494 | 'DateAnswered_day', 495 | 'DateAnswered_wom' 496 | ] 497 | count_encording_cat = [ 498 | 'QuestionId', 499 | 'UserId', 500 | 'Gender', 501 | 'PremiumPupil', 502 | 'Confidence', 
503 | 'GroupId', 504 | 'QuizId', 505 | 'SchemeOfWorkId', 506 | 'age_years', 507 | 'DateAnswered_weekday', 508 | 'DateAnswered_hour', 509 | 'DateAnswered_day', 510 | 'DateAnswered_wom', 511 | 'answer_num_div5', 512 | 'quiz_answer_num_div5', 513 | 'change_subjcat', 514 | 'answered_subjcat', 515 | 'prev_question', 516 | 'prev_subjcat', 517 | 'SubjectId_cat', 518 | # 'DateOfBirth_NaN', 519 | 'pri_to_high_stu', 520 | ['UserId', 'DateAnswered_weekday'], 521 | ['UserId', 'DateAnswered_hour'], 522 | ['UserId', 'DateAnswered_day'], 523 | ['UserId', 'DateAnswered_wom'], 524 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 525 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 526 | ['UserId', 'Confidence'], 527 | ['UserId', 'SchemeOfWorkId'], 528 | ['UserId', 'GroupId'], 529 | ['UserId', 'QuizId'], 530 | ['UserId', 'SubjectId_cat'], 531 | ['UserId', 'answer_num_div5'], 532 | ['UserId', 'quiz_answer_num_div5'], 533 | ['UserId', 'change_subjcat'], 534 | ['UserId', 'answered_subjcat'], 535 | ['QuestionId', 'Confidence'], 536 | ['QuestionId', 'SchemeOfWorkId'], 537 | ['QuestionId', 'age_years'], 538 | ['QuestionId', 'Gender'], 539 | ['QuestionId', 'answer_num_div5'], 540 | ['QuestionId', 'quiz_answer_num_div5'], 541 | ['QuestionId', 'change_subjcat'], 542 | ['QuestionId', 'answered_subjcat'], 543 | ['SubjectId_cat', 'Confidence'], 544 | ['SubjectId_cat', 'SchemeOfWorkId'], 545 | ['SubjectId_cat', 'age_years'], 546 | ['SubjectId_cat', 'Gender'], 547 | ['SubjectId_cat', 'answer_num_div5'], 548 | ['SubjectId_cat', 'quiz_answer_num_div5'], 549 | ['SubjectId_cat', 'change_subjcat'], 550 | ['SubjectId_cat', 'answered_subjcat'], 551 | ['QuestionId', 'GroupId'], 552 | ['QuestionId', 'QuizId'], 553 | ] 554 | count_encording_features = [] 555 | for col in count_encording_cat: 556 | if not isinstance(col, list): 557 | col = [col] 558 | name = "_".join(col) 559 | count_encording_features.append(f'{name}_ce') 560 | 561 | # te_smooth_factor = 5 562 | te_smooth_factor = 2 563 | target_encording_cat = [ 564 | 'QuestionId', 565 | 'UserId', 566 | 'Gender', 567 | 'PremiumPupil', 568 | 'Confidence', 569 | 'GroupId', 570 | 'QuizId', 571 | 'SchemeOfWorkId', 572 | 'age_years', 573 | 'DateAnswered_weekday', 574 | 'DateAnswered_hour', 575 | 'DateAnswered_day', 576 | 'DateAnswered_wom', 577 | 'answer_num_div5', 578 | 'quiz_answer_num_div5', 579 | 'change_subjcat', 580 | 'answered_subjcat', 581 | 'prev_question', 582 | 'prev_subjcat', 583 | 'SubjectId_cat', 584 | 'DateOfBirth_NaN', 585 | 'pri_to_high_stu', 586 | ['DateAnswered_day', 'DateAnswered_hour'], 587 | ['DateAnswered_weekday', 'DateAnswered_hour'], 588 | ['DateAnswered_weekday', 'DateAnswered_wom'], 589 | ['UserId', 'DateAnswered_weekday'], 590 | ['UserId', 'DateAnswered_hour'], 591 | ['UserId', 'DateAnswered_day'], 592 | ['UserId', 'DateAnswered_wom'], 593 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 594 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 595 | ['UserId', 'Confidence'], 596 | ['UserId', 'SchemeOfWorkId'], 597 | ['UserId', 'GroupId'], 598 | ['UserId', 'QuizId'], 599 | ['UserId', 'SubjectId_cat'], 600 | ['UserId', 'answer_num_div5'], 601 | ['UserId', 'quiz_answer_num_div5'], 602 | ['UserId', 'change_subjcat'], 603 | ['UserId', 'answered_subjcat'], 604 | ['QuestionId', 'Confidence'], 605 | ['QuestionId', 'SchemeOfWorkId'], 606 | ['QuestionId', 'age_years'], 607 | ['QuestionId', 'Gender'], 608 | ['QuestionId', 'answer_num_div5'], 609 | ['QuestionId', 'quiz_answer_num_div5'], 610 | ['QuestionId', 'change_subjcat'], 611 | 
['QuestionId', 'answered_subjcat'], 612 | ['SubjectId_cat', 'Confidence'], 613 | ['SubjectId_cat', 'SchemeOfWorkId'], 614 | ['SubjectId_cat', 'age_years'], 615 | ['SubjectId_cat', 'Gender'], 616 | ['SubjectId_cat', 'answer_num_div5'], 617 | ['SubjectId_cat', 'quiz_answer_num_div5'], 618 | ['SubjectId_cat', 'change_subjcat'], 619 | ['SubjectId_cat', 'answered_subjcat'], 620 | ] 621 | target_encording_features = [] 622 | for tar in ['IsCorrect']: 623 | for col in target_encording_cat: 624 | if not isinstance(col, list): 625 | col = [col] 626 | name = "_".join(col) 627 | target_encording_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 628 | 629 | target_encording_ansval_features = [] 630 | for tar in ['AnswerValue_1', 'AnswerValue_2', 'AnswerValue_3', 'AnswerValue_4']: 631 | for col in target_encording_cat: 632 | if not isinstance(col, list): 633 | col = [col] 634 | name = "_".join(col) 635 | target_encording_ansval_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 636 | 637 | subj_conbi_cols = [ 638 | 'UserId', 639 | 'age_years', 640 | 'answered_subjcat', 641 | 'SchemeOfWorkId', 642 | 'Confidence', 643 | ] 644 | target_encording_subj_conbi_cat = [] 645 | for col in subject_features: 646 | for col2 in subj_conbi_cols: 647 | target_encording_subj_conbi_cat.append([col, col2]) 648 | 649 | target_encording_subj_conbi_features = [] 650 | for tar in ['IsCorrect']: 651 | for col in target_encording_subj_conbi_cat: 652 | if not isinstance(col, list): 653 | col = [col] 654 | name = "_".join(col) 655 | target_encording_subj_conbi_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 656 | 657 | target_encording_subj_agg_features = [] 658 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 659 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_agg_{agg_func}_IsCorrect') 660 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 661 | for conbi_col in subj_conbi_cols: 662 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_{conbi_col}_agg_{agg_func}_IsCorrect') 663 | 664 | svd_n_components = 5 665 | svd_features = [] 666 | svd_features += [f'ques_subj_svd_{i}' for i in range(svd_n_components)] 667 | svd_features += [f'user_subj_svd_{i}' for i in range(svd_n_components)] 668 | 669 | ################################################################ 670 | dense_features = [ 671 | 'age_days' 672 | ] 673 | dense_features += count_encording_features 674 | dense_features += target_encording_features 675 | dense_features += subject_meta_features 676 | dense_features += target_encording_ansval_features 677 | dense_features += user_lag_num_features 678 | dense_features += target_encording_subj_agg_features 679 | dense_features += svd_features 680 | 681 | sparse_features = [ 682 | # 'QuestionId', 683 | # 'UserId', 684 | 'Gender', 685 | 'PremiumPupil', 686 | 'Confidence', 687 | # 'GroupId', 688 | # 'QuizId', 689 | 'SchemeOfWorkId', 690 | 'age_years', 691 | 'SubjectId_cat', 692 | 'DateOfBirth_NaN', 693 | 'pri_to_high_stu', 694 | ] 695 | sparse_features += answer_date_features 696 | sparse_features += user_lag_cat_features 697 | 698 | varlen_sparse_features = [ 699 | # 'SubjectId', 700 | # 'SubjectId_level' 701 | ] 702 | # varlen_sparse_features = varlen_sparse_features + user_lag_multicat_features 703 | ################################################################ -------------------------------------------------------------------------------- /src/stacking.py: -------------------------------------------------------------------------------- 1 | 
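# stacking.py: second-level ensemble. It reloads each base model's out-of-fold and test predictions, rebuilds the same meta-features as feature_extraction_preds_meta.py, fits a Ridge regressor (Task 1) and a RidgeClassifier (Task 2) on them with the shared user-stratified folds, and writes the submission files.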
import pandas as pd 2 | import numpy as np 3 | import ast 4 | from tqdm import tqdm 5 | 6 | import os 7 | import zipfile 8 | import shutil 9 | 10 | import datetime 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | sns.set() 14 | 15 | from sklearn import metrics 16 | from sklearn.linear_model import Ridge, RidgeClassifier 17 | from scipy.special import softmax 18 | from sklearn.model_selection import KFold 19 | 20 | import cloudpickle 21 | 22 | INPUT_DIR = '../data' 23 | SUB_DIR = '../submission' 24 | FOLD_DIR = '../folds' 25 | FOLD_NAME = 'mskf_user' 26 | FOLD_NUM = 5 27 | RANDOM_STATE = 46 28 | 29 | exp_list_t1 = [ 30 | ('task1_lgbm', 'bbb432be523149369a5f95cdfd63578c'), # lgb, all feature, te_smooth5 31 | ('task1_xgb_2', '69f2b85366b34198a1ea2dc512e05f3a'), # xgb, all feature, te_smooth5 32 | ('task1_cat', 'b286e8fb8f824027869bf2ec6247b8ec'), # cat, all feature, te_smooth5 33 | ('task1_lgbm_fs100', 'bf0bfd4d47664b349c02dd92eaa50fc4'), # lgb, fs100, te_smooth5 34 | ('task1_xgb_fs100', '2cf16a7701da409daf40cbfe78b3b7bf'), # xgb, fs100, te_smooth5 35 | ('task1_mlp_fs100', '92598df31e9f4311adccbc8e267e7c01'), # mlp, fs100, te_smooth5 36 | ('task12_multitask_mlp_fs100', '38a138a4ad5e49b999ec0eca380be5dc'), # mlp multi, fs100, te_smooth5 37 | ('task1_cat_fs100', '468d28a067bc479caa65e12153965df4'), # cat depth 8, fs100, te_smooth5 38 | ('task1_cat_fs100', 'e4950f4d436a4d58a89c5bd90e254428'), # cat depth 10, fs100, te_smooth5 39 | ('task1_lgbm', 'c4def7cc4f3f4f44971ac474d2993e92'), # lgb, all feature, te_smooth2 40 | #--- 41 | ('task1_xgb_fs50_meta', '0da188f3e0014876919457befc0b67e9'), 42 | ('task1_xgb_fs100_meta_2', '915471c8af044216ae069ed8568f7f2a'), 43 | ('task1_cat_fs50_meta', 'bfeb141ae68044ebafadd0a4e286e6e3'), 44 | ('task12_multitask_mlp_fs50_meta', '629ddeed745148298c6148d30d7a238f'), 45 | ] 46 | 47 | exp_list_t2 = [ 48 | ('task2_lgbm', '90443d723c874103a4b1c68f2830446e'), # lgb, all feature, te_smooth5 49 | ('task2_xgb_2', '19a82f5124d349ee9550b871a88025d2'), # xgb, all feature, te_smooth5 50 | ('task2_lgbm_fs100', '751bb6f02e5746798740f2b90f183a92'), # lgb, fs100, te_smooth5 51 | ('task2_xgb_fs100', '89909e8bf54443328830085dca5f26cc'), # xgb, fs100, te_smooth5 52 | ('task2_mlp_fs100', '2f1d7ef010874723a8dd70aa49891f5d'), # mlp, fs100, te_smooth5 53 | ('task12_multitask_mlp_fs100', '38a138a4ad5e49b999ec0eca380be5dc'), # mlp multi, fs100, te_smooth5 54 | ('task2_cat_fs100', '5f4c75b620ef4eccbb14581589340db6'), # cat depth 8, fs100, te_smooth5 55 | # --- 56 | ('task2_xgb_fs50_meta', 'ad5e1413cdfa45198ffd3a8d20f17886'), 57 | ('task2_xgb_fs100_meta', 'a704720e1cfa42668da970f50c65af2e'), 58 | ('task12_multitask_mlp_fs50_meta', '629ddeed745148298c6148d30d7a238f'), 59 | ] 60 | 61 | meta_pred_features = [] 62 | for ename, run_id in exp_list_t1: 63 | meta_pred_features.append(f't1_{ename}_{run_id}') 64 | for ename, run_id in exp_list_t2: 65 | for idx in range(4): 66 | meta_pred_features.append(f't2_{ename}_{run_id}_{idx}') 67 | 68 | meta_agg_t1 = [ 69 | 'preds_t1_mean_0', 70 | 'preds_t1_std_0', 71 | 'preds_t1_max_0', 72 | 'preds_t1_min_0', 73 | 'preds_t1_diff_0', 74 | ] 75 | 76 | meta_agg_t2 = [ 77 | 'preds_t2_mean_0', 78 | 'preds_t2_mean_1', 79 | 'preds_t2_mean_2', 80 | 'preds_t2_mean_3', 81 | 'preds_t2_mean_max_0', 82 | 'preds_t2_mean_min_0', 83 | 'preds_t2_std_0', 84 | 'preds_t2_std_1', 85 | 'preds_t2_std_2', 86 | 'preds_t2_std_3', 87 | 'preds_t2_std_sum_0', 88 | 'preds_t2_max_0', 89 | 'preds_t2_max_1', 90 | 'preds_t2_max_2', 91 | 'preds_t2_max_3', 92 | 'preds_t2_min_0', 
93 | 'preds_t2_min_1', 94 | 'preds_t2_min_2', 95 | 'preds_t2_min_3', 96 | ] 97 | 98 | meta_mul = [] 99 | for i in range(4): 100 | meta_mul.append(f'preds_t1_mean_mul_t2_mean_{i}') 101 | meta_mul.append(f'preds_t1_std_mul_t2_std_{i}') 102 | meta_mul.append(f'preds_t1_std_mul_t2_mean_{i}') 103 | 104 | dense_features = [] 105 | dense_features += meta_pred_features 106 | dense_features += meta_agg_t1 107 | dense_features += meta_agg_t2 108 | dense_features += meta_mul 109 | 110 | def load_from_pkl(load_path): 111 | with open(load_path, 'rb') as frb: 112 | obj = cloudpickle.loads(frb.read()) 113 | return obj 114 | 115 | 116 | def save_as_pkl(obj, save_path): 117 | with open(save_path, 'wb') as fwb: 118 | fwb.write(cloudpickle.dumps(obj)) 119 | return 120 | 121 | def add_preds(results, TARGET_TASK, EXP_NAME, run_id): 122 | oof = pd.read_csv(f'../save/{EXP_NAME}/preds_val_task{TARGET_TASK}_{run_id}.csv') 123 | test_preds = pd.read_csv(f'../save/{EXP_NAME}/preds_test_task{TARGET_TASK}_{run_id}.csv') 124 | results[f't{TARGET_TASK}_{EXP_NAME}_{run_id}'] = {} 125 | results[f't{TARGET_TASK}_{EXP_NAME}_{run_id}']['oof'] = oof 126 | results[f't{TARGET_TASK}_{EXP_NAME}_{run_id}']['test_preds'] = test_preds 127 | return results 128 | 129 | def make_submission_t1(preds_test, auc, best_acc, best_th, EXP_NAME): 130 | preds_test_th = [] 131 | for i in preds_test: 132 | if i > best_th: 133 | preds_test_th.append(1) 134 | else: 135 | preds_test_th.append(0) 136 | 137 | test_data_path = f'../submission_templates/submission_task_1_2.csv' 138 | sub = pd.read_csv(test_data_path) 139 | sub['IsCorrect'] = preds_test_th 140 | 141 | if not os.path.exists(f'../submission/{EXP_NAME}'): 142 | os.mkdir(f'../submission/{EXP_NAME}') 143 | 144 | sub_name = f'submission_task1__auc{auc}__acc{best_acc}__th{best_th}' 145 | valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}' 146 | if not os.path.exists(valid_sub_dir): 147 | os.mkdir(valid_sub_dir) 148 | 149 | sub.to_csv(f'{valid_sub_dir}/submission_task_1.csv', index=False) 150 | with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip: 151 | new_zip.write(f'{valid_sub_dir}/submission_task_1.csv', arcname='submission_task_1.csv') 152 | shutil.rmtree(valid_sub_dir) 153 | 154 | submission_path = f'../submission/{EXP_NAME}/submission_task1__auc{auc}__acc{best_acc}__th{best_th}.csv' 155 | sub.to_csv(submission_path, index=False) 156 | return 157 | 158 | def make_submission_t2(preds_test, multi_acc, EXP_NAME): 159 | preds_test_label = np.argmax(preds_test, axis=1) + 1 160 | 161 | test_data_path = f'../submission_templates/submission_task_1_2.csv' 162 | sub = pd.read_csv(test_data_path) 163 | sub['AnswerValue'] = preds_test_label 164 | 165 | if not os.path.exists(f'{SUB_DIR}/{EXP_NAME}'): 166 | os.mkdir(f'{SUB_DIR}/{EXP_NAME}') 167 | 168 | sub_name = f'submission_task2__acc{multi_acc}' 169 | valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}' 170 | if not os.path.exists(valid_sub_dir): 171 | os.mkdir(valid_sub_dir) 172 | 173 | sub.to_csv(f'{valid_sub_dir}/submission_task_2.csv', index=False) 174 | with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip: 175 | new_zip.write(f'{valid_sub_dir}/submission_task_2.csv', arcname='submission_task_2.csv') 176 | shutil.rmtree(valid_sub_dir) 177 | return 178 | 179 | def predict_cv_t1(model, train_x, train_y, test_x, folds): 180 | 181 | preds = [] 182 | preds_test = [] 183 | va_idxes = [] 184 | 185 | for fold_idx in tqdm(range(FOLD_NUM)): 186 | 187 | 
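# rows whose precomputed kfold label differs from fold_idx form the training split; the held-out fold is predicted to assemble the out-of-fold vector in original row order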
tr_idx = folds[folds.kfold != fold_idx].index.tolist() 188 | va_idx = folds[folds.kfold == fold_idx].index.tolist() 189 | 190 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 191 | tr_y, va_y = train_y[tr_idx], train_y[va_idx] 192 | 193 | model.fit(tr_x, tr_y) 194 | 195 | pred = model.predict(va_x) 196 | preds.append(pred) 197 | pred_test = model.predict(test_x) 198 | preds_test.append(pred_test) 199 | 200 | va_idxes.append(va_idx) 201 | 202 | va_idxes = np.concatenate(va_idxes) 203 | preds = np.concatenate(preds, axis=0) 204 | order = np.argsort(va_idxes) 205 | pred_train = preds[order] 206 | 207 | preds_test = np.mean(preds_test, axis=0) 208 | 209 | return pred_train, preds_test 210 | 211 | def predict_cv_t2(model, train_x, train_y, test_x, folds): 212 | 213 | preds = [] 214 | preds_test = [] 215 | va_idxes = [] 216 | 217 | for fold_idx in tqdm(range(FOLD_NUM)): 218 | 219 | tr_idx = folds[folds.kfold != fold_idx].index.tolist() 220 | va_idx = folds[folds.kfold == fold_idx].index.tolist() 221 | 222 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 223 | tr_y, va_y = train_y[tr_idx], train_y[va_idx] 224 | 225 | model.fit(tr_x, tr_y) 226 | 227 | pred = model.decision_function(va_x) 228 | pred = softmax(pred, axis=1) 229 | preds.append(pred) 230 | pred_test = model.decision_function(test_x) 231 | pred_test = softmax(pred_test, axis=1) 232 | preds_test.append(pred_test) 233 | 234 | va_idxes.append(va_idx) 235 | 236 | va_idxes = np.concatenate(va_idxes) 237 | preds = np.concatenate(preds, axis=0) 238 | order = np.argsort(va_idxes) 239 | pred_train = preds[order] 240 | preds_test = np.mean(preds_test, axis=0) 241 | 242 | return pred_train, preds_test 243 | 244 | if __name__ == "__main__": 245 | 246 | folds = pd.read_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv') 247 | 248 | data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv' 249 | answer_path = f'{INPUT_DIR}/metadata/answer_metadata_task_1_2.csv' 250 | test_data_path = f'../submission_templates/submission_task_1_2.csv' 251 | 252 | train = pd.read_csv(data_path) 253 | test = pd.read_csv(test_data_path) 254 | answer = pd.read_csv(answer_path) 255 | answer['AnswerId'] = answer['AnswerId'].fillna(-1).astype('int32') 256 | answer['Confidence'] = answer['Confidence'].fillna(-1).astype('int8') 257 | answer['SchemeOfWorkId'] = answer['SchemeOfWorkId'].fillna(-1).astype('int16') 258 | 259 | y_t1 = train['IsCorrect'].values 260 | y_t2 = train['AnswerValue'].values 261 | 262 | df = pd.merge(train, answer, on='AnswerId', how='left') 263 | df_test = pd.merge(test, answer, on='AnswerId', how='left') 264 | 265 | results_t1 = {} 266 | for ename, run_id in exp_list_t1: 267 | results_t1 = add_preds(results_t1, TARGET_TASK='1', EXP_NAME=ename, run_id=run_id) 268 | 269 | results_t2 = {} 270 | for ename, run_id in exp_list_t2: 271 | results_t2 = add_preds(results_t2, TARGET_TASK='2', EXP_NAME=ename, run_id=run_id) 272 | 273 | rdf_oof = pd.DataFrame() 274 | rdf_test = pd.DataFrame() 275 | for ename, run_id in exp_list_t1: 276 | rdf_oof = pd.concat([rdf_oof, results_t1[f't1_{ename}_{run_id}']['oof']], axis=1) 277 | rdf_test = pd.concat([rdf_test, results_t1[f't1_{ename}_{run_id}']['test_preds']], axis=1) 278 | 279 | cols = [ename for ename, _ in exp_list_t1] 280 | rdf_oof.columns = cols 281 | rdf_test.columns = cols 282 | 283 | for ename, run_id in exp_list_t1: 284 | df[f't1_{ename}_{run_id}'] = results_t1[f't1_{ename}_{run_id}']['oof'] 285 | df_test[f't1_{ename}_{run_id}'] = 
results_t1[f't1_{ename}_{run_id}']['test_preds'] 286 | 287 | meta_oofs_t1 = pd.DataFrame() 288 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.mean(axis=1)).add_prefix('preds_t1_mean_')], axis=1) 289 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.std(axis=1)).add_prefix('preds_t1_std_')], axis=1) 290 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.max(axis=1)).add_prefix('preds_t1_max_')], axis=1) 291 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.min(axis=1)).add_prefix('preds_t1_min_')], axis=1) 292 | meta_oofs_t1 = pd.concat([meta_oofs_t1, pd.DataFrame(rdf_oof.max(axis=1) - rdf_oof.min(axis=1)).add_prefix('preds_t1_diff_')], axis=1) 293 | 294 | meta_tests_t1 = pd.DataFrame() 295 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.mean(axis=1)).add_prefix('preds_t1_mean_')], axis=1) 296 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.std(axis=1)).add_prefix('preds_t1_std_')], axis=1) 297 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.max(axis=1)).add_prefix('preds_t1_max_')], axis=1) 298 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.min(axis=1)).add_prefix('preds_t1_min_')], axis=1) 299 | meta_tests_t1 = pd.concat([meta_tests_t1, pd.DataFrame(rdf_test.max(axis=1) - rdf_test.min(axis=1)).add_prefix('preds_t1_diff_')], axis=1) 300 | 301 | df = pd.concat([df, meta_oofs_t1], axis=1) 302 | df_test = pd.concat([df_test, meta_tests_t1], axis=1) 303 | 304 | oofs = np.zeros((len(rdf_oof), 4, len(exp_list_t2))) 305 | tests = np.zeros((len(rdf_test), 4, len(exp_list_t2))) 306 | for eidx, (ename, run_id) in enumerate(exp_list_t2): 307 | oofs[:, :, eidx] = results_t2[f't2_{ename}_{run_id}']['oof'].values 308 | tests[:, :, eidx] = results_t2[f't2_{ename}_{run_id}']['test_preds'].values 309 | 310 | for eidx, (ename, run_id) in enumerate(exp_list_t2): 311 | df = pd.concat([df, pd.DataFrame(oofs[:, :, eidx]).add_prefix(f't2_{ename}_{run_id}_')], axis=1) 312 | df_test = pd.concat([df_test,pd.DataFrame(tests[:, :, eidx]).add_prefix(f't2_{ename}_{run_id}_')], axis=1) 313 | 314 | meta_oofs_t2 = pd.DataFrame() 315 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.mean(axis=2)).add_prefix('preds_t2_mean_')], axis=1) 316 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.mean(axis=2).max(axis=1)).add_prefix('preds_t2_mean_max_')], axis=1) 317 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.mean(axis=2).min(axis=1)).add_prefix('preds_t2_mean_min_')], axis=1) 318 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.std(axis=2)).add_prefix('preds_t2_std_')], axis=1) 319 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.std(axis=2).sum(axis=1)).add_prefix('preds_t2_std_sum_')], axis=1) 320 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.max(axis=2)).add_prefix('preds_t2_max_')], axis=1) 321 | meta_oofs_t2 = pd.concat([meta_oofs_t2, pd.DataFrame(oofs.min(axis=2)).add_prefix('preds_t2_min_')], axis=1) 322 | 323 | meta_tests_t2 = pd.DataFrame() 324 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.mean(axis=2)).add_prefix('preds_t2_mean_')], axis=1) 325 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.mean(axis=2).max(axis=1)).add_prefix('preds_t2_mean_max_')], axis=1) 326 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.mean(axis=2).min(axis=1)).add_prefix('preds_t2_mean_min_')], axis=1) 327 | meta_tests_t2 = pd.concat([meta_tests_t2, 
pd.DataFrame(tests.std(axis=2)).add_prefix('preds_t2_std_')], axis=1) 328 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.std(axis=2).sum(axis=1)).add_prefix('preds_t2_std_sum_')], axis=1) 329 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.max(axis=2)).add_prefix('preds_t2_max_')], axis=1) 330 | meta_tests_t2 = pd.concat([meta_tests_t2, pd.DataFrame(tests.min(axis=2)).add_prefix('preds_t2_min_')], axis=1) 331 | 332 | df = pd.concat([df, meta_oofs_t2], axis=1) 333 | df_test = pd.concat([df_test, meta_tests_t2], axis=1) 334 | 335 | for i in range(4): 336 | df[f'preds_t1_mean_mul_t2_mean_{i}'] = df['preds_t1_mean_0'].mul(df[f'preds_t2_mean_{i}']) 337 | df[f'preds_t1_std_mul_t2_std_{i}'] = df['preds_t1_std_0'].mul(df[f'preds_t2_std_{i}']) 338 | df[f'preds_t1_std_mul_t2_mean_{i}'] = df['preds_t1_std_0'].mul(df[f'preds_t2_mean_{i}']) 339 | df_test[f'preds_t1_mean_mul_t2_mean_{i}'] = df_test['preds_t1_mean_0'].mul(df_test[f'preds_t2_mean_{i}']) 340 | df_test[f'preds_t1_std_mul_t2_std_{i}'] = df_test['preds_t1_std_0'].mul(df_test[f'preds_t2_std_{i}']) 341 | df_test[f'preds_t1_std_mul_t2_mean_{i}'] = df_test['preds_t1_std_0'].mul(df_test[f'preds_t2_mean_{i}']) 342 | 343 | ridge_alpha = 1.0 344 | smodel = Ridge(alpha=ridge_alpha) 345 | pred_oof, preds_test = predict_cv_t1( 346 | smodel, 347 | df[dense_features], 348 | y_t1, 349 | df_test[dense_features], 350 | folds, 351 | ) 352 | auc = metrics.roc_auc_score(y_t1, pred_oof) 353 | 354 | rows = [] 355 | for th in tqdm(range(490, 510, 1)): 356 | th = th * 0.001 357 | preds_th = [] 358 | for i in pred_oof: 359 | if i > th: 360 | preds_th.append(1) 361 | else: 362 | preds_th.append(0) 363 | acc = metrics.accuracy_score(y_t1, preds_th) 364 | rows.append([th, acc]) 365 | acc_th = pd.DataFrame(rows, columns=['th', 'acc']) 366 | scores = acc_th.sort_values('acc', ascending=False).head(1).values[0] 367 | best_th, best_acc = scores[0], scores[1] 368 | 369 | exp_name = '__'.join([ename for ename, _ in exp_list_t1]) 370 | exp_name = f'stacking_{exp_name}' # use the joined base-model names, not the last loop variable 371 | make_submission_t1( 372 | preds_test, auc=auc, best_acc=best_acc, best_th=best_th, 373 | EXP_NAME=exp_name) 374 | 375 | ridge_alpha = 1.0 376 | smodel = RidgeClassifier(alpha=ridge_alpha) 377 | pred_oof, preds_test = predict_cv_t2( 378 | smodel, 379 | df[dense_features], 380 | y_t2, 381 | df_test[dense_features], 382 | folds, 383 | ) 384 | 385 | pred_oof_label = np.argmax(pred_oof, axis=1) + 1 386 | multi_acc = metrics.accuracy_score(y_t2, pred_oof_label) 387 | 388 | exp_name = '__'.join([ename for ename, _ in exp_list_t2]) 389 | exp_name = f'stacking_{exp_name}' # use the joined base-model names, not the last loop variable 390 | make_submission_t2( 391 | preds_test, multi_acc=multi_acc, 392 | EXP_NAME=exp_name) 393 | -------------------------------------------------------------------------------- /exp/task1_cat/config.py: -------------------------------------------------------------------------------- 1 | FOLD_NAME = 'mskf_user' 2 | FOLD_NUM = 5 3 | RANDOM_STATE = 46 4 | 5 | TARGET_TASK = '1' 6 | 7 | CAT_PARAMS = { 8 | 'depth': 9, 9 | 'learning_rate': 0.2, 10 | 'bagging_temperature': 0.2, 11 | 'od_type': 'Iter', 12 | 'metric_period': 50, 13 | 'iterations': 3000, 14 | 'od_wait': 20, 15 | 'random_seed': RANDOM_STATE, 16 | } 17 | 18 | if TARGET_TASK == '1': 19 | CAT_PARAMS['loss_function'] = 'Logloss' 20 | CAT_PARAMS['eval_metric'] = 'AUC' 21 | if TARGET_TASK == '2': 22 | CAT_PARAMS['loss_function'] = 'MultiClass' 23 | CAT_PARAMS['eval_metric'] = 'Accuracy' 24 | 25 | subject_id_list = [ 26 | 3, 27 | 32, 28 | 33, 29 | 34, 30 | 35, 31 | 36, 32 | 
37, 33 | 38, 34 | 39, 35 | 40, 36 | 41, 37 | 42, 38 | 44, 39 | 45, 40 | 46, 41 | 47, 42 | 48, 43 | 49, 44 | 50, 45 | 51, 46 | 52, 47 | 53, 48 | 54, 49 | 55, 50 | 56, 51 | 57, 52 | 58, 53 | 59, 54 | 60, 55 | 61, 56 | 62, 57 | 63, 58 | 64, 59 | 65, 60 | 66, 61 | 67, 62 | 68, 63 | 69, 64 | 70, 65 | 71, 66 | 72, 67 | 73, 68 | 74, 69 | 75, 70 | 76, 71 | 77, 72 | 78, 73 | 79, 74 | 80, 75 | 81, 76 | 83, 77 | 84, 78 | 85, 79 | 86, 80 | 87, 81 | 88, 82 | 89, 83 | 90, 84 | 91, 85 | 92, 86 | 93, 87 | 94, 88 | 95, 89 | 96, 90 | 97, 91 | 98, 92 | 99, 93 | 100, 94 | 101, 95 | 102, 96 | 103, 97 | 104, 98 | 105, 99 | 106, 100 | 107, 101 | 108, 102 | 109, 103 | 110, 104 | 111, 105 | 112, 106 | 113, 107 | 114, 108 | 115, 109 | 116, 110 | 117, 111 | 118, 112 | 119, 113 | 120, 114 | 126, 115 | 128, 116 | 129, 117 | 130, 118 | 131, 119 | 137, 120 | 139, 121 | 140, 122 | 141, 123 | 142, 124 | 144, 125 | 146, 126 | 149, 127 | 151, 128 | 152, 129 | 153, 130 | 154, 131 | 156, 132 | 157, 133 | 158, 134 | 159, 135 | 160, 136 | 163, 137 | 164, 138 | 165, 139 | 166, 140 | 167, 141 | 168, 142 | 171, 143 | 172, 144 | 173, 145 | 174, 146 | 175, 147 | 176, 148 | 177, 149 | 178, 150 | 179, 151 | 180, 152 | 181, 153 | 182, 154 | 183, 155 | 184, 156 | 185, 157 | 186, 158 | 187, 159 | 188, 160 | 189, 161 | 190, 162 | 191, 163 | 192, 164 | 193, 165 | 195, 166 | 196, 167 | 197, 168 | 198, 169 | 199, 170 | 200, 171 | 202, 172 | 203, 173 | 204, 174 | 205, 175 | 206, 176 | 207, 177 | 208, 178 | 209, 179 | 210, 180 | 211, 181 | 212, 182 | 213, 183 | 214, 184 | 215, 185 | 216, 186 | 217, 187 | 218, 188 | 219, 189 | 220, 190 | 221, 191 | 222, 192 | 223, 193 | 224, 194 | 225, 195 | 226, 196 | 227, 197 | 228, 198 | 229, 199 | 230, 200 | 231, 201 | 232, 202 | 233, 203 | 234, 204 | 235, 205 | 236, 206 | 237, 207 | 238, 208 | 239, 209 | 240, 210 | 241, 211 | 242, 212 | 243, 213 | 244, 214 | 245, 215 | 246, 216 | 247, 217 | 248, 218 | 249, 219 | 250, 220 | 251, 221 | 252, 222 | 253, 223 | 254, 224 | 255, 225 | 256, 226 | 257, 227 | 258, 228 | 259, 229 | 260, 230 | 261, 231 | 262, 232 | 263, 233 | 264, 234 | 265, 235 | 266, 236 | 267, 237 | 268, 238 | 269, 239 | 270, 240 | 271, 241 | 272, 242 | 273, 243 | 274, 244 | 275, 245 | 276, 246 | 277, 247 | 278, 248 | 279, 249 | 280, 250 | 281, 251 | 282, 252 | 283, 253 | 284, 254 | 298, 255 | 313, 256 | 315, 257 | 317, 258 | 331, 259 | 332, 260 | 334, 261 | 335, 262 | 336, 263 | 337, 264 | 338, 265 | 339, 266 | 340, 267 | 341, 268 | 342, 269 | 343, 270 | 344, 271 | 348, 272 | 349, 273 | 350, 274 | 351, 275 | 352, 276 | 353, 277 | 354, 278 | 355, 279 | 361, 280 | 365, 281 | 366, 282 | 367, 283 | 369, 284 | 370, 285 | 371, 286 | 372, 287 | 374, 288 | 375, 289 | 376, 290 | 377, 291 | 388, 292 | 406, 293 | 407, 294 | 408, 295 | 409, 296 | 410, 297 | 411, 298 | 412, 299 | 416, 300 | 417, 301 | 418, 302 | 430, 303 | 431, 304 | 432, 305 | 434, 306 | 435, 307 | 436, 308 | 437, 309 | 439, 310 | 441, 311 | 442, 312 | 446, 313 | 447, 314 | 448, 315 | 451, 316 | 453, 317 | 462, 318 | 474, 319 | 480, 320 | 487, 321 | 539, 322 | 540, 323 | 649, 324 | 654, 325 | 655, 326 | 656, 327 | 657, 328 | 692, 329 | 698, 330 | 700, 331 | 1059, 332 | 1076, 333 | 1077, 334 | 1078, 335 | 1079, 336 | 1080, 337 | 1081, 338 | 1082, 339 | 1156, 340 | 1157, 341 | 1158, 342 | 1159, 343 | 1160, 344 | 1161, 345 | 1162, 346 | 1163, 347 | 1164, 348 | 1165, 349 | 1166, 350 | 1167, 351 | 1168, 352 | 1169, 353 | 1170, 354 | 1171, 355 | 1172, 356 | 1173, 357 | 1174, 358 | 1175, 359 | 1176, 360 | 1177, 361 | 1178, 362 | 1179, 363 | 1180, 364 
| 1181, 365 | 1182, 366 | 1183, 367 | 1184, 368 | 1185, 369 | 1186, 370 | 1187, 371 | 1188, 372 | 1189, 373 | 1200, 374 | 1201, 375 | 1202, 376 | 1203, 377 | 1204, 378 | 1207, 379 | 1208, 380 | 1209, 381 | 1210, 382 | 1211, 383 | 1212, 384 | 1213, 385 | 1214, 386 | 1215, 387 | 1216, 388 | 1217, 389 | 1218, 390 | 1219, 391 | 1263, 392 | 1264, 393 | 1265, 394 | 1266, 395 | 1636, 396 | 1642, 397 | 1647, 398 | 1648, 399 | 1649, 400 | 1650, 401 | 1651, 402 | 1675, 403 | 1676, 404 | 1750, 405 | 1975, 406 | 1976, 407 | 1977, 408 | 1980, 409 | 1982, 410 | 1983, 411 | 1985, 412 | 1987, 413 | 1988 414 | ] 415 | 416 | level_cnum_list = [ 417 | '0_8', 418 | '1_16', 419 | '3_0', 420 | '2_3', 421 | '2_4', 422 | '2_5', 423 | '2_10', 424 | '2_7', 425 | '1_14', 426 | '2_11', 427 | '2_6', 428 | '2_9', 429 | '2_8', 430 | '1_5', 431 | '1_12', 432 | '2_1', 433 | '2_13', 434 | '1_1', 435 | '1_4', 436 | '2_2', 437 | '2_14', 438 | '1_0', 439 | '2_0', 440 | '0_1' 441 | ] 442 | 443 | 444 | subject_meta_cols = [ 445 | 'num', 446 | 'max_level', 447 | 'sum_level', 448 | 'max_cnum', 449 | 'sum_cnum', 450 | ] 451 | 452 | subject_features = [f'subj_{f}' for f in subject_id_list] 453 | level_cnum_features = [f'subj_{f}' for f in level_cnum_list] 454 | subject_meta_features = [f'subj_{f}' for f in subject_meta_cols] 455 | 456 | user_lag_num_features = [ 457 | 'DateAnswered_dt_diff', 458 | 'DateAnswered_dt_diff_cumsum', 459 | 'DateAnswered_dt_diff_shift', 460 | 'DateAnswered_dt_diff_cumsum_shift', 461 | 'answer_num', 462 | 'answer_num_norm', 463 | 'quiz_answer_num', 464 | 'quiz_answer_num_norm', 465 | 'quiz_unique_num', 466 | 'subj_unique_num', 467 | 'group_unique_num', 468 | 'subjcat_unique_num', 469 | ] 470 | user_lag_cat_features = [ 471 | 'answer_num_div5', 472 | 'quiz_answer_num_div5', 473 | 'change_subjcat', 474 | 'answered_subjcat', 475 | 'prev_question', 476 | 'prev_subjcat', 477 | ] 478 | user_lag_multicat_features = [ 479 | 'prev10_question', 480 | 'prev10_subjcat', 481 | ] 482 | user_lag_features = user_lag_num_features + user_lag_cat_features + user_lag_multicat_features 483 | 484 | answer_date_features = [ 485 | 'DateAnswered_weekday', 486 | 'DateAnswered_hour', 487 | 'DateAnswered_day', 488 | 'DateAnswered_wom' 489 | ] 490 | count_encording_cat = [ 491 | 'QuestionId', 492 | 'UserId', 493 | 'Gender', 494 | 'PremiumPupil', 495 | 'Confidence', 496 | 'GroupId', 497 | 'QuizId', 498 | 'SchemeOfWorkId', 499 | 'age_years', 500 | 'DateAnswered_weekday', 501 | 'DateAnswered_hour', 502 | 'DateAnswered_day', 503 | 'DateAnswered_wom', 504 | 'answer_num_div5', 505 | 'quiz_answer_num_div5', 506 | 'change_subjcat', 507 | 'answered_subjcat', 508 | 'prev_question', 509 | 'prev_subjcat', 510 | 'SubjectId_cat', 511 | 'pri_to_high_stu', 512 | ['UserId', 'DateAnswered_weekday'], 513 | ['UserId', 'DateAnswered_hour'], 514 | ['UserId', 'DateAnswered_day'], 515 | ['UserId', 'DateAnswered_wom'], 516 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 517 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 518 | ['UserId', 'Confidence'], 519 | ['UserId', 'SchemeOfWorkId'], 520 | ['UserId', 'GroupId'], 521 | ['UserId', 'QuizId'], 522 | ['UserId', 'SubjectId_cat'], 523 | ['UserId', 'answer_num_div5'], 524 | ['UserId', 'quiz_answer_num_div5'], 525 | ['UserId', 'change_subjcat'], 526 | ['UserId', 'answered_subjcat'], 527 | ['UserId', 'age_years'], 528 | ['UserId', 'age_years', 'Confidence'], 529 | ['QuestionId', 'Confidence'], 530 | ['QuestionId', 'SchemeOfWorkId'], 531 | ['QuestionId', 'age_years'], 532 | ['QuestionId', 
'Gender'], 533 | ['QuestionId', 'answer_num_div5'], 534 | ['QuestionId', 'quiz_answer_num_div5'], 535 | ['QuestionId', 'change_subjcat'], 536 | ['QuestionId', 'answered_subjcat'], 537 | ['QuestionId', 'PremiumPupil'], 538 | ['QuestionId', 'Gender', 'PremiumPupil'], 539 | ['QuestionId', 'age_years', 'Gender'], 540 | ['QuestionId', 'age_years', 'PremiumPupil'], 541 | ['QuestionId', 'age_years', 'Gender', 'PremiumPupil'], 542 | ['QuestionId', 'Confidence', 'PremiumPupil'], 543 | ['QuestionId', 'Confidence', 'Gender', 'PremiumPupil'], 544 | ['QuestionId', 'Confidence', 'age_years', 'Gender'], 545 | ['QuestionId', 'Confidence', 'age_years', 'PremiumPupil'], 546 | ['QuestionId', 'Confidence', 'age_years', 'Gender', 'PremiumPupil'], 547 | ['QuestionId', 'prev_question'], 548 | ['QuestionId', 'DateOfBirth_NaN'], 549 | ['QuestionId', 'pri_to_high_stu'], 550 | ['SubjectId_cat', 'Confidence'], 551 | ['SubjectId_cat', 'SchemeOfWorkId'], 552 | ['SubjectId_cat', 'age_years'], 553 | ['SubjectId_cat', 'Gender'], 554 | ['SubjectId_cat', 'answer_num_div5'], 555 | ['SubjectId_cat', 'quiz_answer_num_div5'], 556 | ['SubjectId_cat', 'change_subjcat'], 557 | ['SubjectId_cat', 'answered_subjcat'], 558 | ['QuestionId', 'GroupId'], 559 | ['QuestionId', 'QuizId'], 560 | ['SchemeOfWorkId', 'Confidence'], 561 | ['SchemeOfWorkId', 'GroupId'], 562 | ['SchemeOfWorkId', 'QuizId'], 563 | ['SchemeOfWorkId', 'age_years'], 564 | ['SchemeOfWorkId', 'Gender'], 565 | ['SchemeOfWorkId', 'answer_num_div5'], 566 | ['SchemeOfWorkId', 'quiz_answer_num_div5'], 567 | ['SchemeOfWorkId', 'change_subjcat'], 568 | ['SchemeOfWorkId', 'answered_subjcat'], 569 | ['SchemeOfWorkId', 'PremiumPupil'], 570 | ['SchemeOfWorkId', 'Gender', 'PremiumPupil'], 571 | ['SchemeOfWorkId', 'age_years', 'Gender'], 572 | ['SchemeOfWorkId', 'age_years', 'PremiumPupil'], 573 | ['SchemeOfWorkId', 'age_years', 'Gender', 'PremiumPupil'], 574 | ] 575 | count_encording_features = [] 576 | for col in count_encording_cat: 577 | if not isinstance(col, list): 578 | col = [col] 579 | name = "_".join(col) 580 | count_encording_features.append(f'{name}_ce') 581 | 582 | te_smooth_factor = 5 583 | target_encording_cat = [ 584 | 'QuestionId', 585 | 'UserId', 586 | 'Gender', 587 | 'PremiumPupil', 588 | 'Confidence', 589 | 'GroupId', 590 | 'QuizId', 591 | 'SchemeOfWorkId', 592 | 'age_years', 593 | 'DateAnswered_weekday', 594 | 'DateAnswered_hour', 595 | 'DateAnswered_day', 596 | 'DateAnswered_wom', 597 | 'answer_num_div5', 598 | 'quiz_answer_num_div5', 599 | 'change_subjcat', 600 | 'answered_subjcat', 601 | 'prev_question', 602 | 'prev_subjcat', 603 | 'SubjectId_cat', 604 | 'DateOfBirth_NaN', 605 | 'pri_to_high_stu', 606 | ['DateAnswered_day', 'DateAnswered_hour'], 607 | ['DateAnswered_weekday', 'DateAnswered_hour'], 608 | ['DateAnswered_weekday', 'DateAnswered_wom'], 609 | ['UserId', 'DateAnswered_weekday'], 610 | ['UserId', 'DateAnswered_hour'], 611 | ['UserId', 'DateAnswered_day'], 612 | ['UserId', 'DateAnswered_wom'], 613 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 614 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 615 | ['UserId', 'Confidence'], 616 | ['UserId', 'SchemeOfWorkId'], 617 | ['UserId', 'GroupId'], 618 | ['UserId', 'QuizId'], 619 | ['UserId', 'SubjectId_cat'], 620 | ['UserId', 'answer_num_div5'], 621 | ['UserId', 'quiz_answer_num_div5'], 622 | ['UserId', 'change_subjcat'], 623 | ['UserId', 'answered_subjcat'], 624 | ['UserId', 'age_years'], 625 | ['UserId', 'age_years', 'Confidence'], 626 | ['QuestionId', 'Confidence'], 627 | 
['QuestionId', 'SchemeOfWorkId'], 628 | ['QuestionId', 'age_years'], 629 | ['QuestionId', 'Gender'], 630 | ['QuestionId', 'PremiumPupil'], 631 | ['QuestionId', 'Gender', 'PremiumPupil'], 632 | ['QuestionId', 'age_years', 'Gender'], 633 | ['QuestionId', 'age_years', 'PremiumPupil'], 634 | ['QuestionId', 'age_years', 'Gender', 'PremiumPupil'], 635 | ['QuestionId', 'Confidence', 'PremiumPupil'], 636 | ['QuestionId', 'Confidence', 'Gender', 'PremiumPupil'], 637 | ['QuestionId', 'Confidence', 'age_years', 'Gender'], 638 | ['QuestionId', 'Confidence', 'age_years', 'PremiumPupil'], 639 | ['QuestionId', 'Confidence', 'age_years', 'Gender', 'PremiumPupil'], 640 | ['QuestionId', 'answer_num_div5'], 641 | ['QuestionId', 'quiz_answer_num_div5'], 642 | ['QuestionId', 'GroupId'], 643 | ['QuestionId', 'QuizId'], 644 | ['QuestionId', 'change_subjcat'], 645 | ['QuestionId', 'answered_subjcat'], 646 | ['QuestionId', 'prev_question'], 647 | ['QuestionId', 'DateOfBirth_NaN'], 648 | ['QuestionId', 'pri_to_high_stu'], 649 | ['SubjectId_cat', 'Confidence'], 650 | ['SubjectId_cat', 'SchemeOfWorkId'], 651 | ['SubjectId_cat', 'age_years'], 652 | ['SubjectId_cat', 'Gender'], 653 | ['SubjectId_cat', 'answer_num_div5'], 654 | ['SubjectId_cat', 'quiz_answer_num_div5'], 655 | ['SubjectId_cat', 'change_subjcat'], 656 | ['SubjectId_cat', 'answered_subjcat'], 657 | ['SchemeOfWorkId', 'Confidence'], 658 | ['SchemeOfWorkId', 'GroupId'], 659 | ['SchemeOfWorkId', 'QuizId'], 660 | ['SchemeOfWorkId', 'age_years'], 661 | ['SchemeOfWorkId', 'Gender'], 662 | ['SchemeOfWorkId', 'answer_num_div5'], 663 | ['SchemeOfWorkId', 'quiz_answer_num_div5'], 664 | ['SchemeOfWorkId', 'change_subjcat'], 665 | ['SchemeOfWorkId', 'answered_subjcat'], 666 | ['SchemeOfWorkId', 'PremiumPupil'], 667 | ['SchemeOfWorkId', 'Gender', 'PremiumPupil'], 668 | ['SchemeOfWorkId', 'age_years', 'Gender'], 669 | ['SchemeOfWorkId', 'age_years', 'PremiumPupil'], 670 | ['SchemeOfWorkId', 'age_years', 'Gender', 'PremiumPupil'], 671 | ] 672 | target_encording_features = [] 673 | for tar in ['IsCorrect']: 674 | for col in target_encording_cat: 675 | if not isinstance(col, list): 676 | col = [col] 677 | name = "_".join(col) 678 | target_encording_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 679 | 680 | target_encording_ansval_features = [] 681 | for tar in ['AnswerValue_1', 'AnswerValue_2', 'AnswerValue_3', 'AnswerValue_4']: 682 | for col in target_encording_cat: 683 | if not isinstance(col, list): 684 | col = [col] 685 | name = "_".join(col) 686 | target_encording_ansval_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 687 | 688 | subj_conbi_cols = [ 689 | 'UserId', 690 | 'age_years', 691 | 'answered_subjcat', 692 | 'SchemeOfWorkId', 693 | 'Confidence', 694 | ] 695 | target_encording_subj_conbi_cat = [] 696 | for col in subject_features: 697 | for col2 in subj_conbi_cols: 698 | target_encording_subj_conbi_cat.append([col, col2]) 699 | 700 | target_encording_subj_conbi_features = [] 701 | for tar in ['IsCorrect']: 702 | for col in target_encording_subj_conbi_cat: 703 | if not isinstance(col, list): 704 | col = [col] 705 | name = "_".join(col) 706 | target_encording_subj_conbi_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 707 | 708 | target_encording_subj_agg_features = [] 709 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 710 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_agg_{agg_func}_IsCorrect') 711 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 712 | for conbi_col in subj_conbi_cols: 713 | 
target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_{conbi_col}_agg_{agg_func}_IsCorrect') 714 | 715 | svd_n_components = 5 716 | svd_features = [] 717 | svd_features += [f'ques_subj_svd_{i}' for i in range(svd_n_components)] 718 | svd_features += [f'user_subj_svd_{i}' for i in range(svd_n_components)] 719 | 720 | ################################################################ 721 | dense_features = [ 722 | 'age_days' 723 | ] 724 | dense_features += count_encording_features 725 | dense_features += target_encording_features 726 | dense_features += subject_meta_features 727 | dense_features += target_encording_ansval_features 728 | dense_features += user_lag_num_features 729 | dense_features += target_encording_subj_agg_features 730 | dense_features += svd_features 731 | 732 | sparse_features = [ 733 | # 'QuestionId', 734 | # 'UserId', 735 | 'Gender', 736 | 'PremiumPupil', 737 | 'Confidence', 738 | # 'GroupId' 739 | # 'QuizId', 740 | 'SchemeOfWorkId', 741 | 'age_years', 742 | 'SubjectId_cat', 743 | 'DateOfBirth_NaN', 744 | 'pri_to_high_stu', 745 | ] 746 | sparse_features += answer_date_features 747 | sparse_features += user_lag_cat_features 748 | 749 | varlen_sparse_features = [ 750 | # 'SubjectId', 751 | # 'SubjectId_level' 752 | ] 753 | # varlen_sparse_features = varlen_sparse_features + user_lag_multicat_features 754 | ################################################################ -------------------------------------------------------------------------------- /exp/task1_lgbm/config.py: -------------------------------------------------------------------------------- 1 | FOLD_NAME = 'mskf_user' 2 | FOLD_NUM = 5 3 | RANDOM_STATE = 46 4 | 5 | TARGET_TASK = '1' 6 | 7 | LGB_MDOEL_PARAMS = { 8 | "boosting_type": "gbdt", 9 | "learning_rate": 0.1, 10 | "max_depth": 10, 11 | "num_leaves": 256, 12 | "colsample_bytree": 0.8, 13 | "min_child_weight": 0, 14 | "random_state": RANDOM_STATE, 15 | "verbose": -1, 16 | "n_jobs": 20, 17 | } 18 | 19 | LGB_TRAIN_PARAMS = { 20 | "num_boost_round": 2000, 21 | "early_stopping_rounds": 50, 22 | "verbose_eval": 100, 23 | } 24 | 25 | if TARGET_TASK == '1': 26 | LGB_MDOEL_PARAMS['metric'] = ['auc', 'binary_logloss'] 27 | LGB_MDOEL_PARAMS['objective'] = 'binary' 28 | if TARGET_TASK == '2': 29 | LGB_MDOEL_PARAMS['metric'] = ['multi_logloss', 'multi_error'] 30 | LGB_MDOEL_PARAMS['objective'] = 'multiclass' 31 | LGB_MDOEL_PARAMS['num_class'] = 4 32 | 33 | subject_id_list = [ 34 | 3, 35 | 32, 36 | 33, 37 | 34, 38 | 35, 39 | 36, 40 | 37, 41 | 38, 42 | 39, 43 | 40, 44 | 41, 45 | 42, 46 | 44, 47 | 45, 48 | 46, 49 | 47, 50 | 48, 51 | 49, 52 | 50, 53 | 51, 54 | 52, 55 | 53, 56 | 54, 57 | 55, 58 | 56, 59 | 57, 60 | 58, 61 | 59, 62 | 60, 63 | 61, 64 | 62, 65 | 63, 66 | 64, 67 | 65, 68 | 66, 69 | 67, 70 | 68, 71 | 69, 72 | 70, 73 | 71, 74 | 72, 75 | 73, 76 | 74, 77 | 75, 78 | 76, 79 | 77, 80 | 78, 81 | 79, 82 | 80, 83 | 81, 84 | 83, 85 | 84, 86 | 85, 87 | 86, 88 | 87, 89 | 88, 90 | 89, 91 | 90, 92 | 91, 93 | 92, 94 | 93, 95 | 94, 96 | 95, 97 | 96, 98 | 97, 99 | 98, 100 | 99, 101 | 100, 102 | 101, 103 | 102, 104 | 103, 105 | 104, 106 | 105, 107 | 106, 108 | 107, 109 | 108, 110 | 109, 111 | 110, 112 | 111, 113 | 112, 114 | 113, 115 | 114, 116 | 115, 117 | 116, 118 | 117, 119 | 118, 120 | 119, 121 | 120, 122 | 126, 123 | 128, 124 | 129, 125 | 130, 126 | 131, 127 | 137, 128 | 139, 129 | 140, 130 | 141, 131 | 142, 132 | 144, 133 | 146, 134 | 149, 135 | 151, 136 | 152, 137 | 153, 138 | 154, 139 | 156, 140 | 157, 141 | 158, 142 | 159, 143 | 160, 144 | 163, 145 | 
164, 146 | 165, 147 | 166, 148 | 167, 149 | 168, 150 | 171, 151 | 172, 152 | 173, 153 | 174, 154 | 175, 155 | 176, 156 | 177, 157 | 178, 158 | 179, 159 | 180, 160 | 181, 161 | 182, 162 | 183, 163 | 184, 164 | 185, 165 | 186, 166 | 187, 167 | 188, 168 | 189, 169 | 190, 170 | 191, 171 | 192, 172 | 193, 173 | 195, 174 | 196, 175 | 197, 176 | 198, 177 | 199, 178 | 200, 179 | 202, 180 | 203, 181 | 204, 182 | 205, 183 | 206, 184 | 207, 185 | 208, 186 | 209, 187 | 210, 188 | 211, 189 | 212, 190 | 213, 191 | 214, 192 | 215, 193 | 216, 194 | 217, 195 | 218, 196 | 219, 197 | 220, 198 | 221, 199 | 222, 200 | 223, 201 | 224, 202 | 225, 203 | 226, 204 | 227, 205 | 228, 206 | 229, 207 | 230, 208 | 231, 209 | 232, 210 | 233, 211 | 234, 212 | 235, 213 | 236, 214 | 237, 215 | 238, 216 | 239, 217 | 240, 218 | 241, 219 | 242, 220 | 243, 221 | 244, 222 | 245, 223 | 246, 224 | 247, 225 | 248, 226 | 249, 227 | 250, 228 | 251, 229 | 252, 230 | 253, 231 | 254, 232 | 255, 233 | 256, 234 | 257, 235 | 258, 236 | 259, 237 | 260, 238 | 261, 239 | 262, 240 | 263, 241 | 264, 242 | 265, 243 | 266, 244 | 267, 245 | 268, 246 | 269, 247 | 270, 248 | 271, 249 | 272, 250 | 273, 251 | 274, 252 | 275, 253 | 276, 254 | 277, 255 | 278, 256 | 279, 257 | 280, 258 | 281, 259 | 282, 260 | 283, 261 | 284, 262 | 298, 263 | 313, 264 | 315, 265 | 317, 266 | 331, 267 | 332, 268 | 334, 269 | 335, 270 | 336, 271 | 337, 272 | 338, 273 | 339, 274 | 340, 275 | 341, 276 | 342, 277 | 343, 278 | 344, 279 | 348, 280 | 349, 281 | 350, 282 | 351, 283 | 352, 284 | 353, 285 | 354, 286 | 355, 287 | 361, 288 | 365, 289 | 366, 290 | 367, 291 | 369, 292 | 370, 293 | 371, 294 | 372, 295 | 374, 296 | 375, 297 | 376, 298 | 377, 299 | 388, 300 | 406, 301 | 407, 302 | 408, 303 | 409, 304 | 410, 305 | 411, 306 | 412, 307 | 416, 308 | 417, 309 | 418, 310 | 430, 311 | 431, 312 | 432, 313 | 434, 314 | 435, 315 | 436, 316 | 437, 317 | 439, 318 | 441, 319 | 442, 320 | 446, 321 | 447, 322 | 448, 323 | 451, 324 | 453, 325 | 462, 326 | 474, 327 | 480, 328 | 487, 329 | 539, 330 | 540, 331 | 649, 332 | 654, 333 | 655, 334 | 656, 335 | 657, 336 | 692, 337 | 698, 338 | 700, 339 | 1059, 340 | 1076, 341 | 1077, 342 | 1078, 343 | 1079, 344 | 1080, 345 | 1081, 346 | 1082, 347 | 1156, 348 | 1157, 349 | 1158, 350 | 1159, 351 | 1160, 352 | 1161, 353 | 1162, 354 | 1163, 355 | 1164, 356 | 1165, 357 | 1166, 358 | 1167, 359 | 1168, 360 | 1169, 361 | 1170, 362 | 1171, 363 | 1172, 364 | 1173, 365 | 1174, 366 | 1175, 367 | 1176, 368 | 1177, 369 | 1178, 370 | 1179, 371 | 1180, 372 | 1181, 373 | 1182, 374 | 1183, 375 | 1184, 376 | 1185, 377 | 1186, 378 | 1187, 379 | 1188, 380 | 1189, 381 | 1200, 382 | 1201, 383 | 1202, 384 | 1203, 385 | 1204, 386 | 1207, 387 | 1208, 388 | 1209, 389 | 1210, 390 | 1211, 391 | 1212, 392 | 1213, 393 | 1214, 394 | 1215, 395 | 1216, 396 | 1217, 397 | 1218, 398 | 1219, 399 | 1263, 400 | 1264, 401 | 1265, 402 | 1266, 403 | 1636, 404 | 1642, 405 | 1647, 406 | 1648, 407 | 1649, 408 | 1650, 409 | 1651, 410 | 1675, 411 | 1676, 412 | 1750, 413 | 1975, 414 | 1976, 415 | 1977, 416 | 1980, 417 | 1982, 418 | 1983, 419 | 1985, 420 | 1987, 421 | 1988 422 | ] 423 | 424 | level_cnum_list = [ 425 | '0_8', 426 | '1_16', 427 | '3_0', 428 | '2_3', 429 | '2_4', 430 | '2_5', 431 | '2_10', 432 | '2_7', 433 | '1_14', 434 | '2_11', 435 | '2_6', 436 | '2_9', 437 | '2_8', 438 | '1_5', 439 | '1_12', 440 | '2_1', 441 | '2_13', 442 | '1_1', 443 | '1_4', 444 | '2_2', 445 | '2_14', 446 | '1_0', 447 | '2_0', 448 | '0_1' 449 | ] 450 | 451 | 452 | subject_meta_cols = [ 453 | 'num', 454 | 
'max_level', 455 | 'sum_level', 456 | 'max_cnum', 457 | 'sum_cnum', 458 | ] 459 | 460 | subject_features = [f'subj_{f}' for f in subject_id_list] 461 | level_cnum_features = [f'subj_{f}' for f in level_cnum_list] 462 | subject_meta_features = [f'subj_{f}' for f in subject_meta_cols] 463 | 464 | user_lag_num_features = [ 465 | 'DateAnswered_dt_diff', 466 | 'DateAnswered_dt_diff_cumsum', 467 | 'DateAnswered_dt_diff_shift', 468 | 'DateAnswered_dt_diff_cumsum_shift', 469 | 'answer_num', 470 | 'answer_num_norm', 471 | 'quiz_answer_num', 472 | 'quiz_answer_num_norm', 473 | 'quiz_unique_num', 474 | 'subj_unique_num', 475 | 'group_unique_num', 476 | 'subjcat_unique_num', 477 | ] 478 | user_lag_cat_features = [ 479 | 'answer_num_div5', 480 | 'quiz_answer_num_div5', 481 | 'change_subjcat', 482 | 'answered_subjcat', 483 | 'prev_question', 484 | 'prev_subjcat', 485 | ] 486 | user_lag_multicat_features = [ 487 | 'prev10_question', 488 | 'prev10_subjcat', 489 | ] 490 | user_lag_features = user_lag_num_features + user_lag_cat_features + user_lag_multicat_features 491 | 492 | answer_date_features = [ 493 | 'DateAnswered_weekday', 494 | 'DateAnswered_hour', 495 | 'DateAnswered_day', 496 | 'DateAnswered_wom' 497 | ] 498 | count_encording_cat = [ 499 | 'QuestionId', 500 | 'UserId', 501 | 'Gender', 502 | 'PremiumPupil', 503 | 'Confidence', 504 | 'GroupId', 505 | 'QuizId', 506 | 'SchemeOfWorkId', 507 | 'age_years', 508 | 'DateAnswered_weekday', 509 | 'DateAnswered_hour', 510 | 'DateAnswered_day', 511 | 'DateAnswered_wom', 512 | 'answer_num_div5', 513 | 'quiz_answer_num_div5', 514 | 'change_subjcat', 515 | 'answered_subjcat', 516 | 'prev_question', 517 | 'prev_subjcat', 518 | 'SubjectId_cat', 519 | 'pri_to_high_stu', 520 | ['UserId', 'DateAnswered_weekday'], 521 | ['UserId', 'DateAnswered_hour'], 522 | ['UserId', 'DateAnswered_day'], 523 | ['UserId', 'DateAnswered_wom'], 524 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 525 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 526 | ['UserId', 'Confidence'], 527 | ['UserId', 'SchemeOfWorkId'], 528 | ['UserId', 'GroupId'], 529 | ['UserId', 'QuizId'], 530 | ['UserId', 'SubjectId_cat'], 531 | ['UserId', 'answer_num_div5'], 532 | ['UserId', 'quiz_answer_num_div5'], 533 | ['UserId', 'change_subjcat'], 534 | ['UserId', 'answered_subjcat'], 535 | ['UserId', 'age_years'], 536 | ['UserId', 'age_years', 'Confidence'], 537 | ['QuestionId', 'Confidence'], 538 | ['QuestionId', 'SchemeOfWorkId'], 539 | ['QuestionId', 'age_years'], 540 | ['QuestionId', 'Gender'], 541 | ['QuestionId', 'answer_num_div5'], 542 | ['QuestionId', 'quiz_answer_num_div5'], 543 | ['QuestionId', 'change_subjcat'], 544 | ['QuestionId', 'answered_subjcat'], 545 | ['QuestionId', 'PremiumPupil'], 546 | ['QuestionId', 'Gender', 'PremiumPupil'], 547 | ['QuestionId', 'age_years', 'Gender'], 548 | ['QuestionId', 'age_years', 'PremiumPupil'], 549 | ['QuestionId', 'age_years', 'Gender', 'PremiumPupil'], 550 | ['QuestionId', 'Confidence', 'PremiumPupil'], 551 | ['QuestionId', 'Confidence', 'Gender', 'PremiumPupil'], 552 | ['QuestionId', 'Confidence', 'age_years', 'Gender'], 553 | ['QuestionId', 'Confidence', 'age_years', 'PremiumPupil'], 554 | ['QuestionId', 'Confidence', 'age_years', 'Gender', 'PremiumPupil'], 555 | ['QuestionId', 'prev_question'], 556 | ['QuestionId', 'DateOfBirth_NaN'], 557 | ['QuestionId', 'pri_to_high_stu'], 558 | ['SubjectId_cat', 'Confidence'], 559 | ['SubjectId_cat', 'SchemeOfWorkId'], 560 | ['SubjectId_cat', 'age_years'], 561 | ['SubjectId_cat', 'Gender'], 562 | 
['SubjectId_cat', 'answer_num_div5'], 563 | ['SubjectId_cat', 'quiz_answer_num_div5'], 564 | ['SubjectId_cat', 'change_subjcat'], 565 | ['SubjectId_cat', 'answered_subjcat'], 566 | ['QuestionId', 'GroupId'], 567 | ['QuestionId', 'QuizId'], 568 | ['SchemeOfWorkId', 'Confidence'], 569 | ['SchemeOfWorkId', 'GroupId'], 570 | ['SchemeOfWorkId', 'QuizId'], 571 | ['SchemeOfWorkId', 'age_years'], 572 | ['SchemeOfWorkId', 'Gender'], 573 | ['SchemeOfWorkId', 'answer_num_div5'], 574 | ['SchemeOfWorkId', 'quiz_answer_num_div5'], 575 | ['SchemeOfWorkId', 'change_subjcat'], 576 | ['SchemeOfWorkId', 'answered_subjcat'], 577 | ['SchemeOfWorkId', 'PremiumPupil'], 578 | ['SchemeOfWorkId', 'Gender', 'PremiumPupil'], 579 | ['SchemeOfWorkId', 'age_years', 'Gender'], 580 | ['SchemeOfWorkId', 'age_years', 'PremiumPupil'], 581 | ['SchemeOfWorkId', 'age_years', 'Gender', 'PremiumPupil'], 582 | ] 583 | count_encording_features = [] 584 | for col in count_encording_cat: 585 | if not isinstance(col, list): 586 | col = [col] 587 | name = "_".join(col) 588 | count_encording_features.append(f'{name}_ce') 589 | 590 | te_smooth_factor = 5 591 | # te_smooth_factor = 2 592 | target_encording_cat = [ 593 | 'QuestionId', 594 | 'UserId', 595 | 'Gender', 596 | 'PremiumPupil', 597 | 'Confidence', 598 | 'GroupId', 599 | 'QuizId', 600 | 'SchemeOfWorkId', 601 | 'age_years', 602 | 'DateAnswered_weekday', 603 | 'DateAnswered_hour', 604 | 'DateAnswered_day', 605 | 'DateAnswered_wom', 606 | 'answer_num_div5', 607 | 'quiz_answer_num_div5', 608 | 'change_subjcat', 609 | 'answered_subjcat', 610 | 'prev_question', 611 | 'prev_subjcat', 612 | 'SubjectId_cat', 613 | 'DateOfBirth_NaN', 614 | 'pri_to_high_stu', 615 | ['DateAnswered_day', 'DateAnswered_hour'], 616 | ['DateAnswered_weekday', 'DateAnswered_hour'], 617 | ['DateAnswered_weekday', 'DateAnswered_wom'], 618 | ['UserId', 'DateAnswered_weekday'], 619 | ['UserId', 'DateAnswered_hour'], 620 | ['UserId', 'DateAnswered_day'], 621 | ['UserId', 'DateAnswered_wom'], 622 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 623 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 624 | ['UserId', 'Confidence'], 625 | ['UserId', 'SchemeOfWorkId'], 626 | ['UserId', 'GroupId'], 627 | ['UserId', 'QuizId'], 628 | ['UserId', 'SubjectId_cat'], 629 | ['UserId', 'answer_num_div5'], 630 | ['UserId', 'quiz_answer_num_div5'], 631 | ['UserId', 'change_subjcat'], 632 | ['UserId', 'answered_subjcat'], 633 | ['UserId', 'age_years'], 634 | ['UserId', 'age_years', 'Confidence'], 635 | ['QuestionId', 'Confidence'], 636 | ['QuestionId', 'SchemeOfWorkId'], 637 | ['QuestionId', 'age_years'], 638 | ['QuestionId', 'Gender'], 639 | ['QuestionId', 'PremiumPupil'], 640 | ['QuestionId', 'Gender', 'PremiumPupil'], 641 | ['QuestionId', 'age_years', 'Gender'], 642 | ['QuestionId', 'age_years', 'PremiumPupil'], 643 | ['QuestionId', 'age_years', 'Gender', 'PremiumPupil'], 644 | ['QuestionId', 'Confidence', 'PremiumPupil'], 645 | ['QuestionId', 'Confidence', 'Gender', 'PremiumPupil'], 646 | ['QuestionId', 'Confidence', 'age_years', 'Gender'], 647 | ['QuestionId', 'Confidence', 'age_years', 'PremiumPupil'], 648 | ['QuestionId', 'Confidence', 'age_years', 'Gender', 'PremiumPupil'], 649 | ['QuestionId', 'answer_num_div5'], 650 | ['QuestionId', 'quiz_answer_num_div5'], 651 | ['QuestionId', 'GroupId'], 652 | ['QuestionId', 'QuizId'], 653 | ['QuestionId', 'change_subjcat'], 654 | ['QuestionId', 'answered_subjcat'], 655 | ['QuestionId', 'prev_question'], 656 | ['QuestionId', 'DateOfBirth_NaN'], 657 | ['QuestionId', 
'pri_to_high_stu'], 658 | ['SubjectId_cat', 'Confidence'], 659 | ['SubjectId_cat', 'SchemeOfWorkId'], 660 | ['SubjectId_cat', 'age_years'], 661 | ['SubjectId_cat', 'Gender'], 662 | ['SubjectId_cat', 'answer_num_div5'], 663 | ['SubjectId_cat', 'quiz_answer_num_div5'], 664 | ['SubjectId_cat', 'change_subjcat'], 665 | ['SubjectId_cat', 'answered_subjcat'], 666 | ['SchemeOfWorkId', 'Confidence'], 667 | ['SchemeOfWorkId', 'GroupId'], 668 | ['SchemeOfWorkId', 'QuizId'], 669 | ['SchemeOfWorkId', 'age_years'], 670 | ['SchemeOfWorkId', 'Gender'], 671 | ['SchemeOfWorkId', 'answer_num_div5'], 672 | ['SchemeOfWorkId', 'quiz_answer_num_div5'], 673 | ['SchemeOfWorkId', 'change_subjcat'], 674 | ['SchemeOfWorkId', 'answered_subjcat'], 675 | ['SchemeOfWorkId', 'PremiumPupil'], 676 | ['SchemeOfWorkId', 'Gender', 'PremiumPupil'], 677 | ['SchemeOfWorkId', 'age_years', 'Gender'], 678 | ['SchemeOfWorkId', 'age_years', 'PremiumPupil'], 679 | ['SchemeOfWorkId', 'age_years', 'Gender', 'PremiumPupil'], 680 | ] 681 | target_encording_features = [] 682 | for tar in ['IsCorrect']: 683 | for col in target_encording_cat: 684 | if not isinstance(col, list): 685 | col = [col] 686 | name = "_".join(col) 687 | target_encording_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 688 | 689 | target_encording_ansval_features = [] 690 | for tar in ['AnswerValue_1', 'AnswerValue_2', 'AnswerValue_3', 'AnswerValue_4']: 691 | for col in target_encording_cat: 692 | if not isinstance(col, list): 693 | col = [col] 694 | name = "_".join(col) 695 | target_encording_ansval_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 696 | 697 | te_smooth_factor = 5 698 | subj_conbi_cols = [ 699 | 'UserId', 700 | 'age_years', 701 | 'answered_subjcat', 702 | 'SchemeOfWorkId', 703 | 'Confidence', 704 | ] 705 | target_encording_subj_conbi_cat = [] 706 | for col in subject_features: 707 | for col2 in subj_conbi_cols: 708 | target_encording_subj_conbi_cat.append([col, col2]) 709 | 710 | target_encording_subj_conbi_features = [] 711 | for tar in ['IsCorrect']: 712 | for col in target_encording_subj_conbi_cat: 713 | if not isinstance(col, list): 714 | col = [col] 715 | name = "_".join(col) 716 | target_encording_subj_conbi_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 717 | 718 | target_encording_subj_agg_features = [] 719 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 720 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_agg_{agg_func}_IsCorrect') 721 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 722 | for conbi_col in subj_conbi_cols: 723 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_{conbi_col}_agg_{agg_func}_IsCorrect') 724 | 725 | svd_n_components = 5 726 | svd_features = [] 727 | svd_features += [f'ques_subj_svd_{i}' for i in range(svd_n_components)] 728 | svd_features += [f'user_subj_svd_{i}' for i in range(svd_n_components)] 729 | 730 | ################################################################ 731 | dense_features = [ 732 | 'age_days' 733 | ] 734 | dense_features += count_encording_features 735 | dense_features += target_encording_features 736 | dense_features += subject_meta_features 737 | dense_features += target_encording_ansval_features 738 | dense_features += user_lag_num_features 739 | dense_features += target_encording_subj_agg_features 740 | dense_features += svd_features 741 | 742 | sparse_features = [ 743 | # 'QuestionId', 744 | # 'UserId', 745 | 'Gender', 746 | 'PremiumPupil', 747 | 'Confidence', 748 | # 'GroupId' 749 | # 'QuizId', 750 | 
'SchemeOfWorkId', 751 | 'age_years', 752 | 'SubjectId_cat', 753 | 'DateOfBirth_NaN', 754 | 'pri_to_high_stu', 755 | ] 756 | sparse_features += answer_date_features 757 | sparse_features += user_lag_cat_features 758 | 759 | 760 | varlen_sparse_features = [ 761 | # 'SubjectId', 762 | # 'SubjectId_level' 763 | ] 764 | # varlen_sparse_features = varlen_sparse_features + user_lag_multicat_features 765 | ################################################################ -------------------------------------------------------------------------------- /exp/task1_lgbm_2/config.py: -------------------------------------------------------------------------------- 1 | FOLD_NAME = 'mskf_user' 2 | FOLD_NUM = 5 3 | RANDOM_STATE = 46 4 | 5 | TARGET_TASK = '1' 6 | 7 | LGB_MDOEL_PARAMS = { 8 | "boosting_type": "gbdt", 9 | "learning_rate": 0.1, 10 | "max_depth": 10, 11 | "num_leaves": 256, 12 | "colsample_bytree": 0.8, 13 | "min_child_weight": 0, 14 | "random_state": RANDOM_STATE, 15 | "verbose": -1, 16 | "n_jobs": 20, 17 | } 18 | 19 | LGB_TRAIN_PARAMS = { 20 | "num_boost_round": 2000, 21 | "early_stopping_rounds": 50, 22 | "verbose_eval": 100, 23 | } 24 | 25 | if TARGET_TASK == '1': 26 | LGB_MDOEL_PARAMS['metric'] = ['auc', 'binary_logloss'] 27 | LGB_MDOEL_PARAMS['objective'] = 'binary' 28 | if TARGET_TASK == '2': 29 | LGB_MDOEL_PARAMS['metric'] = ['multi_logloss', 'multi_error'] 30 | LGB_MDOEL_PARAMS['objective'] = 'multiclass' 31 | LGB_MDOEL_PARAMS['num_class'] = 4 32 | 33 | subject_id_list = [ 34 | 3, 35 | 32, 36 | 33, 37 | 34, 38 | 35, 39 | 36, 40 | 37, 41 | 38, 42 | 39, 43 | 40, 44 | 41, 45 | 42, 46 | 44, 47 | 45, 48 | 46, 49 | 47, 50 | 48, 51 | 49, 52 | 50, 53 | 51, 54 | 52, 55 | 53, 56 | 54, 57 | 55, 58 | 56, 59 | 57, 60 | 58, 61 | 59, 62 | 60, 63 | 61, 64 | 62, 65 | 63, 66 | 64, 67 | 65, 68 | 66, 69 | 67, 70 | 68, 71 | 69, 72 | 70, 73 | 71, 74 | 72, 75 | 73, 76 | 74, 77 | 75, 78 | 76, 79 | 77, 80 | 78, 81 | 79, 82 | 80, 83 | 81, 84 | 83, 85 | 84, 86 | 85, 87 | 86, 88 | 87, 89 | 88, 90 | 89, 91 | 90, 92 | 91, 93 | 92, 94 | 93, 95 | 94, 96 | 95, 97 | 96, 98 | 97, 99 | 98, 100 | 99, 101 | 100, 102 | 101, 103 | 102, 104 | 103, 105 | 104, 106 | 105, 107 | 106, 108 | 107, 109 | 108, 110 | 109, 111 | 110, 112 | 111, 113 | 112, 114 | 113, 115 | 114, 116 | 115, 117 | 116, 118 | 117, 119 | 118, 120 | 119, 121 | 120, 122 | 126, 123 | 128, 124 | 129, 125 | 130, 126 | 131, 127 | 137, 128 | 139, 129 | 140, 130 | 141, 131 | 142, 132 | 144, 133 | 146, 134 | 149, 135 | 151, 136 | 152, 137 | 153, 138 | 154, 139 | 156, 140 | 157, 141 | 158, 142 | 159, 143 | 160, 144 | 163, 145 | 164, 146 | 165, 147 | 166, 148 | 167, 149 | 168, 150 | 171, 151 | 172, 152 | 173, 153 | 174, 154 | 175, 155 | 176, 156 | 177, 157 | 178, 158 | 179, 159 | 180, 160 | 181, 161 | 182, 162 | 183, 163 | 184, 164 | 185, 165 | 186, 166 | 187, 167 | 188, 168 | 189, 169 | 190, 170 | 191, 171 | 192, 172 | 193, 173 | 195, 174 | 196, 175 | 197, 176 | 198, 177 | 199, 178 | 200, 179 | 202, 180 | 203, 181 | 204, 182 | 205, 183 | 206, 184 | 207, 185 | 208, 186 | 209, 187 | 210, 188 | 211, 189 | 212, 190 | 213, 191 | 214, 192 | 215, 193 | 216, 194 | 217, 195 | 218, 196 | 219, 197 | 220, 198 | 221, 199 | 222, 200 | 223, 201 | 224, 202 | 225, 203 | 226, 204 | 227, 205 | 228, 206 | 229, 207 | 230, 208 | 231, 209 | 232, 210 | 233, 211 | 234, 212 | 235, 213 | 236, 214 | 237, 215 | 238, 216 | 239, 217 | 240, 218 | 241, 219 | 242, 220 | 243, 221 | 244, 222 | 245, 223 | 246, 224 | 247, 225 | 248, 226 | 249, 227 | 250, 228 | 251, 229 | 252, 230 | 253, 231 | 254, 232 | 
255, 233 | 256, 234 | 257, 235 | 258, 236 | 259, 237 | 260, 238 | 261, 239 | 262, 240 | 263, 241 | 264, 242 | 265, 243 | 266, 244 | 267, 245 | 268, 246 | 269, 247 | 270, 248 | 271, 249 | 272, 250 | 273, 251 | 274, 252 | 275, 253 | 276, 254 | 277, 255 | 278, 256 | 279, 257 | 280, 258 | 281, 259 | 282, 260 | 283, 261 | 284, 262 | 298, 263 | 313, 264 | 315, 265 | 317, 266 | 331, 267 | 332, 268 | 334, 269 | 335, 270 | 336, 271 | 337, 272 | 338, 273 | 339, 274 | 340, 275 | 341, 276 | 342, 277 | 343, 278 | 344, 279 | 348, 280 | 349, 281 | 350, 282 | 351, 283 | 352, 284 | 353, 285 | 354, 286 | 355, 287 | 361, 288 | 365, 289 | 366, 290 | 367, 291 | 369, 292 | 370, 293 | 371, 294 | 372, 295 | 374, 296 | 375, 297 | 376, 298 | 377, 299 | 388, 300 | 406, 301 | 407, 302 | 408, 303 | 409, 304 | 410, 305 | 411, 306 | 412, 307 | 416, 308 | 417, 309 | 418, 310 | 430, 311 | 431, 312 | 432, 313 | 434, 314 | 435, 315 | 436, 316 | 437, 317 | 439, 318 | 441, 319 | 442, 320 | 446, 321 | 447, 322 | 448, 323 | 451, 324 | 453, 325 | 462, 326 | 474, 327 | 480, 328 | 487, 329 | 539, 330 | 540, 331 | 649, 332 | 654, 333 | 655, 334 | 656, 335 | 657, 336 | 692, 337 | 698, 338 | 700, 339 | 1059, 340 | 1076, 341 | 1077, 342 | 1078, 343 | 1079, 344 | 1080, 345 | 1081, 346 | 1082, 347 | 1156, 348 | 1157, 349 | 1158, 350 | 1159, 351 | 1160, 352 | 1161, 353 | 1162, 354 | 1163, 355 | 1164, 356 | 1165, 357 | 1166, 358 | 1167, 359 | 1168, 360 | 1169, 361 | 1170, 362 | 1171, 363 | 1172, 364 | 1173, 365 | 1174, 366 | 1175, 367 | 1176, 368 | 1177, 369 | 1178, 370 | 1179, 371 | 1180, 372 | 1181, 373 | 1182, 374 | 1183, 375 | 1184, 376 | 1185, 377 | 1186, 378 | 1187, 379 | 1188, 380 | 1189, 381 | 1200, 382 | 1201, 383 | 1202, 384 | 1203, 385 | 1204, 386 | 1207, 387 | 1208, 388 | 1209, 389 | 1210, 390 | 1211, 391 | 1212, 392 | 1213, 393 | 1214, 394 | 1215, 395 | 1216, 396 | 1217, 397 | 1218, 398 | 1219, 399 | 1263, 400 | 1264, 401 | 1265, 402 | 1266, 403 | 1636, 404 | 1642, 405 | 1647, 406 | 1648, 407 | 1649, 408 | 1650, 409 | 1651, 410 | 1675, 411 | 1676, 412 | 1750, 413 | 1975, 414 | 1976, 415 | 1977, 416 | 1980, 417 | 1982, 418 | 1983, 419 | 1985, 420 | 1987, 421 | 1988 422 | ] 423 | 424 | level_cnum_list = [ 425 | '0_8', 426 | '1_16', 427 | '3_0', 428 | '2_3', 429 | '2_4', 430 | '2_5', 431 | '2_10', 432 | '2_7', 433 | '1_14', 434 | '2_11', 435 | '2_6', 436 | '2_9', 437 | '2_8', 438 | '1_5', 439 | '1_12', 440 | '2_1', 441 | '2_13', 442 | '1_1', 443 | '1_4', 444 | '2_2', 445 | '2_14', 446 | '1_0', 447 | '2_0', 448 | '0_1' 449 | ] 450 | 451 | 452 | subject_meta_cols = [ 453 | 'num', 454 | 'max_level', 455 | 'sum_level', 456 | 'max_cnum', 457 | 'sum_cnum', 458 | ] 459 | 460 | subject_features = [f'subj_{f}' for f in subject_id_list] 461 | level_cnum_features = [f'subj_{f}' for f in level_cnum_list] 462 | subject_meta_features = [f'subj_{f}' for f in subject_meta_cols] 463 | 464 | user_lag_num_features = [ 465 | 'DateAnswered_dt_diff', 466 | 'DateAnswered_dt_diff_cumsum', 467 | 'DateAnswered_dt_diff_shift', 468 | 'DateAnswered_dt_diff_cumsum_shift', 469 | 'answer_num', 470 | 'answer_num_norm', 471 | 'quiz_answer_num', 472 | 'quiz_answer_num_norm', 473 | 'quiz_unique_num', 474 | 'subj_unique_num', 475 | 'group_unique_num', 476 | 'subjcat_unique_num', 477 | ] 478 | user_lag_cat_features = [ 479 | 'answer_num_div5', 480 | 'quiz_answer_num_div5', 481 | 'change_subjcat', 482 | 'answered_subjcat', 483 | 'prev_question', 484 | 'prev_subjcat', 485 | ] 486 | user_lag_multicat_features = [ 487 | 'prev10_question', 488 | 'prev10_subjcat', 489 | ] 
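# ----------------------------------------------------------------
# Editor's sketch (not part of the original config): the user_lag_*
# lists above only *name* the features; their actual derivation
# lives in src/utils_feature.py / src/feature_extraction_preds_meta.py,
# which are not shown here. A minimal, hypothetical illustration of
# how per-user lag features like these are typically built with
# pandas groupby/shift -- the column names UserId, QuestionId and
# DateAnswered follow the config, everything else is assumed:
#
#     import pandas as pd
#
#     def add_user_lag_features(df: pd.DataFrame) -> pd.DataFrame:
#         df = df.sort_values(['UserId', 'DateAnswered'])
#         g = df.groupby('UserId')
#         # running answer count per user, plus the /5 bucket
#         df['answer_num'] = g.cumcount()
#         df['answer_num_div5'] = df['answer_num'] // 5
#         # gap (in seconds) since the user's previous answer
#         dt = pd.to_datetime(df['DateAnswered'])
#         df['DateAnswered_dt_diff'] = (
#             dt.groupby(df['UserId']).diff().dt.total_seconds()
#         )
#         df['DateAnswered_dt_diff_cumsum'] = (
#             df.groupby('UserId')['DateAnswered_dt_diff'].cumsum()
#         )
#         # the previous question the same user answered
#         df['prev_question'] = g['QuestionId'].shift(1)
#         return df
# ----------------------------------------------------------------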
490 | user_lag_features = user_lag_num_features + user_lag_cat_features + user_lag_multicat_features 491 | 492 | answer_date_features = [ 493 | 'DateAnswered_weekday', 494 | 'DateAnswered_hour', 495 | 'DateAnswered_day', 496 | 'DateAnswered_wom' 497 | ] 498 | count_encording_cat = [ 499 | 'QuestionId', 500 | 'UserId', 501 | 'Gender', 502 | 'PremiumPupil', 503 | 'Confidence', 504 | 'GroupId', 505 | 'QuizId', 506 | 'SchemeOfWorkId', 507 | 'age_years', 508 | 'DateAnswered_weekday', 509 | 'DateAnswered_hour', 510 | 'DateAnswered_day', 511 | 'DateAnswered_wom', 512 | 'answer_num_div5', 513 | 'quiz_answer_num_div5', 514 | 'change_subjcat', 515 | 'answered_subjcat', 516 | 'prev_question', 517 | 'prev_subjcat', 518 | 'SubjectId_cat', 519 | 'pri_to_high_stu', 520 | ['UserId', 'DateAnswered_weekday'], 521 | ['UserId', 'DateAnswered_hour'], 522 | ['UserId', 'DateAnswered_day'], 523 | ['UserId', 'DateAnswered_wom'], 524 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 525 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 526 | ['UserId', 'Confidence'], 527 | ['UserId', 'SchemeOfWorkId'], 528 | ['UserId', 'GroupId'], 529 | ['UserId', 'QuizId'], 530 | ['UserId', 'SubjectId_cat'], 531 | ['UserId', 'answer_num_div5'], 532 | ['UserId', 'quiz_answer_num_div5'], 533 | ['UserId', 'change_subjcat'], 534 | ['UserId', 'answered_subjcat'], 535 | ['UserId', 'age_years'], 536 | ['UserId', 'age_years', 'Confidence'], 537 | ['QuestionId', 'Confidence'], 538 | ['QuestionId', 'SchemeOfWorkId'], 539 | ['QuestionId', 'age_years'], 540 | ['QuestionId', 'Gender'], 541 | ['QuestionId', 'answer_num_div5'], 542 | ['QuestionId', 'quiz_answer_num_div5'], 543 | ['QuestionId', 'change_subjcat'], 544 | ['QuestionId', 'answered_subjcat'], 545 | ['QuestionId', 'PremiumPupil'], 546 | ['QuestionId', 'Gender', 'PremiumPupil'], 547 | ['QuestionId', 'age_years', 'Gender'], 548 | ['QuestionId', 'age_years', 'PremiumPupil'], 549 | ['QuestionId', 'age_years', 'Gender', 'PremiumPupil'], 550 | ['QuestionId', 'Confidence', 'PremiumPupil'], 551 | ['QuestionId', 'Confidence', 'Gender', 'PremiumPupil'], 552 | ['QuestionId', 'Confidence', 'age_years', 'Gender'], 553 | ['QuestionId', 'Confidence', 'age_years', 'PremiumPupil'], 554 | ['QuestionId', 'Confidence', 'age_years', 'Gender', 'PremiumPupil'], 555 | ['QuestionId', 'prev_question'], 556 | ['QuestionId', 'DateOfBirth_NaN'], 557 | ['QuestionId', 'pri_to_high_stu'], 558 | ['SubjectId_cat', 'Confidence'], 559 | ['SubjectId_cat', 'SchemeOfWorkId'], 560 | ['SubjectId_cat', 'age_years'], 561 | ['SubjectId_cat', 'Gender'], 562 | ['SubjectId_cat', 'answer_num_div5'], 563 | ['SubjectId_cat', 'quiz_answer_num_div5'], 564 | ['SubjectId_cat', 'change_subjcat'], 565 | ['SubjectId_cat', 'answered_subjcat'], 566 | ['QuestionId', 'GroupId'], 567 | ['QuestionId', 'QuizId'], 568 | ['SchemeOfWorkId', 'Confidence'], 569 | ['SchemeOfWorkId', 'GroupId'], 570 | ['SchemeOfWorkId', 'QuizId'], 571 | ['SchemeOfWorkId', 'age_years'], 572 | ['SchemeOfWorkId', 'Gender'], 573 | ['SchemeOfWorkId', 'answer_num_div5'], 574 | ['SchemeOfWorkId', 'quiz_answer_num_div5'], 575 | ['SchemeOfWorkId', 'change_subjcat'], 576 | ['SchemeOfWorkId', 'answered_subjcat'], 577 | ['SchemeOfWorkId', 'PremiumPupil'], 578 | ['SchemeOfWorkId', 'Gender', 'PremiumPupil'], 579 | ['SchemeOfWorkId', 'age_years', 'Gender'], 580 | ['SchemeOfWorkId', 'age_years', 'PremiumPupil'], 581 | ['SchemeOfWorkId', 'age_years', 'Gender', 'PremiumPupil'], 582 | ] 583 | count_encording_features = [] 584 | for col in count_encording_cat: 585 | if not 
isinstance(col, list): 586 | col = [col] 587 | name = "_".join(col) 588 | count_encording_features.append(f'{name}_ce') 589 | 590 | # te_smooth_factor = 5 591 | te_smooth_factor = 2 592 | target_encording_cat = [ 593 | 'QuestionId', 594 | 'UserId', 595 | 'Gender', 596 | 'PremiumPupil', 597 | 'Confidence', 598 | 'GroupId', 599 | 'QuizId', 600 | 'SchemeOfWorkId', 601 | 'age_years', 602 | 'DateAnswered_weekday', 603 | 'DateAnswered_hour', 604 | 'DateAnswered_day', 605 | 'DateAnswered_wom', 606 | 'answer_num_div5', 607 | 'quiz_answer_num_div5', 608 | 'change_subjcat', 609 | 'answered_subjcat', 610 | 'prev_question', 611 | 'prev_subjcat', 612 | 'SubjectId_cat', 613 | 'DateOfBirth_NaN', 614 | 'pri_to_high_stu', 615 | ['DateAnswered_day', 'DateAnswered_hour'], 616 | ['DateAnswered_weekday', 'DateAnswered_hour'], 617 | ['DateAnswered_weekday', 'DateAnswered_wom'], 618 | ['UserId', 'DateAnswered_weekday'], 619 | ['UserId', 'DateAnswered_hour'], 620 | ['UserId', 'DateAnswered_day'], 621 | ['UserId', 'DateAnswered_wom'], 622 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 623 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 624 | ['UserId', 'Confidence'], 625 | ['UserId', 'SchemeOfWorkId'], 626 | ['UserId', 'GroupId'], 627 | ['UserId', 'QuizId'], 628 | ['UserId', 'SubjectId_cat'], 629 | ['UserId', 'answer_num_div5'], 630 | ['UserId', 'quiz_answer_num_div5'], 631 | ['UserId', 'change_subjcat'], 632 | ['UserId', 'answered_subjcat'], 633 | ['UserId', 'age_years'], 634 | ['UserId', 'age_years', 'Confidence'], 635 | ['QuestionId', 'Confidence'], 636 | ['QuestionId', 'SchemeOfWorkId'], 637 | ['QuestionId', 'age_years'], 638 | ['QuestionId', 'Gender'], 639 | ['QuestionId', 'PremiumPupil'], 640 | ['QuestionId', 'Gender', 'PremiumPupil'], 641 | ['QuestionId', 'age_years', 'Gender'], 642 | ['QuestionId', 'age_years', 'PremiumPupil'], 643 | ['QuestionId', 'age_years', 'Gender', 'PremiumPupil'], 644 | ['QuestionId', 'Confidence', 'PremiumPupil'], 645 | ['QuestionId', 'Confidence', 'Gender', 'PremiumPupil'], 646 | ['QuestionId', 'Confidence', 'age_years', 'Gender'], 647 | ['QuestionId', 'Confidence', 'age_years', 'PremiumPupil'], 648 | ['QuestionId', 'Confidence', 'age_years', 'Gender', 'PremiumPupil'], 649 | ['QuestionId', 'answer_num_div5'], 650 | ['QuestionId', 'quiz_answer_num_div5'], 651 | ['QuestionId', 'GroupId'], 652 | ['QuestionId', 'QuizId'], 653 | ['QuestionId', 'change_subjcat'], 654 | ['QuestionId', 'answered_subjcat'], 655 | ['QuestionId', 'prev_question'], 656 | ['QuestionId', 'DateOfBirth_NaN'], 657 | ['QuestionId', 'pri_to_high_stu'], 658 | ['SubjectId_cat', 'Confidence'], 659 | ['SubjectId_cat', 'SchemeOfWorkId'], 660 | ['SubjectId_cat', 'age_years'], 661 | ['SubjectId_cat', 'Gender'], 662 | ['SubjectId_cat', 'answer_num_div5'], 663 | ['SubjectId_cat', 'quiz_answer_num_div5'], 664 | ['SubjectId_cat', 'change_subjcat'], 665 | ['SubjectId_cat', 'answered_subjcat'], 666 | ['SchemeOfWorkId', 'Confidence'], 667 | ['SchemeOfWorkId', 'GroupId'], 668 | ['SchemeOfWorkId', 'QuizId'], 669 | ['SchemeOfWorkId', 'age_years'], 670 | ['SchemeOfWorkId', 'Gender'], 671 | ['SchemeOfWorkId', 'answer_num_div5'], 672 | ['SchemeOfWorkId', 'quiz_answer_num_div5'], 673 | ['SchemeOfWorkId', 'change_subjcat'], 674 | ['SchemeOfWorkId', 'answered_subjcat'], 675 | ['SchemeOfWorkId', 'PremiumPupil'], 676 | ['SchemeOfWorkId', 'Gender', 'PremiumPupil'], 677 | ['SchemeOfWorkId', 'age_years', 'Gender'], 678 | ['SchemeOfWorkId', 'age_years', 'PremiumPupil'], 679 | ['SchemeOfWorkId', 'age_years', 'Gender', 
'PremiumPupil'], 680 | ] 681 | target_encording_features = [] 682 | for tar in ['IsCorrect']: 683 | for col in target_encording_cat: 684 | if not isinstance(col, list): 685 | col = [col] 686 | name = "_".join(col) 687 | target_encording_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 688 | 689 | target_encording_ansval_features = [] 690 | for tar in ['AnswerValue_1', 'AnswerValue_2', 'AnswerValue_3', 'AnswerValue_4']: 691 | for col in target_encording_cat: 692 | if not isinstance(col, list): 693 | col = [col] 694 | name = "_".join(col) 695 | target_encording_ansval_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 696 | 697 | te_smooth_factor = 5 698 | subj_conbi_cols = [ 699 | 'UserId', 700 | 'age_years', 701 | 'answered_subjcat', 702 | 'SchemeOfWorkId', 703 | 'Confidence', 704 | ] 705 | target_encording_subj_conbi_cat = [] 706 | for col in subject_features: 707 | for col2 in subj_conbi_cols: 708 | target_encording_subj_conbi_cat.append([col, col2]) 709 | 710 | target_encording_subj_conbi_features = [] 711 | for tar in ['IsCorrect']: 712 | for col in target_encording_subj_conbi_cat: 713 | if not isinstance(col, list): 714 | col = [col] 715 | name = "_".join(col) 716 | target_encording_subj_conbi_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 717 | 718 | target_encording_subj_agg_features = [] 719 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 720 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_agg_{agg_func}_IsCorrect') 721 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 722 | for conbi_col in subj_conbi_cols: 723 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_{conbi_col}_agg_{agg_func}_IsCorrect') 724 | 725 | svd_n_components = 5 726 | svd_features = [] 727 | svd_features += [f'ques_subj_svd_{i}' for i in range(svd_n_components)] 728 | svd_features += [f'user_subj_svd_{i}' for i in range(svd_n_components)] 729 | 730 | ################################################################ 731 | dense_features = [ 732 | 'age_days' 733 | ] 734 | dense_features += count_encording_features 735 | dense_features += target_encording_features 736 | dense_features += subject_meta_features 737 | dense_features += target_encording_ansval_features 738 | dense_features += user_lag_num_features 739 | dense_features += target_encording_subj_agg_features 740 | dense_features += svd_features 741 | 742 | sparse_features = [ 743 | # 'QuestionId', 744 | # 'UserId', 745 | 'Gender', 746 | 'PremiumPupil', 747 | 'Confidence', 748 | # 'GroupId' 749 | # 'QuizId', 750 | 'SchemeOfWorkId', 751 | 'age_years', 752 | 'SubjectId_cat', 753 | 'DateOfBirth_NaN', 754 | 'pri_to_high_stu', 755 | ] 756 | sparse_features += answer_date_features 757 | sparse_features += user_lag_cat_features 758 | 759 | 760 | varlen_sparse_features = [ 761 | # 'SubjectId', 762 | # 'SubjectId_level' 763 | ] 764 | # varlen_sparse_features = varlen_sparse_features + user_lag_multicat_features 765 | ################################################################ -------------------------------------------------------------------------------- /exp/task2_lgbm_fs100/config.py: -------------------------------------------------------------------------------- 1 | FOLD_NAME = 'mskf_user' 2 | FOLD_NUM = 5 3 | RANDOM_STATE = 46 4 | 5 | TARGET_TASK = '2' 6 | 7 | LGB_MDOEL_PARAMS = { 8 | "boosting_type": "gbdt", 9 | "learning_rate": 0.1, 10 | "max_depth": 10, 11 | "num_leaves": 256, 12 | "colsample_bytree": 0.8, 13 | "min_child_weight": 0, 14 | "random_state": RANDOM_STATE, 
15 | "verbose": -1, 16 | "n_jobs": 20, 17 | } 18 | LGB_TRAIN_PARAMS = { 19 | "num_boost_round": 2000, 20 | "early_stopping_rounds": 50, 21 | "verbose_eval": 100, 22 | } 23 | 24 | if TARGET_TASK == '1': 25 | LGB_MDOEL_PARAMS['metric'] = ['auc', 'binary_logloss'] 26 | LGB_MDOEL_PARAMS['objective'] = 'binary' 27 | if TARGET_TASK == '2': 28 | LGB_MDOEL_PARAMS['metric'] = ['multi_logloss', 'multi_error'] 29 | LGB_MDOEL_PARAMS['objective'] = 'multiclass' 30 | LGB_MDOEL_PARAMS['num_class'] = 4 31 | 32 | subject_id_list = [ 33 | 3, 34 | 32, 35 | 33, 36 | 34, 37 | 35, 38 | 36, 39 | 37, 40 | 38, 41 | 39, 42 | 40, 43 | 41, 44 | 42, 45 | 44, 46 | 45, 47 | 46, 48 | 47, 49 | 48, 50 | 49, 51 | 50, 52 | 51, 53 | 52, 54 | 53, 55 | 54, 56 | 55, 57 | 56, 58 | 57, 59 | 58, 60 | 59, 61 | 60, 62 | 61, 63 | 62, 64 | 63, 65 | 64, 66 | 65, 67 | 66, 68 | 67, 69 | 68, 70 | 69, 71 | 70, 72 | 71, 73 | 72, 74 | 73, 75 | 74, 76 | 75, 77 | 76, 78 | 77, 79 | 78, 80 | 79, 81 | 80, 82 | 81, 83 | 83, 84 | 84, 85 | 85, 86 | 86, 87 | 87, 88 | 88, 89 | 89, 90 | 90, 91 | 91, 92 | 92, 93 | 93, 94 | 94, 95 | 95, 96 | 96, 97 | 97, 98 | 98, 99 | 99, 100 | 100, 101 | 101, 102 | 102, 103 | 103, 104 | 104, 105 | 105, 106 | 106, 107 | 107, 108 | 108, 109 | 109, 110 | 110, 111 | 111, 112 | 112, 113 | 113, 114 | 114, 115 | 115, 116 | 116, 117 | 117, 118 | 118, 119 | 119, 120 | 120, 121 | 126, 122 | 128, 123 | 129, 124 | 130, 125 | 131, 126 | 137, 127 | 139, 128 | 140, 129 | 141, 130 | 142, 131 | 144, 132 | 146, 133 | 149, 134 | 151, 135 | 152, 136 | 153, 137 | 154, 138 | 156, 139 | 157, 140 | 158, 141 | 159, 142 | 160, 143 | 163, 144 | 164, 145 | 165, 146 | 166, 147 | 167, 148 | 168, 149 | 171, 150 | 172, 151 | 173, 152 | 174, 153 | 175, 154 | 176, 155 | 177, 156 | 178, 157 | 179, 158 | 180, 159 | 181, 160 | 182, 161 | 183, 162 | 184, 163 | 185, 164 | 186, 165 | 187, 166 | 188, 167 | 189, 168 | 190, 169 | 191, 170 | 192, 171 | 193, 172 | 195, 173 | 196, 174 | 197, 175 | 198, 176 | 199, 177 | 200, 178 | 202, 179 | 203, 180 | 204, 181 | 205, 182 | 206, 183 | 207, 184 | 208, 185 | 209, 186 | 210, 187 | 211, 188 | 212, 189 | 213, 190 | 214, 191 | 215, 192 | 216, 193 | 217, 194 | 218, 195 | 219, 196 | 220, 197 | 221, 198 | 222, 199 | 223, 200 | 224, 201 | 225, 202 | 226, 203 | 227, 204 | 228, 205 | 229, 206 | 230, 207 | 231, 208 | 232, 209 | 233, 210 | 234, 211 | 235, 212 | 236, 213 | 237, 214 | 238, 215 | 239, 216 | 240, 217 | 241, 218 | 242, 219 | 243, 220 | 244, 221 | 245, 222 | 246, 223 | 247, 224 | 248, 225 | 249, 226 | 250, 227 | 251, 228 | 252, 229 | 253, 230 | 254, 231 | 255, 232 | 256, 233 | 257, 234 | 258, 235 | 259, 236 | 260, 237 | 261, 238 | 262, 239 | 263, 240 | 264, 241 | 265, 242 | 266, 243 | 267, 244 | 268, 245 | 269, 246 | 270, 247 | 271, 248 | 272, 249 | 273, 250 | 274, 251 | 275, 252 | 276, 253 | 277, 254 | 278, 255 | 279, 256 | 280, 257 | 281, 258 | 282, 259 | 283, 260 | 284, 261 | 298, 262 | 313, 263 | 315, 264 | 317, 265 | 331, 266 | 332, 267 | 334, 268 | 335, 269 | 336, 270 | 337, 271 | 338, 272 | 339, 273 | 340, 274 | 341, 275 | 342, 276 | 343, 277 | 344, 278 | 348, 279 | 349, 280 | 350, 281 | 351, 282 | 352, 283 | 353, 284 | 354, 285 | 355, 286 | 361, 287 | 365, 288 | 366, 289 | 367, 290 | 369, 291 | 370, 292 | 371, 293 | 372, 294 | 374, 295 | 375, 296 | 376, 297 | 377, 298 | 388, 299 | 406, 300 | 407, 301 | 408, 302 | 409, 303 | 410, 304 | 411, 305 | 412, 306 | 416, 307 | 417, 308 | 418, 309 | 430, 310 | 431, 311 | 432, 312 | 434, 313 | 435, 314 | 436, 315 | 437, 316 | 439, 317 | 441, 318 | 442, 319 | 446, 320 | 447, 
321 | 448, 322 | 451, 323 | 453, 324 | 462, 325 | 474, 326 | 480, 327 | 487, 328 | 539, 329 | 540, 330 | 649, 331 | 654, 332 | 655, 333 | 656, 334 | 657, 335 | 692, 336 | 698, 337 | 700, 338 | 1059, 339 | 1076, 340 | 1077, 341 | 1078, 342 | 1079, 343 | 1080, 344 | 1081, 345 | 1082, 346 | 1156, 347 | 1157, 348 | 1158, 349 | 1159, 350 | 1160, 351 | 1161, 352 | 1162, 353 | 1163, 354 | 1164, 355 | 1165, 356 | 1166, 357 | 1167, 358 | 1168, 359 | 1169, 360 | 1170, 361 | 1171, 362 | 1172, 363 | 1173, 364 | 1174, 365 | 1175, 366 | 1176, 367 | 1177, 368 | 1178, 369 | 1179, 370 | 1180, 371 | 1181, 372 | 1182, 373 | 1183, 374 | 1184, 375 | 1185, 376 | 1186, 377 | 1187, 378 | 1188, 379 | 1189, 380 | 1200, 381 | 1201, 382 | 1202, 383 | 1203, 384 | 1204, 385 | 1207, 386 | 1208, 387 | 1209, 388 | 1210, 389 | 1211, 390 | 1212, 391 | 1213, 392 | 1214, 393 | 1215, 394 | 1216, 395 | 1217, 396 | 1218, 397 | 1219, 398 | 1263, 399 | 1264, 400 | 1265, 401 | 1266, 402 | 1636, 403 | 1642, 404 | 1647, 405 | 1648, 406 | 1649, 407 | 1650, 408 | 1651, 409 | 1675, 410 | 1676, 411 | 1750, 412 | 1975, 413 | 1976, 414 | 1977, 415 | 1980, 416 | 1982, 417 | 1983, 418 | 1985, 419 | 1987, 420 | 1988 421 | ] 422 | 423 | level_cnum_list = [ 424 | '0_8', 425 | '1_16', 426 | '3_0', 427 | '2_3', 428 | '2_4', 429 | '2_5', 430 | '2_10', 431 | '2_7', 432 | '1_14', 433 | '2_11', 434 | '2_6', 435 | '2_9', 436 | '2_8', 437 | '1_5', 438 | '1_12', 439 | '2_1', 440 | '2_13', 441 | '1_1', 442 | '1_4', 443 | '2_2', 444 | '2_14', 445 | '1_0', 446 | '2_0', 447 | '0_1' 448 | ] 449 | 450 | 451 | subject_meta_cols = [ 452 | 'num', 453 | 'max_level', 454 | 'sum_level', 455 | 'max_cnum', 456 | 'sum_cnum', 457 | ] 458 | 459 | subject_features = [f'subj_{f}' for f in subject_id_list] 460 | level_cnum_features = [f'subj_{f}' for f in level_cnum_list] 461 | subject_meta_features = [f'subj_{f}' for f in subject_meta_cols] 462 | 463 | user_lag_num_features = [ 464 | 'DateAnswered_dt_diff', 465 | 'DateAnswered_dt_diff_cumsum', 466 | 'DateAnswered_dt_diff_shift', 467 | 'DateAnswered_dt_diff_cumsum_shift', 468 | 'answer_num', 469 | 'answer_num_norm', 470 | 'quiz_answer_num', 471 | 'quiz_answer_num_norm', 472 | 'quiz_unique_num', 473 | 'subj_unique_num', 474 | 'group_unique_num', 475 | 'subjcat_unique_num', 476 | ] 477 | user_lag_cat_features = [ 478 | 'answer_num_div5', 479 | 'quiz_answer_num_div5', 480 | 'change_subjcat', 481 | 'answered_subjcat', 482 | 'prev_question', 483 | 'prev_subjcat', 484 | ] 485 | user_lag_multicat_features = [ 486 | 'prev10_question', 487 | 'prev10_subjcat', 488 | ] 489 | user_lag_features = user_lag_num_features + user_lag_cat_features + user_lag_multicat_features 490 | 491 | answer_date_features = [ 492 | 'DateAnswered_weekday', 493 | 'DateAnswered_hour', 494 | 'DateAnswered_day', 495 | 'DateAnswered_wom' 496 | ] 497 | count_encording_cat = [ 498 | 'QuestionId', 499 | 'UserId', 500 | 'Gender', 501 | 'PremiumPupil', 502 | 'Confidence', 503 | 'GroupId', 504 | 'QuizId', 505 | 'SchemeOfWorkId', 506 | 'age_years', 507 | 'DateAnswered_weekday', 508 | 'DateAnswered_hour', 509 | 'DateAnswered_day', 510 | 'DateAnswered_wom', 511 | 'answer_num_div5', 512 | 'quiz_answer_num_div5', 513 | 'change_subjcat', 514 | 'answered_subjcat', 515 | 'prev_question', 516 | 'prev_subjcat', 517 | 'SubjectId_cat', 518 | # 'DateOfBirth_NaN', 519 | 'pri_to_high_stu', 520 | ['UserId', 'DateAnswered_weekday'], 521 | ['UserId', 'DateAnswered_hour'], 522 | ['UserId', 'DateAnswered_day'], 523 | ['UserId', 'DateAnswered_wom'], 524 | ['UserId', 
'DateAnswered_weekday', 'DateAnswered_hour'], 525 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 526 | ['UserId', 'Confidence'], 527 | ['UserId', 'SchemeOfWorkId'], 528 | ['UserId', 'GroupId'], 529 | ['UserId', 'QuizId'], 530 | ['UserId', 'SubjectId_cat'], 531 | ['UserId', 'answer_num_div5'], 532 | ['UserId', 'quiz_answer_num_div5'], 533 | ['UserId', 'change_subjcat'], 534 | ['UserId', 'answered_subjcat'], 535 | ['QuestionId', 'Confidence'], 536 | ['QuestionId', 'SchemeOfWorkId'], 537 | ['QuestionId', 'age_years'], 538 | ['QuestionId', 'Gender'], 539 | ['QuestionId', 'answer_num_div5'], 540 | ['QuestionId', 'quiz_answer_num_div5'], 541 | ['QuestionId', 'change_subjcat'], 542 | ['QuestionId', 'answered_subjcat'], 543 | ['SubjectId_cat', 'Confidence'], 544 | ['SubjectId_cat', 'SchemeOfWorkId'], 545 | ['SubjectId_cat', 'age_years'], 546 | ['SubjectId_cat', 'Gender'], 547 | ['SubjectId_cat', 'answer_num_div5'], 548 | ['SubjectId_cat', 'quiz_answer_num_div5'], 549 | ['SubjectId_cat', 'change_subjcat'], 550 | ['SubjectId_cat', 'answered_subjcat'], 551 | ['QuestionId', 'GroupId'], 552 | ['QuestionId', 'QuizId'], 553 | ] 554 | count_encording_features = [] 555 | for col in count_encording_cat: 556 | if not isinstance(col, list): 557 | col = [col] 558 | name = "_".join(col) 559 | count_encording_features.append(f'{name}_ce') 560 | 561 | te_smooth_factor = 5 562 | target_encording_cat = [ 563 | 'QuestionId', 564 | 'UserId', 565 | 'Gender', 566 | 'PremiumPupil', 567 | 'Confidence', 568 | 'GroupId', 569 | 'QuizId', 570 | 'SchemeOfWorkId', 571 | 'age_years', 572 | 'DateAnswered_weekday', 573 | 'DateAnswered_hour', 574 | 'DateAnswered_day', 575 | 'DateAnswered_wom', 576 | 'answer_num_div5', 577 | 'quiz_answer_num_div5', 578 | 'change_subjcat', 579 | 'answered_subjcat', 580 | 'prev_question', 581 | 'prev_subjcat', 582 | 'SubjectId_cat', 583 | 'DateOfBirth_NaN', 584 | 'pri_to_high_stu', 585 | ['DateAnswered_day', 'DateAnswered_hour'], 586 | ['DateAnswered_weekday', 'DateAnswered_hour'], 587 | ['DateAnswered_weekday', 'DateAnswered_wom'], 588 | ['UserId', 'DateAnswered_weekday'], 589 | ['UserId', 'DateAnswered_hour'], 590 | ['UserId', 'DateAnswered_day'], 591 | ['UserId', 'DateAnswered_wom'], 592 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_hour'], 593 | ['UserId', 'DateAnswered_weekday', 'DateAnswered_wom'], 594 | ['UserId', 'Confidence'], 595 | ['UserId', 'SchemeOfWorkId'], 596 | ['UserId', 'GroupId'], 597 | ['UserId', 'QuizId'], 598 | ['UserId', 'SubjectId_cat'], 599 | ['UserId', 'answer_num_div5'], 600 | ['UserId', 'quiz_answer_num_div5'], 601 | ['UserId', 'change_subjcat'], 602 | ['UserId', 'answered_subjcat'], 603 | ['QuestionId', 'Confidence'], 604 | ['QuestionId', 'SchemeOfWorkId'], 605 | ['QuestionId', 'age_years'], 606 | ['QuestionId', 'Gender'], 607 | ['QuestionId', 'answer_num_div5'], 608 | ['QuestionId', 'quiz_answer_num_div5'], 609 | ['QuestionId', 'change_subjcat'], 610 | ['QuestionId', 'answered_subjcat'], 611 | ['SubjectId_cat', 'Confidence'], 612 | ['SubjectId_cat', 'SchemeOfWorkId'], 613 | ['SubjectId_cat', 'age_years'], 614 | ['SubjectId_cat', 'Gender'], 615 | ['SubjectId_cat', 'answer_num_div5'], 616 | ['SubjectId_cat', 'quiz_answer_num_div5'], 617 | ['SubjectId_cat', 'change_subjcat'], 618 | ['SubjectId_cat', 'answered_subjcat'], 619 | ] 620 | target_encording_features = [] 621 | for tar in ['IsCorrect']: 622 | for col in target_encording_cat: 623 | if not isinstance(col, list): 624 | col = [col] 625 | name = "_".join(col) 626 | 
target_encording_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 627 | 628 | target_encording_ansval_features = [] 629 | for tar in ['AnswerValue_1', 'AnswerValue_2', 'AnswerValue_3', 'AnswerValue_4']: 630 | for col in target_encording_cat: 631 | if not isinstance(col, list): 632 | col = [col] 633 | name = "_".join(col) 634 | target_encording_ansval_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 635 | 636 | subj_conbi_cols = [ 637 | 'UserId', 638 | 'age_years', 639 | 'answered_subjcat', 640 | 'SchemeOfWorkId', 641 | 'Confidence', 642 | ] 643 | target_encording_subj_conbi_cat = [] 644 | for col in subject_features: 645 | for col2 in subj_conbi_cols: 646 | target_encording_subj_conbi_cat.append([col, col2]) 647 | 648 | target_encording_subj_conbi_features = [] 649 | for tar in ['IsCorrect']: 650 | for col in target_encording_subj_conbi_cat: 651 | if not isinstance(col, list): 652 | col = [col] 653 | name = "_".join(col) 654 | target_encording_subj_conbi_features.append(f'TE_s{te_smooth_factor}_{name}_{tar}') 655 | 656 | target_encording_subj_agg_features = [] 657 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 658 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_agg_{agg_func}_IsCorrect') 659 | for agg_func in ['sum', 'mean', 'std', 'max', 'min']: 660 | for conbi_col in subj_conbi_cols: 661 | target_encording_subj_agg_features.append(f'TE_s{te_smooth_factor}_subj_{conbi_col}_agg_{agg_func}_IsCorrect') 662 | 663 | svd_n_components = 5 664 | svd_features = [] 665 | svd_features += [f'ques_subj_svd_{i}' for i in range(svd_n_components)] 666 | svd_features += [f'user_subj_svd_{i}' for i in range(svd_n_components)] 667 | 668 | fs_features = [ 669 | "Confidence_ce", 670 | "DateAnswered_dt_diff", 671 | "DateAnswered_dt_diff_cumsum", 672 | "DateAnswered_dt_diff_cumsum_shift", 673 | "DateAnswered_dt_diff_shift", 674 | "GroupId_ce", 675 | "QuestionId_GroupId_ce", 676 | "QuestionId_QuizId_ce", 677 | "QuestionId_ce", 678 | "QuestionId_quiz_answer_num_div5_ce", 679 | "QuizId_ce", 680 | "SchemeOfWorkId_GroupId_ce", 681 | "SchemeOfWorkId_QuizId_ce", 682 | "SubjectId_cat", 683 | "TE_s5_DateAnswered_day_AnswerValue_1", 684 | "TE_s5_DateAnswered_day_DateAnswered_hour_IsCorrect", 685 | "TE_s5_GroupId_IsCorrect", 686 | "TE_s5_QuestionId_AnswerValue_1", 687 | "TE_s5_QuestionId_AnswerValue_2", 688 | "TE_s5_QuestionId_AnswerValue_3", 689 | "TE_s5_QuestionId_AnswerValue_4", 690 | "TE_s5_QuestionId_Confidence_AnswerValue_1", 691 | "TE_s5_QuestionId_Confidence_AnswerValue_2", 692 | "TE_s5_QuestionId_Confidence_AnswerValue_3", 693 | "TE_s5_QuestionId_Confidence_AnswerValue_4", 694 | "TE_s5_QuestionId_Confidence_Gender_PremiumPupil_IsCorrect", 695 | "TE_s5_QuestionId_Confidence_IsCorrect", 696 | "TE_s5_QuestionId_Confidence_PremiumPupil_IsCorrect", 697 | "TE_s5_QuestionId_Confidence_age_years_Gender_IsCorrect", 698 | "TE_s5_QuestionId_Confidence_age_years_Gender_PremiumPupil_IsCorrect", 699 | "TE_s5_QuestionId_Confidence_age_years_PremiumPupil_IsCorrect", 700 | "TE_s5_QuestionId_DateOfBirth_NaN_AnswerValue_3", 701 | "TE_s5_QuestionId_Gender_AnswerValue_1", 702 | "TE_s5_QuestionId_Gender_AnswerValue_2", 703 | "TE_s5_QuestionId_Gender_AnswerValue_4", 704 | "TE_s5_QuestionId_Gender_PremiumPupil_AnswerValue_1", 705 | "TE_s5_QuestionId_Gender_PremiumPupil_AnswerValue_2", 706 | "TE_s5_QuestionId_Gender_PremiumPupil_AnswerValue_3", 707 | "TE_s5_QuestionId_Gender_PremiumPupil_AnswerValue_4", 708 | "TE_s5_QuestionId_Gender_PremiumPupil_IsCorrect", 709 | 
"TE_s5_QuestionId_GroupId_AnswerValue_1", 710 | "TE_s5_QuestionId_GroupId_AnswerValue_2", 711 | "TE_s5_QuestionId_GroupId_AnswerValue_3", 712 | "TE_s5_QuestionId_GroupId_AnswerValue_4", 713 | "TE_s5_QuestionId_GroupId_IsCorrect", 714 | "TE_s5_QuestionId_IsCorrect", 715 | "TE_s5_QuestionId_PremiumPupil_AnswerValue_3", 716 | "TE_s5_QuestionId_QuizId_AnswerValue_1", 717 | "TE_s5_QuestionId_QuizId_AnswerValue_2", 718 | "TE_s5_QuestionId_QuizId_AnswerValue_3", 719 | "TE_s5_QuestionId_QuizId_AnswerValue_4", 720 | "TE_s5_QuestionId_QuizId_IsCorrect", 721 | "TE_s5_QuestionId_SchemeOfWorkId_AnswerValue_1", 722 | "TE_s5_QuestionId_SchemeOfWorkId_AnswerValue_2", 723 | "TE_s5_QuestionId_SchemeOfWorkId_AnswerValue_3", 724 | "TE_s5_QuestionId_SchemeOfWorkId_AnswerValue_4", 725 | "TE_s5_QuestionId_SchemeOfWorkId_IsCorrect", 726 | "TE_s5_QuestionId_age_years_Gender_IsCorrect", 727 | "TE_s5_QuestionId_age_years_Gender_PremiumPupil_IsCorrect", 728 | "TE_s5_QuestionId_change_subjcat_AnswerValue_4", 729 | "TE_s5_QuestionId_prev_question_IsCorrect", 730 | "TE_s5_QuestionId_pri_to_high_stu_AnswerValue_1", 731 | "TE_s5_QuestionId_pri_to_high_stu_AnswerValue_4", 732 | "TE_s5_QuestionId_pri_to_high_stu_IsCorrect", 733 | "TE_s5_QuestionId_quiz_answer_num_div5_AnswerValue_1", 734 | "TE_s5_QuestionId_quiz_answer_num_div5_AnswerValue_2", 735 | "TE_s5_QuestionId_quiz_answer_num_div5_AnswerValue_3", 736 | "TE_s5_QuestionId_quiz_answer_num_div5_AnswerValue_4", 737 | "TE_s5_QuestionId_quiz_answer_num_div5_IsCorrect", 738 | "TE_s5_QuizId_IsCorrect", 739 | "TE_s5_SchemeOfWorkId_GroupId_IsCorrect", 740 | "TE_s5_SchemeOfWorkId_IsCorrect", 741 | "TE_s5_SchemeOfWorkId_QuizId_IsCorrect", 742 | "TE_s5_SchemeOfWorkId_age_years_PremiumPupil_IsCorrect", 743 | "TE_s5_SchemeOfWorkId_answered_subjcat_IsCorrect", 744 | "TE_s5_SchemeOfWorkId_change_subjcat_IsCorrect", 745 | "TE_s5_SchemeOfWorkId_quiz_answer_num_div5_IsCorrect", 746 | "TE_s5_SubjectId_cat_SchemeOfWorkId_IsCorrect", 747 | "TE_s5_SubjectId_cat_quiz_answer_num_div5_AnswerValue_3", 748 | "TE_s5_UserId_AnswerValue_4", 749 | "TE_s5_UserId_Confidence_AnswerValue_1", 750 | "TE_s5_UserId_Confidence_AnswerValue_4", 751 | "TE_s5_UserId_Confidence_IsCorrect", 752 | "TE_s5_UserId_DateAnswered_day_AnswerValue_1", 753 | "TE_s5_UserId_DateAnswered_day_AnswerValue_3", 754 | "TE_s5_UserId_DateAnswered_day_AnswerValue_4", 755 | "TE_s5_UserId_DateAnswered_day_IsCorrect", 756 | "TE_s5_UserId_DateAnswered_hour_IsCorrect", 757 | "TE_s5_UserId_DateAnswered_weekday_DateAnswered_hour_AnswerValue_1", 758 | "TE_s5_UserId_DateAnswered_weekday_DateAnswered_hour_AnswerValue_3", 759 | "TE_s5_UserId_DateAnswered_weekday_DateAnswered_hour_AnswerValue_4", 760 | "TE_s5_UserId_DateAnswered_weekday_DateAnswered_hour_IsCorrect", 761 | "TE_s5_UserId_DateAnswered_weekday_DateAnswered_wom_IsCorrect", 762 | "TE_s5_UserId_DateAnswered_weekday_IsCorrect", 763 | "TE_s5_UserId_DateAnswered_wom_IsCorrect", 764 | "TE_s5_UserId_GroupId_AnswerValue_1", 765 | "TE_s5_UserId_GroupId_AnswerValue_4", 766 | "TE_s5_UserId_GroupId_IsCorrect", 767 | "TE_s5_UserId_IsCorrect", 768 | "TE_s5_UserId_QuizId_AnswerValue_1", 769 | "TE_s5_UserId_QuizId_AnswerValue_2", 770 | "TE_s5_UserId_QuizId_AnswerValue_3", 771 | "TE_s5_UserId_QuizId_AnswerValue_4", 772 | "TE_s5_UserId_QuizId_IsCorrect", 773 | "TE_s5_UserId_SchemeOfWorkId_IsCorrect", 774 | "TE_s5_UserId_SubjectId_cat_AnswerValue_1", 775 | "TE_s5_UserId_SubjectId_cat_IsCorrect", 776 | "TE_s5_UserId_age_years_AnswerValue_1", 777 | "TE_s5_UserId_age_years_Confidence_AnswerValue_1", 778 | 
"TE_s5_UserId_age_years_Confidence_AnswerValue_4", 779 | "TE_s5_UserId_age_years_Confidence_IsCorrect", 780 | "TE_s5_UserId_age_years_IsCorrect", 781 | "TE_s5_UserId_answer_num_div5_AnswerValue_4", 782 | "TE_s5_UserId_answer_num_div5_IsCorrect", 783 | "TE_s5_UserId_answered_subjcat_AnswerValue_3", 784 | "TE_s5_UserId_quiz_answer_num_div5_IsCorrect", 785 | "TE_s5_prev_question_IsCorrect", 786 | "TE_s5_subj_Confidence_agg_max_IsCorrect", 787 | "TE_s5_subj_Confidence_agg_min_IsCorrect", 788 | "TE_s5_subj_Confidence_agg_sum_IsCorrect", 789 | "TE_s5_subj_SchemeOfWorkId_agg_sum_IsCorrect", 790 | "TE_s5_subj_UserId_agg_max_IsCorrect", 791 | "TE_s5_subj_UserId_agg_min_IsCorrect", 792 | "TE_s5_subj_UserId_agg_std_IsCorrect", 793 | "UserId_Confidence_ce", 794 | "UserId_DateAnswered_day_ce", 795 | "UserId_DateAnswered_hour_ce", 796 | "UserId_DateAnswered_weekday_DateAnswered_hour_ce", 797 | "UserId_GroupId_ce", 798 | "UserId_QuizId_ce", 799 | "UserId_SchemeOfWorkId_ce", 800 | "UserId_SubjectId_cat_ce", 801 | "UserId_age_years_Confidence_ce", 802 | "quiz_answer_num", 803 | "quiz_answer_num_norm", 804 | "user_subj_svd_0", 805 | "user_subj_svd_1", 806 | "user_subj_svd_2", 807 | "user_subj_svd_4", 808 | ] 809 | 810 | ################################################################ 811 | dense_features = [ 812 | 'age_days' 813 | ] 814 | dense_features += count_encording_features 815 | dense_features += target_encording_features 816 | dense_features += subject_meta_features 817 | dense_features += target_encording_ansval_features 818 | dense_features += user_lag_num_features 819 | dense_features += target_encording_subj_agg_features 820 | dense_features += svd_features 821 | dense_features = [f for f in dense_features if f in fs_features] 822 | 823 | sparse_features = [ 824 | # 'QuestionId', 825 | # 'UserId', 826 | 'Gender', 827 | 'PremiumPupil', 828 | 'Confidence', 829 | # 'GroupId', 830 | # 'QuizId', 831 | 'SchemeOfWorkId', 832 | 'age_years', 833 | 'SubjectId_cat', 834 | 'DateOfBirth_NaN', 835 | 'pri_to_high_stu', 836 | ] 837 | sparse_features += answer_date_features 838 | sparse_features += user_lag_cat_features 839 | sparse_features = [f for f in sparse_features if f in fs_features] 840 | 841 | varlen_sparse_features = [ 842 | # 'SubjectId', 843 | # 'SubjectId_level' 844 | ] 845 | # varlen_sparse_features = varlen_sparse_features + user_lag_multicat_features 846 | ################################################################ -------------------------------------------------------------------------------- /src/train_mlp_multitask.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | import zipfile 7 | import shutil 8 | import logging 9 | 10 | import mlflow 11 | from sklearn import metrics 12 | import torch 13 | import torch.nn as nn 14 | from tensorflow.python.keras.preprocessing.sequence import pad_sequences 15 | 16 | from utils import ( 17 | seed_everything, 18 | Timer, 19 | reduce_mem_usage, 20 | load_from_pkl, save_as_pkl 21 | ) 22 | from dataset import SimpleDataLoader 23 | from model import ( 24 | DNN_multitask_v2 25 | ) 26 | 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | DEVICE = os.environ.get('DEVICE') 30 | 31 | EXP_NAME = os.environ.get('EXP_NAME') 32 | EXP_DIR = os.environ.get('EXP_DIR') 33 | 34 | INPUT_DIR = os.environ.get('INPUT_DIR') 35 | FEATURE_DIR = os.environ.get('FEATURE_DIR') 36 | FOLD_DIR = os.environ.get('FOLD_DIR') 37 | SUB_DIR = 
os.environ.get('SUB_DIR') 38 | 39 | sys.path.append(f'{EXP_DIR}/{EXP_NAME}') 40 | import config 41 | 42 | VARLEN_MAX_LEN = config.VARLEN_MAX_LEN 43 | 44 | FOLD_NAME = config.FOLD_NAME 45 | FOLD_NUM = config.FOLD_NUM 46 | RANDOM_STATE = config.RANDOM_STATE 47 | 48 | EPOCH_NUM = config.EPOCH_NUM 49 | BATCH_SIZE = config.BATCH_SIZE 50 | DNN_HIDDEN_UNITS = config.DNN_HIDDEN_UNITS 51 | DNN_HIDDEN_UNITS_EACH_TASK = config.DNN_HIDDEN_UNITS_EACH_TASK 52 | DNN_DROPOUT = config.DNN_DROPOUT 53 | DNN_ACTIVATION = config.DNN_ACTIVATION 54 | L2_REG = config.L2_REG 55 | INIT_STD = config.INIT_STD 56 | 57 | SPAESE_EMBEDDING_DIM = config.SPAESE_EMBEDDING_DIM 58 | 59 | LR = config.LR 60 | OPTIMIZER = config.OPTIMIZER 61 | LOSS = config.LOSS 62 | OPTIM_TARGET = config.OPTIM_TARGET 63 | 64 | dense_features = config.dense_features 65 | sparse_features = config.sparse_features 66 | varlen_sparse_features = config.varlen_sparse_features 67 | 68 | 69 | def save_mlflow(run_id, cv, fold_best_scores, best_acc, best_th, oof_metric_t2): 70 | 71 | mlflow.log_param("fold_name", FOLD_NAME) 72 | mlflow.log_param("fold_num", FOLD_NUM) 73 | 74 | mlflow.log_param("batch_size", BATCH_SIZE) 75 | mlflow.log_param("loss", LOSS) 76 | mlflow.log_param("optimizer", OPTIMIZER) 77 | mlflow.log_param("learning_rate", LR) 78 | mlflow.log_param("random_state", RANDOM_STATE) 79 | 80 | mlflow.log_param("dnn_hidden_layer", DNN_HIDDEN_UNITS) 81 | mlflow.log_param("dnn_dropout", DNN_DROPOUT) 82 | mlflow.log_param("dnn_activation", DNN_ACTIVATION) 83 | mlflow.log_param("l2_reg", L2_REG) 84 | mlflow.log_param("init_std", INIT_STD) 85 | 86 | mlflow.log_param("embedding_dim", SPAESE_EMBEDDING_DIM) 87 | 88 | for feat in dense_features: 89 | feat = feat.replace('#', '') 90 | mlflow.log_param(f'f__dense__{feat}', 1) 91 | for feat in sparse_features: 92 | feat = feat.replace('#', '') 93 | mlflow.log_param(f'f__sparse__{feat}', 1) 94 | for feat in varlen_sparse_features: 95 | feat = feat.replace('#', '') 96 | mlflow.log_param(f'f__varspa__{feat}', 1) 97 | 98 | mlflow.log_metric("oof acc", best_acc) 99 | mlflow.log_metric("th", best_th) 100 | mlflow.log_metric("oof acc t2", oof_metric_t2) 101 | mlflow.log_metric("cv", cv) 102 | for fold_idx in range(FOLD_NUM): 103 | mlflow.log_metric(f'val_metric_t1_{fold_idx}', fold_best_scores[fold_idx][1]) 104 | for fold_idx in range(FOLD_NUM): 105 | mlflow.log_metric(f'val_metric_t2_{fold_idx}', fold_best_scores[fold_idx][2]) 106 | 107 | return 108 | 109 | 110 | def save_best_score(fold_best_scores, model, val_metric, val_metric_t2, run_id, fold_idx): 111 | # makedirs is idempotent, so this also covers a pre-existing experiment 112 | # directory that is missing the model_weight/ subdirectory 113 | os.makedirs(f'../save/{EXP_NAME}/model_weight', exist_ok=True) 114 | weight_path = f'../save/{EXP_NAME}/model_weight/train_weights_mlflow-{run_id}_fold{fold_idx}.h5' 115 | torch.save(model.state_dict(), weight_path) 116 | fold_best_scores[fold_idx] = (weight_path, val_metric, val_metric_t2) 117 | mlflow.log_artifact(weight_path) 118 | return fold_best_scores 119 | 120 | 121 | def multi_acc(y_pred, y_test): 122 | y_pred_softmax = torch.log_softmax(y_pred, dim=1) 123 | _, y_pred_tags = torch.max(y_pred_softmax, dim=1) 124 | correct_pred = (y_pred_tags == y_test).float() 125 | acc = correct_pred.sum() / len(correct_pred) 126 | return acc 127 | 128 | 129 | def save_as_feather(feat, suffix, X_train, X_test): 130 | X_train[[feat]].reset_index(drop=True).to_feather(f'{FEATURE_DIR}/{feat}_{suffix}_train.feather') 131 | X_test[[feat]].reset_index(drop=True).to_feather(f'{FEATURE_DIR}/{feat}_{suffix}_test.feather') 132 | return 133 | 134 |
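# Overall flow of the script below: read the Task 1/2 labels, min-max scale
# the dense features (cached as '<feat>_mms_{train,test}.feather'), assemble
# dense / sparse / variable-length sparse inputs, train the multi-task MLP
# per fold (Task 1: IsCorrect, Task 2: AnswerValue), rebuild OOF and test
# predictions from the best fold weights, search a decision threshold for
# Task 1 accuracy, and write submission zips while logging runs to mlflow.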
135 | if __name__ == "__main__": 136 | 137 | t = Timer() 138 | with t.timer(f'fix seed RANDOM_STATE:{RANDOM_STATE}'): 139 | seed_everything(RANDOM_STATE) 140 | 141 | with t.timer('read label'): 142 | data_path = f'{INPUT_DIR}/train_data/train_task_1_2.csv' 143 | y_train = pd.read_csv(data_path, usecols=['IsCorrect', 'AnswerValue']) 144 | y_train_t1 = y_train['IsCorrect'].values 145 | y_train_t2 = (y_train['AnswerValue'] - 1).values # shift labels 1-4 to class indices 0-3 146 | 147 | with t.timer('apply mms'): 148 | for feat in dense_features: 149 | if os.path.exists(f'{FEATURE_DIR}/{feat}_mms_test.feather'): 150 | continue 151 | # MMS 152 | f_train = pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather') 153 | f_test = pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather') 154 | tmp = pd.concat([f_train[feat], f_test[feat]]) 155 | # scaling statistics are taken over train and test combined 156 | max_v, min_v = tmp.max(), tmp.min() 157 | logging.info(f'{feat} - MMS Scaling > max: {max_v}, min: {min_v}') 158 | f_train[feat] = (f_train[feat] - min_v) / (max_v - min_v) 159 | f_test[feat] = (f_test[feat] - min_v) / (max_v - min_v) 160 | save_as_feather(feat, 'mms', f_train, f_test) 161 | 162 | skip_fr = False 163 | if skip_fr is False: 164 | with t.timer('read features'): 165 | unique_num_dic = {} 166 | feature_index = {} 167 | 168 | X_train = pd.DataFrame() 169 | X_test = pd.DataFrame() 170 | fidx = 0 171 | for feat in dense_features: 172 | logging.info(f'[{feat}] read feature ...') 173 | feature_index[feat] = fidx 174 | fidx += 1 175 | X_train = pd.concat([ 176 | X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_mms_train.feather') 177 | ], axis=1) 178 | X_test = pd.concat([ 179 | X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_mms_test.feather') 180 | ], axis=1) 181 | X_train = reduce_mem_usage(X_train) 182 | X_test = reduce_mem_usage(X_test) 183 | for feat in sparse_features: 184 | logging.info(f'[{feat}] read feature ...') 185 | feature_index[feat] = fidx 186 | fidx += 1 187 | X_train = pd.concat([ 188 | X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather') 189 | ], axis=1) 190 | X_test = pd.concat([ 191 | X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather') 192 | ], axis=1) 193 | unique_num = pd.concat([ 194 | X_train[feat], X_test[feat] 195 | ]).nunique() 196 | unique_num_dic[feat] = unique_num 197 | X_train = reduce_mem_usage(X_train) 198 | X_test = reduce_mem_usage(X_test) 199 | 200 | for feat in varlen_sparse_features: 201 | logging.info(f'[{feat}] read feature ...') 202 | feature_index[feat] = (fidx, fidx + VARLEN_MAX_LEN) 203 | fidx += VARLEN_MAX_LEN 204 | 205 | # train_feat = pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather').values 206 | train_feat = pd.read_parquet(f'{FEATURE_DIR}/{feat}_train.parquet').values 207 | varlen_list = [i[0] for i in train_feat] 208 | varlen_list = pad_sequences(varlen_list, maxlen=VARLEN_MAX_LEN, padding='post') 209 | X_train = pd.concat([ 210 | X_train, pd.DataFrame(varlen_list) 211 | ], axis=1) 212 | 213 | # test_feat = pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather').values 214 | test_feat = pd.read_parquet(f'{FEATURE_DIR}/{feat}_test.parquet').values 215 | varlen_list = [i[0] for i in test_feat] 216 | varlen_list = pad_sequences(varlen_list, maxlen=VARLEN_MAX_LEN, padding='post') 217 | X_test = pd.concat([ 218 | X_test, pd.DataFrame(varlen_list) 219 | ], axis=1) 220 | 221 | tmp = [] 222 | for i in [i[0] for i in train_feat] + [i[0] for i in test_feat]: 223 | 
tmp.extend(i) 224 | unique_num = len(set(tmp)) + 1 225 | unique_num_dic[feat] = unique_num 226 | 227 | print('Unique Num', unique_num_dic) 228 | print('Feature index', feature_index) 229 | 230 | save_as_pkl(X_train, f'X_train_{EXP_NAME}.pkl') 231 | save_as_pkl(X_test, f'X_test_{EXP_NAME}.pkl') 232 | save_as_pkl(unique_num_dic, f'unique_num_dic_{EXP_NAME}.pkl') 233 | save_as_pkl(feature_index, f'feature_index_{EXP_NAME}.pkl') 234 | 235 | elif skip_fr is True: 236 | X_train = load_from_pkl(f'X_train_{EXP_NAME}.pkl') 237 | X_test = load_from_pkl(f'X_test_{EXP_NAME}.pkl') 238 | unique_num_dic = load_from_pkl(f'unique_num_dic_{EXP_NAME}.pkl') 239 | feature_index = load_from_pkl(f'feature_index_{EXP_NAME}.pkl') 240 | 241 | X_train = X_train.fillna(0.0) 242 | X_test = X_test.fillna(0.0) 243 | 244 | with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'): 245 | folds = pd.read_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv') 246 | 247 | mlflow.set_experiment(EXP_NAME) 248 | mlflow.start_run() 249 | run_id = mlflow.active_run().info.run_id 250 | 251 | fold_best_scores = {} # fold_idx:best_cv_score 252 | for fold_idx in range(FOLD_NUM): 253 | 254 | trn_idx = folds[folds.kfold != fold_idx].index.tolist() 255 | val_idx = folds[folds.kfold == fold_idx].index.tolist() 256 | 257 | x_trn = X_train.iloc[trn_idx] 258 | x_val = X_train.iloc[val_idx] 259 | 260 | # if 261 | y_trn = y_train_t1[trn_idx] 262 | y_val = y_train_t1[val_idx] 263 | y2_trn = y_train_t2[trn_idx] 264 | y2_val = y_train_t2[val_idx] 265 | 266 | # if 267 | train_loader = SimpleDataLoader( 268 | [torch.from_numpy(x_trn.values), torch.from_numpy(y_trn), torch.from_numpy(y2_trn)], 269 | batch_size=BATCH_SIZE, 270 | shuffle=True 271 | ) 272 | 273 | sparse = False 274 | embedding_dict = nn.ModuleDict( 275 | { 276 | feat: nn.Embedding( 277 | unique_num_dic[feat], SPAESE_EMBEDDING_DIM, sparse=sparse 278 | ) for feat in sparse_features + varlen_sparse_features 279 | } 280 | ) 281 | 282 | dnn_input_len = len(dense_features) + len(sparse_features + varlen_sparse_features) * SPAESE_EMBEDDING_DIM 283 | 284 | # if 285 | model = DNN_multitask_v2( 286 | dnn_input=dnn_input_len, 287 | dnn_hidden_units=DNN_HIDDEN_UNITS, 288 | dnn_hidden_units_task=DNN_HIDDEN_UNITS_EACH_TASK, 289 | dnn_dropout=DNN_DROPOUT, 290 | activation=DNN_ACTIVATION, use_bn=True, l2_reg=L2_REG, init_std=INIT_STD, 291 | device=DEVICE, 292 | feature_index=feature_index, 293 | embedding_dict=embedding_dict, 294 | dense_features=dense_features, 295 | sparse_features=sparse_features, 296 | varlen_sparse_features=varlen_sparse_features, 297 | ) 298 | 299 | loss_func = nn.BCELoss() 300 | loss_func_t2 = nn.CrossEntropyLoss() 301 | 302 | optim = torch.optim.Adam(model.parameters(), lr=LR) 303 | metric_func = metrics.roc_auc_score 304 | metric_func_t2 = multi_acc 305 | 306 | loss_history = [] 307 | steps_per_epoch = (len(x_trn) - 1) // BATCH_SIZE + 1 308 | 309 | if OPTIM_TARGET in ['AUC_t1', 'ACC_t2']: 310 | best_score = 0.0 311 | elif OPTIM_TARGET in ['BCE_t1', 'CE_t2', 'BCE-CE_t12']: 312 | best_score = 999.9 313 | for epoch in range(EPOCH_NUM): 314 | 315 | # train 316 | loss_history_epoch = [] 317 | loss_t2_history_epoch = [] 318 | metric_history_epoch = [] 319 | metric_t2_history_epoch = [] 320 | 321 | logging.info(f'[{DEVICE}][FOLD:{fold_idx}] EPOCH - {epoch} / {EPOCH_NUM}') 322 | model = model.train() 323 | for bi, (bx, by, by2) in tqdm(enumerate(train_loader), total=steps_per_epoch): 324 | 325 | optim.zero_grad() 326 | 327 | x = bx.to(DEVICE).float() 328 | 329 | # if 
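# One forward pass serves both heads: the Task 1 head yields a probability
# for IsCorrect (scored with BCELoss, so the model is expected to apply a
# sigmoid internally), while the Task 2 head yields 4-way logits over the
# AnswerValue classes 0-3 (scored with CrossEntropyLoss). The two losses
# are summed with equal weight before the backward pass.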
330 | y = by.to(DEVICE).float().squeeze() 331 | y2 = by2.to(DEVICE).long().squeeze() 332 | y_pred, y_pred_t2 = model(x) 333 | y_pred = y_pred.squeeze() 334 | y_pred_t2 = y_pred_t2.squeeze() 335 | 336 | # if 337 | loss_t1 = loss_func(y_pred, y) 338 | loss_t2 = loss_func_t2(y_pred_t2, y2) 339 | 340 | loss = loss_t1 + loss_t2 341 | loss.backward() 342 | 343 | optim.step() 344 | 345 | loss_history_epoch.append(loss.item()) 346 | loss_t2_history_epoch.append(loss_t2.item()) 347 | 348 | # if 349 | y_pred_np = y_pred.cpu().detach().numpy().reshape(-1, 1) 350 | y_np = y.cpu().detach().numpy().reshape(-1, 1) 351 | if len(np.unique(y_np)) == 1: 352 | # AUC error (ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.) 353 | continue 354 | else: 355 | metric = metric_func(y_np, y_pred_np) 356 | metric_history_epoch.append(metric) 357 | 358 | metric_t2 = metric_func_t2(y_pred_t2, y2) 359 | metric_t2_history_epoch.append(metric_t2) 360 | 361 | # if 362 | trn_loss_epoch = sum(loss_history_epoch) / len(loss_history_epoch) 363 | trn_metric_epoch = sum(metric_history_epoch) / len(metric_history_epoch) 364 | 365 | trn_loss_t2_epoch = sum(loss_t2_history_epoch) / len(loss_t2_history_epoch) 366 | trn_metric_t2_epoch = sum(metric_t2_history_epoch) / len(metric_t2_history_epoch) 367 | 368 | # validation evaluate 369 | # if 370 | preds_val, preds_t2_val = model.predict(x_val, BATCH_SIZE) 371 | val_loss = loss_func( 372 | torch.from_numpy(preds_val.reshape(-1, 1)).to(torch.float32), torch.from_numpy(y_val.reshape(-1, 1)).to(torch.float32) 373 | ).item() 374 | val_metric = metric_func(y_val, preds_val) 375 | 376 | val_loss_t2 = loss_func_t2( 377 | torch.from_numpy(preds_t2_val), torch.from_numpy(y2_val).long() 378 | ).item() 379 | val_metric_t2 = metric_func_t2( 380 | torch.from_numpy(preds_t2_val), torch.from_numpy(y2_val) 381 | ).item() 382 | 383 | # if 384 | logging.info(f'Train - Loss(t1): {trn_loss_epoch}, Loss(t2): {trn_loss_t2_epoch}, AUC(t1): {trn_metric_epoch}, ACC(t2): {trn_metric_t2_epoch}') 385 | logging.info(f'Valid - Loss(t1): {val_loss}, Loss(t2): {val_loss_t2}, AUC(t1): {val_metric}, ACC(t2): {val_metric_t2}') 386 | loss_history.append([ 387 | epoch, trn_loss_epoch, val_loss, trn_loss_t2_epoch, val_loss_t2 388 | ]) 389 | 390 | if OPTIM_TARGET in ['AUC_t1']: 391 | score = val_metric 392 | if score > best_score: 393 | best_score = score 394 | fold_best_scores = save_best_score(fold_best_scores, model, val_metric, val_metric_t2, run_id, fold_idx) 395 | elif OPTIM_TARGET in ['BCE_t1']: 396 | score = val_loss 397 | if score < best_score: 398 | best_score = score 399 | fold_best_scores = save_best_score(fold_best_scores, model, val_metric, val_metric_t2, run_id, fold_idx) 400 | elif OPTIM_TARGET in ['ACC_t2']: 401 | score = val_metric_t2 402 | if score > best_score: 403 | best_score = score 404 | fold_best_scores = save_best_score(fold_best_scores, model, val_metric, val_metric_t2, run_id, fold_idx) 405 | elif OPTIM_TARGET in ['CE_t2']: 406 | score = val_loss_t2 407 | if score < best_score: 408 | best_score = score 409 | fold_best_scores = save_best_score(fold_best_scores, model, val_metric, val_metric_t2, run_id, fold_idx) 410 | elif OPTIM_TARGET in ['BCE-CE_t12']: 411 | score = val_loss + val_loss_t2 412 | if score < best_score: 413 | best_score = score 414 | fold_best_scores = save_best_score(fold_best_scores, model, val_metric, val_metric_t2, run_id, fold_idx) 415 | 416 | if not os.path.exists(f'../save/{EXP_NAME}/model_log'): 417 | 
os.mkdir(f'../save/{EXP_NAME}/model_log') 418 | history_path = f'../save/{EXP_NAME}/model_log/loss_history-{run_id}_fold{fold_idx}.csv' 419 | # if 420 | pd.DataFrame(loss_history, columns=['epoch', 'trn_loss', 'val_loss', 'trn_loss_t2', 'val_loss_t2']).to_csv(history_path) 421 | mlflow.log_artifact(history_path) 422 | 423 | cv = 0.0 424 | cv_t2 = 0.0 425 | for fold_idx in range(FOLD_NUM): 426 | weight_path, score, score_t2 = fold_best_scores[fold_idx] 427 | cv += score / FOLD_NUM 428 | cv_t2 += score_t2 / FOLD_NUM 429 | logging.info(f"CV(t1): {cv}, CV(t2): {cv_t2}") 430 | 431 | oof = np.zeros(len(X_train)) 432 | oof_t2 = np.zeros((len(X_train), 4)) 433 | preds_test = np.zeros(len(X_test)) 434 | preds_test_t2 = np.zeros((len(X_test), 4)) 435 | for fold_idx in range(FOLD_NUM): 436 | 437 | val_idx = folds[folds.kfold == fold_idx].index.tolist() 438 | x_val = X_train.iloc[val_idx] 439 | 440 | model = DNN_multitask_v2( 441 | dnn_input=dnn_input_len, 442 | dnn_hidden_units=DNN_HIDDEN_UNITS, 443 | dnn_hidden_units_task=DNN_HIDDEN_UNITS_EACH_TASK, 444 | dnn_dropout=DNN_DROPOUT, 445 | activation=DNN_ACTIVATION, use_bn=True, l2_reg=L2_REG, init_std=INIT_STD, 446 | device=DEVICE, 447 | feature_index=feature_index, 448 | embedding_dict=embedding_dict, 449 | dense_features=dense_features, 450 | sparse_features=sparse_features, 451 | varlen_sparse_features=varlen_sparse_features, 452 | ) 453 | weight_path, score, score_t2 = fold_best_scores[fold_idx] 454 | model.load_state_dict(torch.load(weight_path)) 455 | 456 | preds_val_fold, preds_val_t2_fold = model.predict(x_val, BATCH_SIZE) 457 | oof[val_idx] = preds_val_fold 458 | oof_t2[val_idx] = preds_val_t2_fold 459 | 460 | preds_test_fold, preds_test_t2_fold = model.predict(X_test, BATCH_SIZE) 461 | preds_test += preds_test_fold / FOLD_NUM 462 | preds_test_t2 += preds_test_t2_fold / FOLD_NUM 463 | 464 | if not os.path.exists(f'../save/{EXP_NAME}'): 465 | os.mkdir(f'../save/{EXP_NAME}') 466 | 467 | pd.DataFrame(oof).to_csv(f'../save/{EXP_NAME}/preds_val_task1_{run_id}.csv', index=False) 468 | pd.DataFrame(preds_test).to_csv(f'../save/{EXP_NAME}/preds_test_task1_{run_id}.csv', index=False) 469 | pd.DataFrame(oof_t2).to_csv(f'../save/{EXP_NAME}/preds_val_task2_{run_id}.csv', index=False) 470 | pd.DataFrame(preds_test_t2).to_csv(f'../save/{EXP_NAME}/preds_test_task2_{run_id}.csv', index=False) 471 | 472 | # oof_t2 = np.argmax(oof_t2, axis=1) 473 | preds_test_t2 = np.argmax(preds_test_t2, axis=1) 474 | 475 | rows = [] 476 | for th in range(40, 60, 1): 477 | th = th * 0.01 478 | preds_th = [] 479 | for i in oof: 480 | if i > th: 481 | preds_th.append(1) 482 | else: 483 | preds_th.append(0) 484 | acc = metrics.accuracy_score(y_train_t1, preds_th) 485 | rows.append([th, acc]) 486 | acc_th = pd.DataFrame(rows, columns=['th', 'acc']) 487 | tmp = acc_th.sort_values('acc', ascending=False).head(1) 488 | best_th, best_acc = tmp.values[0] 489 | 490 | oof_metric_t2 = metric_func_t2( 491 | torch.from_numpy(oof_t2), torch.from_numpy(y_train_t2) 492 | ).item() 493 | 494 | logging.info(f'OOF ACC(t1): {best_acc}, TH:{best_th}, OOF ACC(t2): {oof_metric_t2}') 495 | save_mlflow(run_id, cv, fold_best_scores, best_acc, best_th, oof_metric_t2) 496 | 497 | # task1 498 | preds_test_th = [] 499 | for i in preds_test: 500 | if i > best_th: 501 | preds_test_th.append(1) 502 | else: 503 | preds_test_th.append(0) 504 | 505 | test_data_path = f'../submission_templates/submission_task_1_2.csv' 506 | sub = pd.read_csv(test_data_path) 507 | sub['IsCorrect'] = preds_test_th 508 | 509 | if not 
os.path.exists(f'{SUB_DIR}/{EXP_NAME}'): 510 | os.mkdir(f'{SUB_DIR}/{EXP_NAME}') 511 | 512 | sub_name = f'submission_task1__auc{cv}__acc{best_acc}__th{best_th}' 513 | valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}' 514 | if not os.path.exists(valid_sub_dir): 515 | os.mkdir(valid_sub_dir) 516 | 517 | sub.to_csv(f'{valid_sub_dir}/submission_task_1.csv', index=False) 518 | with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip: 519 | new_zip.write(f'{valid_sub_dir}/submission_task_1.csv', arcname='submission_task_1.csv') 520 | shutil.rmtree(valid_sub_dir) 521 | mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip') 522 | 523 | # task2 524 | test_data_path = f'../submission_templates/submission_task_1_2.csv' 525 | sub = pd.read_csv(test_data_path) 526 | sub['AnswerValue'] = preds_test_t2 527 | sub['AnswerValue'] = sub['AnswerValue'] + 1 528 | 529 | if not os.path.exists(f'{SUB_DIR}/{EXP_NAME}'): 530 | os.mkdir(f'{SUB_DIR}/{EXP_NAME}') 531 | 532 | sub_name = f'submission_task2__acc{oof_metric_t2}' 533 | valid_sub_dir = f'{SUB_DIR}/{EXP_NAME}/{sub_name}' 534 | if not os.path.exists(valid_sub_dir): 535 | os.mkdir(valid_sub_dir) 536 | 537 | sub.to_csv(f'{valid_sub_dir}/submission_task_2.csv', index=False) 538 | with zipfile.ZipFile(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zip: 539 | new_zip.write(f'{valid_sub_dir}/submission_task_2.csv', arcname='submission_task_2.csv') 540 | shutil.rmtree(valid_sub_dir) 541 | mlflow.log_artifact(f'{SUB_DIR}/{EXP_NAME}/{sub_name}.zip') 542 | 543 | mlflow.end_run() 544 | --------------------------------------------------------------------------------
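Note on the engineered features enumerated in the config above: only the feature names appear in this excerpt; the code that materialises the count-encoding ('*_ce'), target-encoding ('TE_s5_*') and SVD columns is not shown. For orientation, here is a minimal sketch of the smoothed target encoding that the 'TE_s{smooth}_{cols}_{target}' naming suggests, assuming the common pseudo-count shrinkage te = (group_sum + prior * smooth) / (group_count + smooth) with smooth = te_smooth_factor = 5 (for the 'AnswerValue_k' variants, the target would presumably be a binary indicator column). The helper name smoothed_target_encode and the fit-on-full-train usage are illustrative only; a leakage-safe variant would compute the group statistics out-of-fold.

import pandas as pd


def smoothed_target_encode(train, test, cols, target, smooth=5):
    # Illustrative sketch (not the repository's actual implementation):
    # encode `cols` by the mean of `target`, shrunk toward the global
    # prior by `smooth` pseudo-counts.
    if not isinstance(cols, list):
        cols = [cols]
    name = f"TE_s{smooth}_{'_'.join(cols)}_{target}"  # matches the config's naming scheme
    prior = train[target].mean()
    stats = train.groupby(cols)[target].agg(['sum', 'count'])
    te = ((stats['sum'] + prior * smooth) / (stats['count'] + smooth)).rename(name)
    for df in (train, test):
        df[name] = df.merge(te, left_on=cols, right_index=True, how='left')[name].values
    test[name] = test[name].fillna(prior)  # unseen category combinations fall back to the prior
    return name

# e.g. smoothed_target_encode(train, test, ['UserId', 'Confidence'], 'IsCorrect')
# adds the column 'TE_s5_UserId_Confidence_IsCorrect' to both frames.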