├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── config ├── data │ └── dataset.yaml ├── ensemble.yaml ├── generator │ └── features.yaml ├── models │ ├── autoint.yaml │ ├── catboost.yaml │ ├── fibinet.yaml │ ├── lightgbm.yaml │ ├── wdl.yaml │ ├── xdeepfm.yaml │ └── xgboost.yaml ├── predict.yaml ├── sampling.yaml └── train.yaml ├── environment.yaml ├── input └── .gitkeep ├── libs ├── data │ ├── __init__.py │ └── dataset.py ├── generator │ ├── __init__.py │ ├── encoder.py │ └── features.py ├── models │ ├── __init__.py │ ├── base.py │ ├── boosting.py │ └── dlrm.py └── utils │ ├── __init__.py │ └── utility.py ├── notebook └── eda.py ├── pyproject.toml ├── res ├── meta │ └── .gitkeep └── models │ └── .gitkeep └── scripts ├── covert_to_parquet.py ├── ensemble.py ├── predict.py ├── sampling.py ├── shell ├── cb_experiment.sh ├── fibinet_experiment.sh ├── lgb_experiment.sh ├── run.sh ├── sampling_dataset.sh ├── wdl_experiment.sh └── xdeepfm_experiment.sh └── train.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203, W503, E501 4 | exclude = 5 | .git, 6 | .gitignore, 7 | */migrations/*, 8 | __pycache__, 9 | per-file-ignores = 10 | src/*/__init__.py:F401,F403, 11 | notebook/eda.py:F401,E402 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | input/* 162 | !input/.gitkeep 163 | output/* 164 | !output/.gitkeep 165 | res/meta/* 166 | res/models/* 167 | !res/meta/.gitkeep 168 | !res/models/.gitkeep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ds wook 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web-ctr-prediction 2 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 3 | 4 | This repository contains the 1st place solution to the [web CTR prediction competition](https://dacon.io/competitions/official/236258/overview/description). 5 | 6 | 7 | ## Setting 8 | - CPU: i7-11799K (8 cores) 9 | - RAM: 32GB 10 | - GPU: NVIDIA GeForce RTX 3090 Ti 11 | 12 | 13 | ## Requirements 14 | 15 | `hydra-core==1.3.0` was added on top of the requirements provided by the competition. 16 | For `pytorch`, see https://pytorch.org/get-started/previous-versions/ and reinstall the version that matches your environment. 17 | 18 | You can create an environment with all required libraries by running: 19 | 20 | ```sh 21 | $ conda env create --file environment.yaml 22 | ``` 23 | 24 | ## Run code 25 | 26 | To reproduce the final model, run the full pipeline as follows. 27 | 28 | First convert the data and build the negative samples, then run each training shell, and finally the ensemble script. 29 | 30 | ```sh 31 | $ python -m scripts.covert_to_parquet 32 | $ sh scripts/shell/sampling_dataset.sh 33 | $ sh scripts/shell/lgb_experiment.sh 34 | $ sh scripts/shell/cb_experiment.sh 35 | $ sh scripts/shell/xdeepfm_experiment.sh 36 | $ sh scripts/shell/fibinet_experiment.sh 37 | $ python -m scripts.ensemble 38 | ``` 39 | 40 | Each experiment shell follows this pattern: 41 | 42 | ```sh 43 | MODEL_NAME="lightgbm" 44 | SAMPLING=0.45 45 | 46 | for seed in 517 1119 47 | do 48 | python -m scripts.train \ 49 | data.train=train_sample_${SAMPLING}_seed${seed} \ 50 | models=${MODEL_NAME} \ 51 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 52 | 53 | python -m scripts.predict \ 54 | models=${MODEL_NAME} \ 55 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \ 56 | output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 57 | done 58 | ``` 59 | 60 | ## Summary 61 | ![competition-model](https://github.com/ds-wook/web-ctr-prediction/assets/46340424/21f6f58c-1844-4d6b-a915-3afcacdca4a2) 62 | 63 | 64 | Simple is better than complex 65 | 66 | ## Negative Sampling 67 | Negative sampling is very important in recommender systems: it makes training feasible when the full volume of data is too large to train on directly. 68 | In my experiments, I used seeds 414 and 602 for a 40% negative sample, and seeds 517 and 1119 for a 45% negative sample. 69 | 70 | ## Features 71 | #### Label Encoder 72 | I label-encoded each categorical feature and trained on the encoded values, referring to the [kaggler](https://github.com/jeongyoonlee/Kaggler) code. 73 | 74 | 75 | #### Count features 76 | I encoded each categorical feature by its frequency of occurrence and added the counts as additional features. 77 | 78 | #### Gauss Rank 79 | ![gauss rank](https://github.com/ds-wook/web-ctr-prediction/assets/46340424/4d9ce6bc-8d6c-41f4-b001-298bb4538265) 80 | 81 | Gauss Rank scaling ranks the values of each numerical feature and maps the ranks onto a normal distribution. 82 | Normalizing each numerical feature in this way leads to better model performance; experiments showed higher scores than `MinMaxScaler`. 83 | 84 | ### Model 85 | Considering the characteristics of tabular data, we devised a strategy to train GBDT models and NN models, and then ensemble them.
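All of the models listed below are trained with the same stratified 5-fold out-of-fold loop (`BaseModel.run_cv_training` in `libs/models/base.py`). The following is only a condensed sketch of that loop, with scikit-learn's `LogisticRegression` standing in for the actual GBDT and deep CTR trainers; it is an illustration, not the exact implementation.

```python
# Condensed sketch of the shared CV loop (see BaseModel.run_cv_training in libs/models/base.py).
# LogisticRegression is only a stand-in for the LightGBM / CatBoost / deep CTR trainers.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def run_cv_training(X: pd.DataFrame, y: pd.Series, n_splits: int = 5, seed: int = 1119):
    oof_preds = np.zeros(len(X))
    models = {}
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid = X.iloc[valid_idx]

        model = LogisticRegression(max_iter=1000).fit(X_train, y_train)  # stand-in for each trainer's _fit
        oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]        # out-of-fold predictions for this fold
        models[f"fold_{fold}"] = model

    print(f"CV Score: {roc_auc_score(y, oof_preds):.6f}")
    return oof_preds, models
```

The per-fold models are stored together and their predictions are averaged at inference time (`scripts/predict.py`), which is what the `5fold-ctr-*` result names above refer to.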
86 | 87 | #### GBDT 88 | + LightGBM 89 | + With count features 90 | + StratifiedKFold: 5 91 | 92 | + CatBoost 93 | + Use GPU 94 | + cat_features parameter not used 95 | + With count features 96 | + StratifiedKFold: 5 97 | 98 | #### Deep CTR 99 | 100 | + xDeepFM 101 | + With Gauss Rank 102 | + StratifiedKFold: 5 103 | 104 | + FiBiNET 105 | + With Gauss Rank 106 | + StratifiedKFold: 5 107 | + Long training and inference time 108 | 109 | ### Ensemble 110 | #### Sigmoid Ensemble 111 | I used the concept of log-odds from logistic regression to construct an ensemble: 112 | $$\sigma(x)=\frac{1}{1 + e^{-x}}$$ 113 | $$\sigma^{-1}(x)=\log(\frac{x}{1-x})$$ 114 | $$\hat{y}=\sigma(\frac{1}{n}\sum_{i=1}^{n} \sigma^{-1}(x_i))=\sigma(\mathbb{E}[\sigma^{-1}(X)])$$ 115 | 116 | + It performed better than the other ensembling methods (rank, voting). 117 | + Since the predictions are probabilities, we average them in logit space and map the mean back through the sigmoid. 118 | 119 | 120 | ## Benchmark 121 | + Individual model results 122 | 123 | |Model|CV|Public LB|Private LB| 124 | |-----|--|---------|----------| 125 | |LightGBM-0.45 sampling|**0.7850**|0.7863|0.7866| 126 | |FiBiNET-0.45 sampling|0.7833|0.7861|0.7862| 127 | |xDeepFM-0.45 sampling|0.7819|**0.7866**|**0.7867**| 128 | |wide&deep-0.45 sampling|0.7807|0.7835|0.7837| 129 | |AutoInt-0.45 sampling|0.7813|0.7846|0.7848| 130 | |CatBoost-0.45 sampling|0.7765|0.7773|0.7778| 131 | 132 | + Ensemble results 133 | 134 | |Method|Public LB|Private LB| 135 | |------|---------|----------| 136 | |Rank Ensemble|0.7889|-| 137 | |Average Ensemble|0.7892|-| 138 | |Weighted average Ensemble|0.7891|-| 139 | |Sigmoid Ensemble|**0.7903**|**0.7905**| 140 | 141 | 142 | ## Doesn't Work 143 | + Day-based cross validation 144 | + Day feature 145 | + CatBoost with the cat_features parameter 146 | + XGBoost with GPU 147 | + Hash features (requires more RAM) 148 | + DeepFM 149 | + LightGBM DART 150 | 151 | ## Reference 152 | + [LightGBM: A Highly Efficient Gradient Boosting Decision Tree](https://lightgbm.readthedocs.io/en/stable/) 153 | + [Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792) 154 | + [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433) 155 | + [xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://arxiv.org/pdf/1803.05170) 156 | + [CatBoost is a high-performance open source library for gradient boosting on decision trees](https://catboost.ai/) 157 | + [Efficient Click-Through Rate Prediction for Developing Countries via Tabular Learning](https://arxiv.org/pdf/2104.07553) 158 | + [Label Encoder](https://github.com/jeongyoonlee/Kaggler/blob/master/kaggler/preprocessing/categorical.py) 159 | + [Gauss Rank](https://github.com/aldente0630/gauss-rank-scaler) 160 | + [Sigmoid Ensemble](https://www.kaggle.com/competitions/amex-default-prediction/discussion/329103) 161 | -------------------------------------------------------------------------------- /config/data/dataset.yaml: -------------------------------------------------------------------------------- 1 | path: input/web-ctr-prediction/ 2 | meta: res/meta/ 3 | shift: 1 4 | train: train_sample_0.45_seed517 5 | test: test 6 | submit: sample_submission 7 | target: Click 8 | n_splits: 5 9 | seed: 1119 10 | sampling: 0.3 11 | -------------------------------------------------------------------------------- /config/ensemble.yaml:
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - generator: features 5 | - models: lightgbm 6 | - override hydra/hydra_logging: disabled 7 | - override hydra/job_logging: disabled 8 | 9 | hydra: 10 | run: 11 | dir: . 12 | output_subdir: null 13 | 14 | output: 15 | path: output 16 | submission: sample_submission 17 | name: sigmoid-ensemble-final-4models-2sample 18 | 19 | preds: 20 | - 5fold-ctr-lightgbm-0.4-seed414 21 | - 5fold-ctr-lightgbm-0.4-seed602 22 | - 5fold-ctr-lightgbm-0.45-seed517 23 | - 5fold-ctr-lightgbm-0.45-seed1119 24 | - 5fold-ctr-fibinet-0.4-seed414 25 | - 5fold-ctr-fibinet-0.4-seed602 26 | - 5fold-ctr-fibinet-0.45-seed517 27 | - 5fold-ctr-fibinet-0.45-seed1119 28 | - 5fold-ctr-catboost-0.45-seed517 29 | - 5fold-ctr-catboost-0.45-seed1119 30 | - 5fold-ctr-xdeepfm-0.45-seed517 31 | - 5fold-ctr-xdeepfm-0.45-seed1119 -------------------------------------------------------------------------------- /config/generator/features.yaml: -------------------------------------------------------------------------------- 1 | cat_features: 2 | - F01 3 | - F02 4 | - F03 5 | - F05 6 | - F07 7 | - F08 8 | - F09 9 | - F10 10 | - F12 11 | - F13 12 | - F15 13 | - F16 14 | - F17 15 | - F20 16 | - F21 17 | - F22 18 | - F23 19 | - F25 20 | - F26 21 | - F28 22 | - F30 23 | - F31 24 | - F34 25 | - F35 26 | - F37 27 | - F39 28 | 29 | num_features: 30 | - F14 31 | - F04 32 | - F11 33 | - F18 34 | - F19 35 | - F24 36 | - F27 37 | - F29 38 | - F32 39 | - F33 40 | - F36 41 | - F38 42 | 43 | drop_features: 44 | - ID 45 | 46 | sparse_features: 47 | - F01 48 | - F02 49 | - F03 50 | - F05 51 | - F07 52 | - F08 53 | - F09 54 | - F10 55 | - F12 56 | - F13 57 | - F15 58 | - F16 59 | - F17 60 | - F20 61 | - F21 62 | - F22 63 | - F23 64 | - F25 65 | - F26 66 | - F28 67 | - F30 68 | - F31 69 | - F34 70 | - F35 71 | - F37 72 | - F39 73 | 74 | dense_features: 75 | - F14 76 | - F04 77 | - F11 78 | - F18 79 | - F19 80 | - F24 81 | - F27 82 | - F29 83 | - F32 84 | - F33 85 | - F36 86 | - F38 87 | -------------------------------------------------------------------------------- /config/models/autoint.yaml: -------------------------------------------------------------------------------- 1 | name: autoint 2 | 3 | device: cuda:0 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: min 13 | batch_size: 4096 14 | epochs: 3 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-autoint-0.4-seed414 -------------------------------------------------------------------------------- /config/models/catboost.yaml: -------------------------------------------------------------------------------- 1 | name: catboost 2 | 3 | params: 4 | iterations: 30000 5 | task_type: GPU 6 | loss_function: Logloss 7 | eval_metric: AUC 8 | simple_ctr: FeatureFreq 9 | combinations_ctr: FeatureFreq 10 | max_ctr_complexity: 4 11 | learning_rate: 0.3 12 | od_type: Iter 13 | allow_writing_files: False 14 | 15 | path: res/models/ 16 | results: 5fold-ctr-catboost 17 | early_stopping_rounds: 100 18 | seed: 42 19 | verbose_eval: 1000 -------------------------------------------------------------------------------- /config/models/fibinet.yaml: -------------------------------------------------------------------------------- 1 | name: fibinet 2 | 3 | device: cuda:0 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 
0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: max 13 | batch_size: 4096 14 | epochs: 3 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-fibinet-0.4-seed414 -------------------------------------------------------------------------------- /config/models/lightgbm.yaml: -------------------------------------------------------------------------------- 1 | name: lightgbm 2 | 3 | params: 4 | boosting_type: gbdt 5 | objective: binary 6 | metric: auc 7 | learning_rate: 0.05 8 | bagging_seed: 602 9 | num_leaves: 256 10 | max_depth: -1 11 | min_child_weight: 0.03 12 | bagging_fraction: 0.4 13 | feature_fraction: 0.3 14 | lambda_l1: 0.4 15 | lambda_l2: 0.6 16 | num_threads: -1 17 | 18 | path: res/models/ 19 | results: 5fold-ctr-lightgbm-0.4-seed1119 20 | early_stopping_rounds: 100 21 | num_boost_round: 10000 22 | verbose_eval: 100 23 | seed: 42 -------------------------------------------------------------------------------- /config/models/wdl.yaml: -------------------------------------------------------------------------------- 1 | name: wdl 2 | 3 | device: cuda:0 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: min 13 | batch_size: 4096 14 | epochs: 5 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-wdl-0.4-seed42 -------------------------------------------------------------------------------- /config/models/xdeepfm.yaml: -------------------------------------------------------------------------------- 1 | name: xdeepfm 2 | 3 | device: cuda 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: min 13 | batch_size: 4096 14 | epochs: 5 15 | gpus: 16 | - 0 17 | - 1 18 | 19 | path: res/models/ 20 | results: 5fold-ctr-dien-0.4-seed414 -------------------------------------------------------------------------------- /config/models/xgboost.yaml: -------------------------------------------------------------------------------- 1 | name: xgboost 2 | 3 | params: 4 | eta: 0.03 5 | subsample: 0.8 6 | colsample_bytree: 0.8 7 | alpha: 0.01 8 | lambda: 0.01 9 | seed: 42 10 | method: hist 11 | device: cuda 12 | objective: binary:logistic 13 | eval_metric: auc 14 | n_jobs: -1 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-xgboost-0.4-seed1119 18 | early_stopping_rounds: 100 19 | num_boost_round: 10000 20 | verbose_eval: 100 21 | seed: 42 -------------------------------------------------------------------------------- /config/predict.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - generator: features 5 | - models: lightgbm 6 | - override hydra/hydra_logging: disabled 7 | - override hydra/job_logging: disabled 8 | 9 | hydra: 10 | run: 11 | dir: . 12 | output_subdir: null 13 | 14 | output: 15 | path: output 16 | submission: sample_submission 17 | name: 5fold-ctr-lightgbm-0.4-seed1119 18 | 19 | seed: 42 20 | -------------------------------------------------------------------------------- /config/sampling.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - models: lightgbm 5 | - override hydra/hydra_logging: disabled 6 | - override hydra/job_logging: disabled 7 | 8 | hydra: 9 | run: 10 | dir: . 
11 | output_subdir: null 12 | 13 | output: 14 | path: output 15 | submission: sample_submission.csv 16 | name: train_sample 17 | 18 | -------------------------------------------------------------------------------- /config/train.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - generator: features 5 | - models: catboost 6 | - override hydra/hydra_logging: disabled 7 | - override hydra/job_logging: disabled 8 | 9 | hydra: 10 | run: 11 | dir: . 12 | output_subdir: null 13 | 14 | output: 15 | path: output 16 | submission: sample_submission.csv 17 | name: 5fold-ctr-catboost.csv 18 | 19 | seed: 42 20 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: ctr 2 | channels: 3 | - defaults 4 | - conda-forge 5 | - pytorch 6 | dependencies: 7 | - pip 8 | - python=3.10.13 9 | - pip: 10 | - black==24.1.1 11 | - category-encoders==2.6.3 12 | - deepctr-torch==0.2.9 13 | - flake8==7.0.0 14 | - isort==5.13.2 15 | - jupyter==1.0.0 16 | - openpyxl==3.0.10 17 | - catboost==1.2.2 18 | - hydra-core==1.3.0 19 | - holidays==0.24 20 | - lightgbm==3.3.2 21 | - matplotlib==3.5.3 22 | - seaborn==0.11.0 23 | - numpy==1.24.3 24 | - pandas==2.1.4 25 | - plotly==5.18.0 26 | - polars==0.19.19 27 | - prettytable==3.8.0 28 | - pyarrow==16.0.0 29 | - pytorch-tabnet==4.1.0 30 | - pytorch_optimizer==2.12.0 31 | - scikit-learn==1.2.2 32 | - scipy==1.10.1 33 | - torch==2.3.1 34 | - torchaudio==2.3.1 35 | - torchvision==0.18.1 36 | - tqdm==4.65.0 37 | - xgboost==2.0.1 38 | - wandb==0.17.2 -------------------------------------------------------------------------------- /input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-wook/web-ctr-prediction/362c7c940f87b1c6f204764680cbb1bd8139f307/input/.gitkeep -------------------------------------------------------------------------------- /libs/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import * -------------------------------------------------------------------------------- /libs/data/dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import joblib 4 | import pandas as pd 5 | from category_encoders import CountEncoder 6 | from omegaconf import DictConfig 7 | from sklearn.preprocessing import QuantileTransformer 8 | 9 | from libs.generator import FeatureEngineering, LabelEncoder 10 | 11 | 12 | class DataStorage: 13 | def __init__(self, cfg: DictConfig): 14 | self.cfg = cfg 15 | 16 | def _categorize_train_features(self, train_x: pd.DataFrame) -> pd.DataFrame: 17 | """Categorical encoding for train data 18 | Args: 19 | config: config 20 | train: dataframe 21 | Returns: 22 | dataframe 23 | """ 24 | le = LabelEncoder() 25 | train_x[[*self.cfg.generator.cat_features]] = le.fit_transform(train_x[[*self.cfg.generator.cat_features]]) 26 | joblib.dump(le, Path(self.cfg.data.meta) / "label_encoder.pkl") 27 | 28 | return train_x 29 | 30 | def _categorize_test_features(self, test_x: pd.DataFrame) -> pd.DataFrame: 31 | """Categorical encoding for test data 32 | Args: 33 | config: config 34 | test: dataframe 35 | Returns: 36 | dataframe 37 | """ 38 | 39 | le = joblib.load(Path(self.cfg.data.meta) / "label_encoder.pkl") 40 | 
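        # Reuse the encoder fitted on the training data; categories unseen at fit time map to label 0
        # (see LabelEncoder in libs/generator/encoder.py).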
test_x[[*self.cfg.generator.cat_features]] = le.transform(test_x[[*self.cfg.generator.cat_features]]) 41 | 42 | return test_x 43 | 44 | def _count_train_features(self, train_x: pd.DataFrame) -> pd.DataFrame: 45 | """Categorical encoding for train data 46 | Args: 47 | config: config 48 | train: dataframe 49 | Returns: 50 | dataframe 51 | """ 52 | cnt = CountEncoder() 53 | train_enc = cnt.fit_transform(train_x[[*self.cfg.generator.cat_features]]) 54 | train_x = train_x.join(train_enc.add_suffix("_count")) 55 | joblib.dump(cnt, Path(self.cfg.data.meta) / "count_encoder.pkl") 56 | 57 | return train_x 58 | 59 | def _count_test_features(self, test_x: pd.DataFrame) -> pd.DataFrame: 60 | """Categorical encoding for test data 61 | Args: 62 | config: config 63 | test: dataframe 64 | Returns: 65 | dataframe 66 | """ 67 | 68 | cnt = joblib.load(Path(self.cfg.data.meta) / "count_encoder.pkl") 69 | test_enc = cnt.transform(test_x[[*self.cfg.generator.cat_features]]) 70 | test_x = test_x.join(test_enc.add_suffix("_count")) 71 | 72 | return test_x 73 | 74 | def _numerical_train_scaling(self, train: pd.DataFrame) -> pd.DataFrame: 75 | """Numerical scaling 76 | Args: 77 | config: config 78 | train: dataframe 79 | test: dataframe 80 | Returns: 81 | dataframe 82 | """ 83 | scaler = QuantileTransformer(n_quantiles=100, output_distribution="normal") 84 | train[[*self.cfg.generator.num_features]] = scaler.fit_transform(train[[*self.cfg.generator.num_features]]) 85 | joblib.dump(scaler, Path(self.cfg.data.meta) / "rankgauss.pkl") 86 | 87 | return train 88 | 89 | def _numerical_test_scaling(self, test: pd.DataFrame) -> pd.DataFrame: 90 | """Numerical scaling 91 | Args: 92 | config: config 93 | test: dataframe 94 | Returns: 95 | dataframe 96 | """ 97 | scaler = joblib.load(Path(self.cfg.data.meta) / "rankgauss.pkl") 98 | test[[*self.cfg.generator.num_features]] = scaler.transform(test[[*self.cfg.generator.num_features]]) 99 | 100 | return test 101 | 102 | def load_train_dataset(self) -> pd.DataFrame: 103 | train = pd.read_parquet(Path(self.cfg.data.path) / f"{self.cfg.data.train}.parquet") 104 | 105 | feature_engineering = FeatureEngineering(self.cfg) 106 | 107 | if self.cfg.models.name == "lightgbm": 108 | train = self._categorize_train_features(train) 109 | train = feature_engineering.convert_categorical_features(train) 110 | train = self._count_train_features(train) 111 | 112 | elif self.cfg.models.name == "catboost": 113 | train = self._categorize_train_features(train) 114 | train = feature_engineering.reduce_mem_usage(train) 115 | train = self._count_train_features(train) 116 | 117 | else: 118 | train = self._categorize_train_features(train) 119 | train = self._numerical_train_scaling(train) 120 | train = train.fillna(0) 121 | 122 | train = feature_engineering.reduce_mem_usage(train) 123 | 124 | train_x = train.drop(columns=[*self.cfg.generator.drop_features, self.cfg.data.target]) 125 | train_y = train[self.cfg.data.target] 126 | 127 | return train_x, train_y 128 | 129 | def load_test_dataset(self) -> pd.DataFrame: 130 | test = pd.read_parquet(Path(self.cfg.data.path) / "test.parquet") 131 | 132 | feature_engineering = FeatureEngineering(self.cfg) 133 | 134 | if self.cfg.models.name == "lightgbm": 135 | test = self._categorize_test_features(test) 136 | test = feature_engineering.convert_categorical_features(test) 137 | test = self._count_test_features(test) 138 | 139 | elif self.cfg.models.name == "catboost": 140 | test = self._categorize_test_features(test) 141 | test = self._count_test_features(test) 142 | 
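        # Deep CTR models (WDL, xDeepFM, AutoInt, FiBiNET): label-encode the categorical features,
        # apply Gauss Rank scaling to the numeric features, and fill the remaining NaNs with 0.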
143 | else: 144 | test = self._categorize_test_features(test) 145 | test = self._numerical_test_scaling(test) 146 | test = test.fillna(0) 147 | 148 | test = feature_engineering.reduce_mem_usage(test) 149 | 150 | test_x = test.drop(columns=self.cfg.generator.drop_features) 151 | 152 | return test_x 153 | -------------------------------------------------------------------------------- /libs/generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoder import * 2 | from .features import * 3 | -------------------------------------------------------------------------------- /libs/generator/encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator 4 | from typing_extensions import Self 5 | 6 | NAN_INT = 7535805 7 | 8 | 9 | class LabelEncoder(BaseEstimator): 10 | """Label Encoder that groups infrequent values into one label. 11 | 12 | Attributes: 13 | min_obs (int): minimum number of observation to assign a label. 14 | label_encoders (list of dict): label encoders for columns 15 | label_maxes (list of int): maximum of labels for columns 16 | """ 17 | 18 | def __init__(self, min_obs: int = 10): 19 | """Initialize the OneHotEncoder class object. 20 | 21 | Args: 22 | min_obs (int): minimum number of observation to assign a label. 23 | """ 24 | 25 | self.min_obs = min_obs 26 | self.is_fitted = False 27 | 28 | def __repr__(self): 29 | return ("LabelEncoder(min_obs={})").format(self.min_obs) 30 | 31 | def _get_label_encoder_and_max(self, x: pd.Series) -> tuple[dict, int]: 32 | """Return a mapping from values and its maximum of a column to integer labels. 33 | 34 | Args: 35 | x (pandas.Series): a categorical column to encode. 36 | 37 | Returns: 38 | (tuple): 39 | - (dict): mapping from values of features to integers 40 | - (int): maximum label 41 | """ 42 | 43 | # NaN cannot be used as a key for dict. Impute it with a random 44 | # integer. 45 | label_count = x.fillna(NAN_INT).value_counts() 46 | n_uniq = label_count.shape[0] 47 | 48 | label_count = label_count[label_count >= self.min_obs] 49 | n_uniq_new = label_count.shape[0] 50 | 51 | # If every label appears more than min_obs, new label starts from 0. 52 | # Otherwise, new label starts from 1 and 0 is used for all old labels 53 | # that appear less than min_obs. 54 | offset = 0 if n_uniq == n_uniq_new else 1 55 | 56 | label_encoder = pd.Series(np.arange(n_uniq_new) + offset, index=label_count.index) 57 | max_label = label_encoder.max() 58 | label_encoder = label_encoder.to_dict() 59 | 60 | return label_encoder, max_label 61 | 62 | def _transform_col(self, x: pd.Series, i: int) -> pd.Series: 63 | """Encode one categorical column into labels. 64 | 65 | Args: 66 | x (pandas.Series): a categorical column to encode 67 | i (int): column index 68 | 69 | Returns: 70 | (pandas.Series): a column with labels. 
71 | """ 72 | return x.fillna(NAN_INT).map(self.label_encoders[i]).fillna(0).astype(int) 73 | 74 | def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> Self: 75 | self.label_encoders = [None] * X.shape[1] 76 | self.label_maxes = [None] * X.shape[1] 77 | 78 | for i, col in enumerate(X.columns): 79 | ( 80 | self.label_encoders[i], 81 | self.label_maxes[i], 82 | ) = self._get_label_encoder_and_max(X[col]) 83 | 84 | self.is_fitted = True 85 | return self 86 | 87 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 88 | """Encode categorical columns into label encoded columns 89 | 90 | Args: 91 | X (pandas.DataFrame): categorical columns to encode 92 | 93 | Returns: 94 | (pandas.DataFrame): label encoded columns 95 | """ 96 | 97 | assert self.is_fitted, "fit() or fit_transform() must be called before transform()." 98 | 99 | X = X.copy() 100 | for i, col in enumerate(X.columns): 101 | X.loc[:, col] = self._transform_col(X[col], i) 102 | 103 | return X 104 | 105 | def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame: 106 | """Encode categorical columns into label encoded columns 107 | 108 | Args: 109 | X (pandas.DataFrame): categorical columns to encode 110 | 111 | Returns: 112 | (pandas.DataFrame): label encoded columns 113 | """ 114 | 115 | self.label_encoders = [None] * X.shape[1] 116 | self.label_maxes = [None] * X.shape[1] 117 | 118 | X = X.copy() 119 | for i, col in enumerate(X.columns): 120 | ( 121 | self.label_encoders[i], 122 | self.label_maxes[i], 123 | ) = self._get_label_encoder_and_max(X[col]) 124 | 125 | X.loc[:, col] = X[col].fillna(NAN_INT).map(self.label_encoders[i]).fillna(0).astype(int) 126 | 127 | self.is_fitted = True 128 | return X 129 | -------------------------------------------------------------------------------- /libs/generator/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from omegaconf import DictConfig 4 | from tqdm import tqdm 5 | 6 | 7 | class FeatureEngineering: 8 | def __init__(self, cfg: DictConfig): 9 | self.cfg = cfg 10 | 11 | def convert_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame: 12 | with tqdm(total=len(self.cfg.generator.cat_features), desc="Convert features") as pbar: 13 | # Convert to category type 14 | for col in self.cfg.generator.cat_features: 15 | df[col] = df[col].astype("category") 16 | pbar.update(1) 17 | 18 | return df 19 | 20 | def reduce_mem_usage(self, df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame: 21 | """ 22 | Iterate through all the columns of a dataframe and modify the data type to reduce memory usage. 
23 | """ 24 | numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] 25 | start_mem = df.memory_usage().sum() / 1024**2 26 | 27 | for col in df.columns: 28 | col_type = df[col].dtypes 29 | if col_type in numerics: 30 | c_min = df[col].min() 31 | c_max = df[col].max() 32 | 33 | if str(col_type)[:3] == "int": 34 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 35 | df[col] = df[col].astype(np.int8) 36 | 37 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 38 | df[col] = df[col].astype(np.int16) 39 | 40 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 41 | df[col] = df[col].astype(np.int32) 42 | 43 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 44 | df[col] = df[col].astype(np.int64) 45 | else: 46 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 47 | df[col] = df[col].astype(np.float16) 48 | 49 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 50 | df[col] = df[col].astype(np.float32) 51 | 52 | else: 53 | df[col] = df[col].astype(np.float64) 54 | 55 | end_mem = df.memory_usage().sum() / 1024**2 56 | 57 | if verbose: 58 | print( 59 | f"Mem. usage decreased to {end_mem:5.2f} Mb " 60 | + f"({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)" 61 | ) 62 | 63 | return df 64 | -------------------------------------------------------------------------------- /libs/models/__init__.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | 3 | from .base import * 4 | from .boosting import * 5 | from .boosting import CatBoostTrainer, LightGBMTrainer, XGBoostTrainer 6 | from .dlrm import AutoIntTrainer, FiBiNetTranier, WDLTrainer, XDeepFMTrainer 7 | 8 | BulidModel = ( 9 | CatBoostTrainer 10 | | LightGBMTrainer 11 | | XGBoostTrainer 12 | | WDLTrainer 13 | | FiBiNetTranier 14 | | XDeepFMTrainer 15 | | AutoIntTrainer 16 | ) 17 | 18 | 19 | def build_model(cfg: DictConfig) -> BulidModel: 20 | model_type = { 21 | "lightgbm": LightGBMTrainer(cfg), 22 | "xgboost": XGBoostTrainer(cfg), 23 | "catboost": CatBoostTrainer(cfg), 24 | "wdl": WDLTrainer(cfg), 25 | "fibinet": FiBiNetTranier(cfg), 26 | "xdeepfm": XDeepFMTrainer(cfg), 27 | "autoint": AutoIntTrainer(cfg), 28 | } 29 | 30 | if trainer := model_type.get(cfg.models.name): 31 | return trainer 32 | 33 | else: 34 | raise NotImplementedError(f"Model '{cfg.models.name}' is not implemented.") 35 | -------------------------------------------------------------------------------- /libs/models/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import gc 4 | from abc import ABC, abstractmethod 5 | from dataclasses import dataclass 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | import joblib 10 | import lightgbm as lgb 11 | import numpy as np 12 | import pandas as pd 13 | import wandb 14 | import xgboost as xgb 15 | from catboost import CatBoostClassifier 16 | from deepctr_torch.models import WDL, AutoInt, FiBiNET, xDeepFM 17 | from omegaconf import DictConfig 18 | from sklearn.metrics import roc_auc_score 19 | from sklearn.model_selection import StratifiedKFold 20 | from typing_extensions import Self 21 | 22 | 23 | @dataclass 24 | class ModelResult: 25 | oof_preds: np.ndarray 26 | models: dict[str, Any] 27 | 28 | 29 | class BaseModel(ABC): 30 | def __init__(self, cfg: DictConfig): 31 | self.cfg = cfg 32 | 33 | @abstractmethod 34 | 
def _fit( 35 | self, 36 | X_train: pd.DataFrame | np.ndarray, 37 | y_train: pd.Series | np.ndarray, 38 | X_valid: pd.DataFrame | np.ndarray | None = None, 39 | y_valid: pd.Series | np.ndarray | None = None, 40 | ): 41 | raise NotImplementedError 42 | 43 | def save_model(self, save_dir: Path) -> None: 44 | joblib.dump(self.result, save_dir / f"{self.cfg.models.results}.pkl") 45 | 46 | def fit( 47 | self, 48 | X_train: pd.DataFrame | np.ndarray, 49 | y_train: pd.Series | np.ndarray, 50 | X_valid: pd.DataFrame | np.ndarray | None = None, 51 | y_valid: pd.Series | np.ndarray | None = None, 52 | ) -> Any: 53 | model = self._fit(X_train, y_train, X_valid, y_valid) 54 | 55 | return model 56 | 57 | def _predict(self, model: Any, X: pd.DataFrame | np.ndarray) -> np.ndarray: 58 | if isinstance(model, xgb.Booster): 59 | return model.predict(xgb.DMatrix(X)) 60 | 61 | elif isinstance(model, CatBoostClassifier): 62 | return model.predict_proba(X)[:, 1] 63 | 64 | elif isinstance(model, WDL | xDeepFM | AutoInt | FiBiNET): 65 | feature_names = [*self.cfg.generator.sparse_features, *self.cfg.generator.dense_features] 66 | valid_model_input = {name: X[name] for name in feature_names} 67 | 68 | return model.predict(valid_model_input, batch_size=512).flatten() 69 | 70 | elif isinstance(model, lgb.Booster): 71 | return model.predict(X) 72 | 73 | else: 74 | raise ValueError("Model not supported") 75 | 76 | def run_cv_training(self, X: pd.DataFrame, y: pd.Series) -> Self: 77 | oof_preds = np.zeros(X.shape[0]) 78 | models = {} 79 | kfold = StratifiedKFold(n_splits=self.cfg.data.n_splits, shuffle=True, random_state=self.cfg.data.seed) 80 | 81 | for fold, (train_idx, valid_idx) in enumerate(iterable=kfold.split(X, y), start=1): 82 | with wandb.init(project="competition", name=f"{self.cfg.models.name}-fold-{fold}", dir="never"): 83 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 84 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 85 | 86 | model = self.fit(X_train, y_train, X_valid, y_valid) 87 | oof_preds[valid_idx] = self._predict(model, X_valid) 88 | 89 | models[f"fold_{fold}"] = model 90 | 91 | del model, X_train, X_valid, y_train, y_valid 92 | gc.collect() 93 | 94 | self.result = ModelResult(oof_preds=oof_preds, models=models) 95 | 96 | print(f"CV Score: {roc_auc_score(y, oof_preds):.6f}") 97 | 98 | del oof_preds, y, models 99 | gc.collect() 100 | 101 | return self 102 | -------------------------------------------------------------------------------- /libs/models/boosting.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | import pandas as pd 6 | import wandb.integration.catboost as wandb_cb 7 | import wandb.integration.lightgbm as wandb_lgb 8 | import wandb.integration.xgboost as wandb_xgb 9 | import xgboost as xgb 10 | from catboost import CatBoostClassifier, Pool 11 | from omegaconf import DictConfig, OmegaConf 12 | 13 | from libs.models import BaseModel 14 | 15 | 16 | class XGBoostTrainer(BaseModel): 17 | def __init__(self, cfg: DictConfig): 18 | super().__init__(cfg) 19 | 20 | def _fit( 21 | self, 22 | X_train: pd.DataFrame | np.ndarray, 23 | y_train: pd.Series | np.ndarray, 24 | X_valid: pd.DataFrame | np.ndarray | None = None, 25 | y_valid: pd.Series | np.ndarray | None = None, 26 | ) -> xgb.Booster: 27 | dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True) 28 | dvalid = xgb.DMatrix(X_valid, y_valid, enable_categorical=True) 29 | 30 | params = 
OmegaConf.to_container(self.cfg.models.params) 31 | params["seed"] = self.cfg.models.seed 32 | 33 | model = xgb.train( 34 | params=params, 35 | dtrain=dtrain, 36 | evals=[(dtrain, "train"), (dvalid, "eval")], 37 | num_boost_round=self.cfg.models.num_boost_round, 38 | early_stopping_rounds=self.cfg.models.early_stopping_rounds, 39 | verbose_eval=self.cfg.models.verbose_eval, 40 | callbacks=[wandb_xgb.WandbCallback()], 41 | ) 42 | 43 | return model 44 | 45 | 46 | class CatBoostTrainer(BaseModel): 47 | def __init__(self, cfg: DictConfig): 48 | super().__init__(cfg) 49 | 50 | def _fit( 51 | self, 52 | X_train: pd.DataFrame | np.ndarray, 53 | y_train: pd.Series | np.ndarray, 54 | X_valid: pd.DataFrame | np.ndarray | None = None, 55 | y_valid: pd.Series | np.ndarray | None = None, 56 | ) -> CatBoostClassifier: 57 | train_set = Pool(X_train, y_train) 58 | valid_set = Pool(X_valid, y_valid) 59 | 60 | params = OmegaConf.to_container(self.cfg.models.params) 61 | model = CatBoostClassifier(random_state=self.cfg.models.seed, **params) 62 | 63 | model.fit( 64 | train_set, 65 | eval_set=valid_set, 66 | verbose_eval=self.cfg.models.verbose_eval, 67 | early_stopping_rounds=self.cfg.models.early_stopping_rounds, 68 | callbacks=[wandb_cb.WandbCallback()], 69 | ) 70 | 71 | wandb_cb.log_summary(model) 72 | 73 | return model 74 | 75 | 76 | class LightGBMTrainer(BaseModel): 77 | def __init__(self, cfg: DictConfig): 78 | super().__init__(cfg) 79 | 80 | def _fit( 81 | self, 82 | X_train: pd.DataFrame | np.ndarray, 83 | y_train: pd.Series | np.ndarray, 84 | X_valid: pd.DataFrame | np.ndarray | None = None, 85 | y_valid: pd.Series | np.ndarray | None = None, 86 | ) -> lgb.Booster: 87 | train_set = lgb.Dataset(X_train, y_train, categorical_feature=self.cfg.generator.cat_features) 88 | valid_set = lgb.Dataset(X_valid, y_valid, categorical_feature=self.cfg.generator.cat_features) 89 | 90 | params = OmegaConf.to_container(self.cfg.models.params) 91 | params["seed"] = self.cfg.models.seed 92 | 93 | model = lgb.train( 94 | params=params, 95 | train_set=train_set, 96 | valid_sets=[train_set, valid_set], 97 | num_boost_round=self.cfg.models.num_boost_round, 98 | categorical_feature=[*self.cfg.generator.cat_features], 99 | callbacks=[ 100 | lgb.log_evaluation(self.cfg.models.verbose_eval), 101 | lgb.early_stopping(self.cfg.models.early_stopping_rounds), 102 | wandb_lgb.wandb_callback(), 103 | ], 104 | ) 105 | 106 | wandb_lgb.log_summary(model) 107 | 108 | return model 109 | -------------------------------------------------------------------------------- /libs/models/dlrm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from deepctr_torch.callbacks import EarlyStopping 4 | from deepctr_torch.inputs import DenseFeat, SparseFeat, get_feature_names 5 | from deepctr_torch.models import WDL, AutoInt, FiBiNET, xDeepFM 6 | from omegaconf import DictConfig 7 | from pytorch_optimizer import MADGRAD 8 | 9 | from libs.models import BaseModel 10 | 11 | 12 | class WDLTrainer(BaseModel): 13 | def __init__(self, cfg: DictConfig): 14 | super().__init__(cfg) 15 | 16 | def _fit( 17 | self, 18 | X_train: pd.DataFrame | np.ndarray, 19 | y_train: pd.Series | np.ndarray, 20 | X_valid: pd.DataFrame | np.ndarray | None = None, 21 | y_valid: pd.Series | np.ndarray | None = None, 22 | ) -> WDL: 23 | 24 | feature_columns = [ 25 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 26 | for feat in self.cfg.generator.sparse_features 27 | ] + 
[DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 28 | 29 | feature_names = get_feature_names(feature_columns) 30 | 31 | train_model_input = {name: X_train[name] for name in feature_names} 32 | valid_model_input = {name: X_valid[name] for name in feature_names} 33 | 34 | model = WDL( 35 | dnn_feature_columns=feature_columns, 36 | linear_feature_columns=feature_columns, 37 | device=self.cfg.models.device, 38 | gpus=self.cfg.models.gpus, 39 | seed=self.cfg.models.seed, 40 | l2_reg_linear=self.cfg.models.l2_reg_linear, 41 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 42 | dnn_activation=self.cfg.models.dnn_activation, 43 | dnn_dropout=self.cfg.models.dnn_dropout, 44 | dnn_use_bn=True, 45 | ) 46 | 47 | model.compile( 48 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 49 | "binary_crossentropy", 50 | metrics=["binary_crossentropy", "auc"], 51 | ) 52 | 53 | es = EarlyStopping( 54 | monitor="val_auc", 55 | min_delta=0, 56 | verbose=self.cfg.models.verbose, 57 | patience=self.cfg.models.patience, 58 | mode=self.cfg.models.mode, 59 | ) 60 | 61 | model.fit( 62 | train_model_input, 63 | y_train.values, 64 | batch_size=self.cfg.models.batch_size, 65 | epochs=self.cfg.models.epochs, 66 | verbose=self.cfg.models.verbose, 67 | validation_data=(valid_model_input, y_valid.values), 68 | callbacks=[es], 69 | ) 70 | 71 | return model 72 | 73 | 74 | class XDeepFMTrainer(BaseModel): 75 | def __init__(self, cfg: DictConfig): 76 | super().__init__(cfg) 77 | 78 | def _fit( 79 | self, 80 | X_train: pd.DataFrame | np.ndarray, 81 | y_train: pd.Series | np.ndarray, 82 | X_valid: pd.DataFrame | np.ndarray | None = None, 83 | y_valid: pd.Series | np.ndarray | None = None, 84 | ) -> xDeepFM: 85 | 86 | feature_columns = [ 87 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 88 | for feat in self.cfg.generator.sparse_features 89 | ] + [DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 90 | 91 | feature_names = get_feature_names(feature_columns) 92 | 93 | train_model_input = {name: X_train[name] for name in feature_names} 94 | valid_model_input = {name: X_valid[name] for name in feature_names} 95 | 96 | model = xDeepFM( 97 | dnn_feature_columns=feature_columns, 98 | linear_feature_columns=feature_columns, 99 | device=self.cfg.models.device, 100 | gpus=self.cfg.models.gpus, 101 | seed=self.cfg.models.seed, 102 | l2_reg_linear=self.cfg.models.l2_reg_linear, 103 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 104 | dnn_activation=self.cfg.models.dnn_activation, 105 | dnn_dropout=self.cfg.models.dnn_dropout, 106 | dnn_use_bn=True, 107 | ) 108 | 109 | model.compile( 110 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 111 | "binary_crossentropy", 112 | metrics=["binary_crossentropy", "auc"], 113 | ) 114 | 115 | es = EarlyStopping( 116 | monitor="val_auc", 117 | min_delta=0, 118 | verbose=self.cfg.models.verbose, 119 | patience=self.cfg.models.patience, 120 | mode=self.cfg.models.mode, 121 | ) 122 | 123 | model.fit( 124 | train_model_input, 125 | y_train.values, 126 | batch_size=self.cfg.models.batch_size, 127 | epochs=self.cfg.models.epochs, 128 | verbose=self.cfg.models.verbose, 129 | validation_data=(valid_model_input, y_valid.values), 130 | callbacks=[es], 131 | ) 132 | 133 | return model 134 | 135 | 136 | class FiBiNetTranier(BaseModel): 137 | def __init__(self, cfg: DictConfig): 138 | super().__init__(cfg) 139 | 140 | def _fit( 141 | self, 142 | X_train: pd.DataFrame | np.ndarray, 143 | y_train: pd.Series | np.ndarray, 144 | X_valid: 
pd.DataFrame | np.ndarray | None = None, 145 | y_valid: pd.Series | np.ndarray | None = None, 146 | ) -> FiBiNET: 147 | 148 | feature_columns = [ 149 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 150 | for feat in self.cfg.generator.sparse_features 151 | ] + [DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 152 | 153 | feature_names = get_feature_names(feature_columns) 154 | 155 | train_model_input = {name: X_train[name] for name in feature_names} 156 | valid_model_input = {name: X_valid[name] for name in feature_names} 157 | 158 | model = FiBiNET( 159 | dnn_feature_columns=feature_columns, 160 | linear_feature_columns=feature_columns, 161 | device=self.cfg.models.device, 162 | gpus=self.cfg.models.gpus, 163 | seed=self.cfg.models.seed, 164 | l2_reg_linear=self.cfg.models.l2_reg_linear, 165 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 166 | dnn_activation=self.cfg.models.dnn_activation, 167 | dnn_dropout=self.cfg.models.dnn_dropout, 168 | ) 169 | 170 | model.compile( 171 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 172 | "binary_crossentropy", 173 | metrics=["binary_crossentropy", "auc"], 174 | ) 175 | 176 | es = EarlyStopping( 177 | monitor="val_auc", 178 | min_delta=0, 179 | verbose=self.cfg.models.verbose, 180 | patience=self.cfg.models.patience, 181 | mode=self.cfg.models.mode, 182 | ) 183 | 184 | model.fit( 185 | train_model_input, 186 | y_train.values, 187 | batch_size=self.cfg.models.batch_size, 188 | epochs=self.cfg.models.epochs, 189 | verbose=self.cfg.models.verbose, 190 | validation_data=(valid_model_input, y_valid.values), 191 | callbacks=[es], 192 | ) 193 | 194 | return model 195 | 196 | 197 | class AutoIntTrainer(BaseModel): 198 | def __init__(self, cfg: DictConfig): 199 | super().__init__(cfg) 200 | 201 | def _fit( 202 | self, 203 | X_train: pd.DataFrame | np.ndarray, 204 | y_train: pd.Series | np.ndarray, 205 | X_valid: pd.DataFrame | np.ndarray | None = None, 206 | y_valid: pd.Series | np.ndarray | None = None, 207 | ) -> AutoInt: 208 | 209 | feature_columns = [ 210 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 211 | for feat in self.cfg.generator.sparse_features 212 | ] + [DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 213 | 214 | feature_names = get_feature_names(feature_columns) 215 | 216 | train_model_input = {name: X_train[name] for name in feature_names} 217 | valid_model_input = {name: X_valid[name] for name in feature_names} 218 | 219 | model = AutoInt( 220 | dnn_feature_columns=feature_columns, 221 | linear_feature_columns=feature_columns, 222 | device=self.cfg.models.device, 223 | gpus=self.cfg.models.gpus, 224 | seed=self.cfg.models.seed, 225 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 226 | dnn_activation=self.cfg.models.dnn_activation, 227 | dnn_dropout=self.cfg.models.dnn_dropout, 228 | dnn_use_bn=True, 229 | ) 230 | 231 | model.compile( 232 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 233 | "binary_crossentropy", 234 | metrics=["binary_crossentropy", "auc"], 235 | ) 236 | 237 | es = EarlyStopping( 238 | monitor="val_auc", 239 | min_delta=0, 240 | verbose=self.cfg.models.verbose, 241 | patience=self.cfg.models.patience, 242 | mode=self.cfg.models.mode, 243 | ) 244 | 245 | model.fit( 246 | train_model_input, 247 | y_train.values, 248 | batch_size=self.cfg.models.batch_size, 249 | epochs=self.cfg.models.epochs, 250 | verbose=self.cfg.models.verbose, 251 | validation_data=(valid_model_input, y_valid.values), 252 | callbacks=[es], 
253 | ) 254 | 255 | return model 256 | -------------------------------------------------------------------------------- /libs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utility import * 2 | -------------------------------------------------------------------------------- /libs/utils/utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | def seed_everything(seed: int = 42) -> None: 9 | os.environ["PYTHONHASHSEED"] = str(seed) 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | torch.cuda.manual_seed(seed) 14 | torch.cuda.manual_seed_all(seed) 15 | torch.backends.cudnn.deterministic = True 16 | torch.backends.cudnn.benchmark = False 17 | -------------------------------------------------------------------------------- /notebook/eda.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | train = pd.read_parquet("../input/web-ctr-prediction/train.parquet") 8 | 9 | # %% 10 | print(train.shape) 11 | 12 | # %% 13 | train["Click"].value_counts(normalize=True) 14 | 15 | # %% 16 | sns.countplot(data=train, x="Click") 17 | plt.show() 18 | # %% 19 | train.head() 20 | # %% 21 | train.info() 22 | 23 | # %% 24 | cat_features = train.dtypes[train.dtypes == "object"].index.tolist() 25 | cat_features 26 | # %% 27 | num_features = train.dtypes[train.dtypes != "object"].index.tolist() 28 | # %% 29 | num_features 30 | # %% 31 | sns.histplot(data=train, x="F04", bins=100) 32 | plt.show() 33 | # %% 34 | for col in num_features: 35 | sns.histplot(data=train, x=col, bins=100) 36 | plt.show() 37 | 38 | # %% 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | multi_line_output = 3 4 | include_trailing_comma = true 5 | force_grid_wrap = 0 6 | use_parentheses = true 7 | ensure_newline_before_comments = true 8 | line_length = 120 9 | 10 | [tool.black] 11 | line-length = 120 12 | target-version = ['py36', 'py37', 'py38'] 13 | exclude = ''' 14 | \.git 15 | | \.mypy_cache 16 | | \.tox 17 | | venv 18 | | \.venv 19 | | _build 20 | | buck-out 21 | | build 22 | | dist 23 | | ^.*\b(migrations)\b.*$ 24 | ''' -------------------------------------------------------------------------------- /res/meta/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-wook/web-ctr-prediction/362c7c940f87b1c6f204764680cbb1bd8139f307/res/meta/.gitkeep -------------------------------------------------------------------------------- /res/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-wook/web-ctr-prediction/362c7c940f87b1c6f204764680cbb1bd8139f307/res/models/.gitkeep -------------------------------------------------------------------------------- /scripts/covert_to_parquet.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import hydra 6 | import pandas as pd 7 | from omegaconf import DictConfig 8 | from tqdm import tqdm 9 | 10 | 11 | 
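# Convert the raw competition CSVs to parquet: train.csv is read in 1,000,000-row chunks and
# concatenated before being written out, and test.csv is converted directly.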
@hydra.main(config_path="../config/", config_name="train", version_base="1.2.0") 12 | def _main(cfg: DictConfig): 13 | train = pd.DataFrame() 14 | 15 | for chunk in tqdm(pd.read_csv(Path(cfg.data.path) / "train.csv", chunksize=1000000)): 16 | train = pd.concat([train, chunk]) 17 | 18 | train.to_parquet(Path(cfg.data.path) / "train.parquet") 19 | 20 | test = pd.read_csv(Path(cfg.data.path) / "test.csv") 21 | test.to_parquet(Path(cfg.data.path) / "test.parquet") 22 | 23 | 24 | if __name__ == "__main__": 25 | _main() 26 | -------------------------------------------------------------------------------- /scripts/ensemble.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import hydra 6 | import numpy as np 7 | import pandas as pd 8 | from omegaconf import DictConfig 9 | from scipy.stats import rankdata 10 | from tqdm import tqdm 11 | 12 | 13 | def ensemble_predictions(predictions: list[np.ndarray], weights: list[float], method: str = "linear") -> np.ndarray: 14 | assert np.isclose(np.sum(weights), 1.0) 15 | if method == "linear": 16 | res = np.average(predictions, weights=weights, axis=0) 17 | 18 | elif method == "harmonic": 19 | res = np.average([1 / p for p in predictions], weights=weights, axis=0) 20 | return 1 / res 21 | 22 | elif method == "geometric": 23 | numerator = np.average([np.log(p) for p in predictions], weights=weights, axis=0) 24 | res = np.exp(numerator / sum(weights)) 25 | return res 26 | 27 | elif method == "rank": 28 | res = np.average([rankdata(p) for p in predictions], weights=weights, axis=0) 29 | return res / (len(res) + 1) 30 | 31 | elif method == "sigmoid": 32 | logit_values = np.log(predictions / (1 - predictions)) 33 | result = np.average(logit_values, weights=weights, axis=0) 34 | return 1 / (1 + np.exp(-result)) 35 | 36 | else: 37 | raise ValueError(f"Unknown ensemble method: {method}") 38 | 39 | return res 40 | 41 | 42 | def calculate_sigmoid_preds(values: list[np.ndarray]) -> np.ndarray: 43 | """ 44 | Calculate the sigmoid result of the ensemble predictions. 
45 |     :param values: list of predictions, each containing probabilities in (0, 1)
46 |     :return: ensemble prediction, computed as the element-wise sigmoid of
47 |         the mean logit across all predictions (no per-model weights are applied)
48 |     """
49 |     values = np.array(values)
50 | 
51 |     logit_values = np.log(values / (1 - values))
52 |     result = np.mean(logit_values, axis=0)
53 | 
54 |     return 1 / (1 + np.exp(-result))
55 | 
56 | 
57 | @hydra.main(config_path="../config/", config_name="ensemble", version_base="1.3.1")
58 | def _main(cfg: DictConfig):
59 |     # Load submission file
60 |     submit = pd.read_csv(Path(cfg.data.path) / f"{cfg.data.submit}.csv")
61 | 
62 |     # Load the per-model prediction files listed in cfg.preds
63 |     preds = [
64 |         pd.read_csv(Path(cfg.output.path) / f"{pred}.csv")[cfg.data.target].to_numpy()
65 |         for pred in tqdm(cfg.preds, desc="Loading predictions", colour="red", total=len(cfg.preds))
66 |     ]
67 | 
68 |     # Blend the predictions in logit space
69 |     submit[cfg.data.target] = calculate_sigmoid_preds(preds)
70 | 
71 |     # Save the ensembled submission
72 |     submit.to_csv(Path(cfg.output.path) / f"{cfg.output.name}.csv", index=False)
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     _main()
77 | 
--------------------------------------------------------------------------------
/scripts/predict.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from pathlib import Path
4 | 
5 | import hydra
6 | import joblib
7 | import lightgbm as lgb
8 | import numpy as np
9 | import pandas as pd
10 | import xgboost as xgb
11 | from catboost import CatBoostClassifier
12 | from deepctr_torch.models import WDL, AutoInt, FiBiNET, xDeepFM
13 | from omegaconf import DictConfig
14 | from tqdm import tqdm
15 | 
16 | from libs.data import DataStorage
17 | 
18 | 
19 | def inference_models(cfg: DictConfig, test_x: pd.DataFrame | dict[str, pd.Series]) -> np.ndarray:
20 |     """Average the saved cross-validation models' predicted click probabilities on the test set.
21 |     Args:
22 |         cfg: Hydra config pointing to the saved fold models and feature lists
23 |         test_x: test dataframe (or dict of feature arrays for the DeepCTR models)
24 |     Returns:
25 |         click probabilities averaged over the cross-validation folds
26 |     """
27 |     # load the saved cross-validation models
28 |     results = joblib.load(Path(cfg.models.path) / f"{cfg.models.results}.pkl")
29 |     folds = len(results.models)
30 |     preds = np.zeros((test_x.shape[0],))
31 | 
32 |     for model in tqdm(results.models.values(), total=folds, desc="Predicting models", colour="blue"):
33 |         if isinstance(model, lgb.Booster):
34 |             preds += model.predict(test_x) / folds
35 | 
36 |         elif isinstance(model, xgb.Booster):
37 |             preds += model.predict(xgb.DMatrix(test_x)) / folds
38 | 
39 |         elif isinstance(model, (WDL, xDeepFM, AutoInt, FiBiNET)):
40 |             test_model_input = {
41 |                 name: test_x[name] for name in [*cfg.generator.sparse_features, *cfg.generator.dense_features]
42 |             }
43 |             preds += model.predict(test_model_input, batch_size=1024).flatten() / folds
44 | 
45 |         elif isinstance(model, CatBoostClassifier):
46 |             preds += model.predict_proba(test_x)[:, 1] / folds
47 | 
48 |         else:
49 |             raise ValueError(f"Model {model} not supported")
50 | 
51 |     return preds
52 | 
53 | 
54 | @hydra.main(config_path="../config/", config_name="predict", version_base="1.3.1")
55 | def _main(cfg: DictConfig):
56 | 
57 |     # load test dataset
58 |     data_storage = DataStorage(cfg)
59 |     test_x = data_storage.load_test_dataset()
60 | 
61 |     # load submit dataset
62 |     submit = pd.read_csv(Path(cfg.data.path) / f"{cfg.data.submit}.csv")
63 | 
64 |     # predict
65 |     preds = inference_models(cfg, test_x)
66 |     submit[cfg.data.target] = preds
67 |     submit.to_csv(Path(cfg.output.path) / f"{cfg.models.results}.csv", index=False)
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     _main()
72 | 
--------------------------------------------------------------------------------
/scripts/sampling.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import gc
4 | from pathlib import Path
5 | 
6 | import hydra
7 | import pandas as pd
8 | import pyarrow.parquet as pq
9 | from omegaconf import DictConfig
10 | from tqdm import tqdm
11 | 
12 | 
13 | def negative_sampling_train_dataset(cfg: DictConfig) -> pd.DataFrame:
14 |     pfile = pq.ParquetFile(Path(cfg.data.path) / "train.parquet")
15 | 
16 |     negative = pd.DataFrame()
17 |     positive = pd.DataFrame()
18 |     chunksize = 10**7
19 | 
20 |     for chunk in tqdm(pfile.iter_batches(batch_size=chunksize), desc="Sampling data", leave=False):
21 |         chunk = chunk.to_pandas()
22 |         positive_sample = chunk[chunk["Click"] == 1]
23 |         negative_sample = chunk[chunk["Click"] == 0]
24 |         negative = pd.concat([negative, negative_sample], axis=0, ignore_index=True)
25 |         positive = pd.concat([positive, positive_sample], axis=0, ignore_index=True)
26 | 
27 |     # keep every clicked row and only a cfg.data.sampling fraction of the non-clicked rows
28 |     negative_sample = negative.sample(frac=cfg.data.sampling, replace=False, random_state=cfg.data.seed)
29 |     train = pd.concat([negative_sample, positive], axis=0, ignore_index=True)
30 |     del negative, positive
31 | 
32 |     return train
33 | 
34 | 
35 | @hydra.main(config_path="../config/", config_name="sampling", version_base="1.2.0")
36 | def _main(cfg: DictConfig):
37 |     # load dataset
38 |     train = negative_sampling_train_dataset(cfg)
39 | 
40 |     # save dataset
41 |     train.to_parquet(Path(cfg.data.path) / f"{cfg.data.train}.parquet")
42 | 
43 |     del train
44 |     gc.collect()
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     _main()
49 | 
--------------------------------------------------------------------------------
/scripts/shell/cb_experiment.sh:
--------------------------------------------------------------------------------
1 | MODEL_NAME="catboost"
2 | SAMPLING=0.45
3 | 
4 | for seed in 517 1119
5 | do
6 |     python -m scripts.train \
7 |         data.train=train_sample_${SAMPLING}_seed${seed} \
8 |         models=${MODEL_NAME} \
9 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
10 | 
11 |     python -m scripts.predict \
12 |         models=${MODEL_NAME} \
13 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
14 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
15 | done
--------------------------------------------------------------------------------
/scripts/shell/fibinet_experiment.sh:
--------------------------------------------------------------------------------
1 | MODEL_NAME="fibinet"
2 | SAMPLING=0.45
3 | 
4 | for seed in 517 1119
5 | do
6 |     python -m scripts.train \
7 |         data.train=train_sample_${SAMPLING}_seed${seed} \
8 |         models=${MODEL_NAME} \
9 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
10 | 
11 |     python -m scripts.predict \
12 |         models=${MODEL_NAME} \
13 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
14 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
15 | done
16 | 
17 | SAMPLING=0.4
18 | 
19 | for seed in 414 602
20 | do
21 |     python -m scripts.train \
22 |         data.train=train_sample_${SAMPLING}_seed${seed} \
23 |         models=${MODEL_NAME} \
24 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
25 | 
26 |     python -m scripts.predict \
27 |         models=${MODEL_NAME} \
28 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
29 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
30 | done
--------------------------------------------------------------------------------
/scripts/shell/lgb_experiment.sh:
--------------------------------------------------------------------------------
1 | MODEL_NAME="lightgbm"
2 | SAMPLING=0.45
3 | 
4 | for seed in 517 1119
5 | do
6 |     python -m scripts.train \
7 |         data.train=train_sample_${SAMPLING}_seed${seed} \
8 |         models=${MODEL_NAME} \
9 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
10 | 
11 |     python -m scripts.predict \
12 |         models=${MODEL_NAME} \
13 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
14 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
15 | done
16 | 
17 | SAMPLING=0.4
18 | 
19 | for seed in 414 602
20 | do
21 |     python -m scripts.train \
22 |         data.train=train_sample_${SAMPLING}_seed${seed} \
23 |         models=${MODEL_NAME} \
24 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
25 | 
26 |     python -m scripts.predict \
27 |         models=${MODEL_NAME} \
28 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
29 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
30 | done
31 | 
--------------------------------------------------------------------------------
/scripts/shell/run.sh:
--------------------------------------------------------------------------------
1 | python -m scripts.covert_to_parquet
2 | sh scripts/shell/sampling_dataset.sh
3 | sh scripts/shell/lgb_experiment.sh
4 | sh scripts/shell/cb_experiment.sh
5 | sh scripts/shell/xdeepfm_experiment.sh
6 | sh scripts/shell/fibinet_experiment.sh
7 | python -m scripts.ensemble
8 | 
--------------------------------------------------------------------------------
/scripts/shell/sampling_dataset.sh:
--------------------------------------------------------------------------------
1 | for sampling in 0.4 0.45
2 | do
3 |     for seed in 517 1119
4 |     do
5 |         python -m scripts.sampling \
6 | 
data.seed=${seed} \ 7 | data.sampling=${sampling} \ 8 | data.train=train_sample_${sampling}_seed${seed} 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /scripts/shell/wdl_experiment.sh: -------------------------------------------------------------------------------- 1 | MODEL_NAME="wdl" 2 | SAMPLING=0.45 3 | 4 | for seed in 517 1119 5 | do 6 | python -m scripts.train \ 7 | data.train=train_sample_${SAMPLING}_seed${seed} \ 8 | models=${MODEL_NAME} \ 9 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 10 | 11 | python -m scripts.predict \ 12 | models=${MODEL_NAME} \ 13 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \ 14 | output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 15 | done -------------------------------------------------------------------------------- /scripts/shell/xdeepfm_experiment.sh: -------------------------------------------------------------------------------- 1 | export PYTHONHASHSEED=0 2 | 3 | MODEL_NAME="xdeepfm" 4 | SAMPLING=0.45 5 | 6 | for seed in 517 1119 7 | do 8 | python -m scripts.train \ 9 | data.train=train_sample_${SAMPLING}_seed${seed} \ 10 | models=${MODEL_NAME} \ 11 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 12 | 13 | python -m scripts.predict \ 14 | models=${MODEL_NAME} \ 15 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \ 16 | output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 17 | done -------------------------------------------------------------------------------- /scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from pathlib import Path 5 | 6 | import hydra 7 | from omegaconf import DictConfig 8 | 9 | from libs.data import DataStorage 10 | from libs.models import build_model 11 | 12 | 13 | @hydra.main(config_path="../config/", config_name="train", version_base="1.2.0") 14 | def _main(cfg: DictConfig): 15 | with warnings.catch_warnings(): 16 | warnings.filterwarnings("ignore", category=UserWarning) 17 | 18 | # load dataset 19 | data_storage = DataStorage(cfg) 20 | train_x, train_y = data_storage.load_train_dataset() 21 | 22 | # choose trainer 23 | trainer = build_model(cfg) 24 | 25 | # train model 26 | trainer.run_cv_training(train_x, train_y) 27 | 28 | # save model 29 | trainer.save_model(Path(cfg.models.path)) 30 | 31 | 32 | if __name__ == "__main__": 33 | _main() 34 | --------------------------------------------------------------------------------
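
A minimal, self-contained sketch (not a file in this repository) of the logit-space blending that calculate_sigmoid_preds in scripts/ensemble.py performs: each model's probabilities are mapped to logits, the logits are averaged, and the mean is pushed back through the sigmoid. The toy arrays below are illustrative assumptions, not competition data.

import numpy as np

# Two illustrative sets of model probabilities for three test rows.
preds_a = np.array([0.10, 0.80, 0.55])
preds_b = np.array([0.20, 0.70, 0.60])

# sigmoid(mean(logit(p))), the same rule calculate_sigmoid_preds applies.
values = np.stack([preds_a, preds_b])
logits = np.log(values / (1 - values))
blended = 1 / (1 + np.exp(-logits.mean(axis=0)))

print(blended)  # approximately [0.143, 0.753, 0.575]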
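
Similarly, a small sketch (again not a repository file) of the down-sampling rule in scripts/sampling.py: every clicked row is kept and only a cfg.data.sampling fraction of the non-clicked rows is drawn. The tiny frame, the F01 column, and the 0.45 fraction below are illustrative; only the "Click" label convention matches the real dataset.

import pandas as pd

# Toy frame using the dataset's "Click" label convention (values are made up).
df = pd.DataFrame({"Click": [1, 0, 0, 0, 0, 0, 0, 0, 1, 0], "F01": range(10)})

positive = df[df["Click"] == 1]  # keep every click
negative = df[df["Click"] == 0].sample(frac=0.45, replace=False, random_state=42)  # sample ~45% of non-clicks

train_sample = pd.concat([negative, positive], ignore_index=True)
print(len(positive), len(negative), len(train_sample))  # 2 clicks plus a 45% sample of the 8 non-clicks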