├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── config ├── data │ └── dataset.yaml ├── ensemble.yaml ├── generator │ └── features.yaml ├── models │ ├── autoint.yaml │ ├── catboost.yaml │ ├── fibinet.yaml │ ├── lightgbm.yaml │ ├── wdl.yaml │ ├── xdeepfm.yaml │ └── xgboost.yaml ├── predict.yaml ├── sampling.yaml └── train.yaml ├── environment.yaml ├── input └── .gitkeep ├── libs ├── data │ ├── __init__.py │ └── dataset.py ├── generator │ ├── __init__.py │ ├── encoder.py │ └── features.py ├── models │ ├── __init__.py │ ├── base.py │ ├── boosting.py │ └── dlrm.py └── utils │ ├── __init__.py │ └── utility.py ├── notebook └── eda.py ├── pyproject.toml ├── res ├── meta │ └── .gitkeep └── models │ └── .gitkeep └── scripts ├── covert_to_parquet.py ├── ensemble.py ├── predict.py ├── sampling.py ├── shell ├── cb_experiment.sh ├── fibinet_experiment.sh ├── lgb_experiment.sh ├── run.sh ├── sampling_dataset.sh ├── wdl_experiment.sh └── xdeepfm_experiment.sh └── train.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203, W503, E501 4 | exclude = 5 | .git, 6 | .gitignore, 7 | */migrations/*, 8 | __pycache__, 9 | per-file-ignores = 10 | src/*/__init__.py:F401,F403, 11 | notebook/eda.py:F401,E402 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | input/* 162 | !input/.gitkeep 163 | output/* 164 | !output/.gitkeep 165 | res/meta/* 166 | res/models/* 167 | !res/meta/.gitkeep 168 | !res/models/.gitkeep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ds wook 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web-ctr-prediction 2 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 3 | 4 | This repository contains the 1st place solution to the [web CTR prediction competition](https://dacon.io/competitions/official/236258/overview/description). 5 | 6 | 7 | ## Setting 8 | - CPU: i7-11799K (8 cores) 9 | - RAM: 32GB 10 | - GPU: NVIDIA GeForce RTX 3090 Ti 11 | 12 | 13 | ## Requirements 14 | 15 | `hydra-core==1.3.0` was added on top of the requirements provided by the competition. 16 | For `pytorch`, see https://pytorch.org/get-started/previous-versions/ and reinstall the version that matches your environment. 17 | 18 | You can create an environment with all required libraries by running: 19 | 20 | ```sh 21 | $ conda env create --file environment.yaml 22 | ``` 23 | 24 | ## Run code 25 | 26 | To reproduce the final model, run the full pipeline as follows. 27 | 28 | First convert the data and build the negative samples, then run each training shell, and finally the ensemble script. 29 | 30 | ```sh 31 | $ python -m scripts.covert_to_parquet 32 | $ sh scripts/shell/sampling_dataset.sh 33 | $ sh scripts/shell/lgb_experiment.sh 34 | $ sh scripts/shell/cb_experiment.sh 35 | $ sh scripts/shell/xdeepfm_experiment.sh 36 | $ sh scripts/shell/fibinet_experiment.sh 37 | $ python -m scripts.ensemble 38 | ``` 39 | 40 | Each experiment shell follows this pattern: 41 | 42 | ```sh 43 | MODEL_NAME="lightgbm" 44 | SAMPLING=0.45 45 | 46 | for seed in 517 1119 47 | do 48 | python -m scripts.train \ 49 | data.train=train_sample_${SAMPLING}_seed${seed} \ 50 | models=${MODEL_NAME} \ 51 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 52 | 53 | python -m scripts.predict \ 54 | models=${MODEL_NAME} \ 55 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \ 56 | output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 57 | done 58 | ``` 59 | 60 | ## Summary 61 | ![competition-model](https://github.com/ds-wook/web-ctr-prediction/assets/46340424/21f6f58c-1844-4d6b-a915-3afcacdca4a2) 62 | 63 | 64 | Simple is better than complex 65 | 66 | ## Negative Sampling 67 | Negative sampling is very important in recommender systems: it makes training feasible when the full volume of data is too large to train on directly. 68 | In my experiments, I used seeds 414 and 602 for a 40% negative sample, and seeds 517 and 1119 for a 45% negative sample. 69 | 70 | ## Features 71 | #### Label Encoder 72 | I label-encoded each categorical feature and trained on the encoded values, referring to the [kaggler](https://github.com/jeongyoonlee/Kaggler) code. 73 | 74 | 75 | #### Count features 76 | I encoded each categorical feature by its frequency of occurrence and added the counts as additional features. 77 | 78 | #### Gauss Rank 79 | ![gauss rank](https://github.com/ds-wook/web-ctr-prediction/assets/46340424/4d9ce6bc-8d6c-41f4-b001-298bb4538265) 80 | 81 | Gauss Rank scaling ranks the values of each numerical feature and maps the ranks onto a normal distribution. 82 | Normalizing each numerical feature in this way leads to better model performance; experiments showed higher scores than `MinMaxScaler`. 83 | 84 | ### Model 85 | Considering the characteristics of tabular data, we devised a strategy to train GBDT models and NN models, and then ensemble them.
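All of the models listed below are trained with the same stratified 5-fold out-of-fold loop (`BaseModel.run_cv_training` in `libs/models/base.py`). The following is only a condensed sketch of that loop, with scikit-learn's `LogisticRegression` standing in for the actual GBDT and deep CTR trainers; it is an illustration, not the exact implementation.

```python
# Condensed sketch of the shared CV loop (see BaseModel.run_cv_training in libs/models/base.py).
# LogisticRegression is only a stand-in for the LightGBM / CatBoost / deep CTR trainers.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def run_cv_training(X: pd.DataFrame, y: pd.Series, n_splits: int = 5, seed: int = 1119):
    oof_preds = np.zeros(len(X))
    models = {}
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid = X.iloc[valid_idx]

        model = LogisticRegression(max_iter=1000).fit(X_train, y_train)  # stand-in for each trainer's _fit
        oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]        # out-of-fold predictions for this fold
        models[f"fold_{fold}"] = model

    print(f"CV Score: {roc_auc_score(y, oof_preds):.6f}")
    return oof_preds, models
```

The per-fold models are stored together and their predictions are averaged at inference time (`scripts/predict.py`), which is what the `5fold-ctr-*` result names above refer to.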
86 | 87 | #### GBDT 88 | + LightGBM 89 | + With count features 90 | + StratifiedKFold: 5 91 | 92 | + CatBoost 93 | + Use GPU 94 | + cat_features parameter not used 95 | + With count features 96 | + StratifiedKFold: 5 97 | 98 | #### Deep CTR 99 | 100 | + xDeepFM 101 | + With Gauss Rank 102 | + StratifiedKFold: 5 103 | 104 | + FiBiNET 105 | + With Gauss Rank 106 | + StratifiedKFold: 5 107 | + Long training and inference time 108 | 109 | ### Ensemble 110 | #### Sigmoid Ensemble 111 | I used the concept of log-odds from logistic regression to construct an ensemble: 112 | $$\sigma(x)=\frac{1}{1 + e^{-x}}$$ 113 | $$\sigma^{-1}(x)=\log(\frac{x}{1-x})$$ 114 | $$\hat{y}=\sigma(\frac{1}{n}\sum_{i=1}^{n} \sigma^{-1}(x_i))=\sigma(\mathbb{E}[\sigma^{-1}(X)])$$ 115 | 116 | + It performed better than the other ensembling methods (rank, voting). 117 | + Since the predictions are probabilities, we average them in logit space and map the mean back through the sigmoid. 118 | 119 | 120 | ## Benchmark 121 | + Individual model results 122 | 123 | |Model|CV|Public LB|Private LB| 124 | |-----|--|---------|----------| 125 | |LightGBM-0.45 sampling|**0.7850**|0.7863|0.7866| 126 | |FiBiNET-0.45 sampling|0.7833|0.7861|0.7862| 127 | |xDeepFM-0.45 sampling|0.7819|**0.7866**|**0.7867**| 128 | |wide&deep-0.45 sampling|0.7807|0.7835|0.7837| 129 | |AutoInt-0.45 sampling|0.7813|0.7846|0.7848| 130 | |CatBoost-0.45 sampling|0.7765|0.7773|0.7778| 131 | 132 | + Ensemble results 133 | 134 | |Method|Public LB|Private LB| 135 | |------|---------|----------| 136 | |Rank Ensemble|0.7889|-| 137 | |Average Ensemble|0.7892|-| 138 | |Weighted average Ensemble|0.7891|-| 139 | |Sigmoid Ensemble|**0.7903**|**0.7905**| 140 | 141 | 142 | ## Doesn't Work 143 | + Day-based cross validation 144 | + Day feature 145 | + CatBoost with the cat_features parameter 146 | + XGBoost with GPU 147 | + Hash features (requires more RAM) 148 | + DeepFM 149 | + LightGBM DART 150 | 151 | ## Reference 152 | + [LightGBM: A Highly Efficient Gradient Boosting Decision Tree](https://lightgbm.readthedocs.io/en/stable/) 153 | + [Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792) 154 | + [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433) 155 | + [xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://arxiv.org/pdf/1803.05170) 156 | + [CatBoost is a high-performance open source library for gradient boosting on decision trees](https://catboost.ai/) 157 | + [Efficient Click-Through Rate Prediction for Developing Countries via Tabular Learning](https://arxiv.org/pdf/2104.07553) 158 | + [Label Encoder](https://github.com/jeongyoonlee/Kaggler/blob/master/kaggler/preprocessing/categorical.py) 159 | + [Gauss Rank](https://github.com/aldente0630/gauss-rank-scaler) 160 | + [Sigmoid Ensemble](https://www.kaggle.com/competitions/amex-default-prediction/discussion/329103) 161 | -------------------------------------------------------------------------------- /config/data/dataset.yaml: -------------------------------------------------------------------------------- 1 | path: input/web-ctr-prediction/ 2 | meta: res/meta/ 3 | shift: 1 4 | train: train_sample_0.45_seed517 5 | test: test 6 | submit: sample_submission 7 | target: Click 8 | n_splits: 5 9 | seed: 1119 10 | sampling: 0.3 11 | -------------------------------------------------------------------------------- /config/ensemble.yaml:
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - generator: features 5 | - models: lightgbm 6 | - override hydra/hydra_logging: disabled 7 | - override hydra/job_logging: disabled 8 | 9 | hydra: 10 | run: 11 | dir: . 12 | output_subdir: null 13 | 14 | output: 15 | path: output 16 | submission: sample_submission 17 | name: sigmoid-ensemble-final-4models-2sample 18 | 19 | preds: 20 | - 5fold-ctr-lightgbm-0.4-seed414 21 | - 5fold-ctr-lightgbm-0.4-seed602 22 | - 5fold-ctr-lightgbm-0.45-seed517 23 | - 5fold-ctr-lightgbm-0.45-seed1119 24 | - 5fold-ctr-fibinet-0.4-seed414 25 | - 5fold-ctr-fibinet-0.4-seed602 26 | - 5fold-ctr-fibinet-0.45-seed517 27 | - 5fold-ctr-fibinet-0.45-seed1119 28 | - 5fold-ctr-catboost-0.45-seed517 29 | - 5fold-ctr-catboost-0.45-seed1119 30 | - 5fold-ctr-xdeepfm-0.45-seed517 31 | - 5fold-ctr-xdeepfm-0.45-seed1119 -------------------------------------------------------------------------------- /config/generator/features.yaml: -------------------------------------------------------------------------------- 1 | cat_features: 2 | - F01 3 | - F02 4 | - F03 5 | - F05 6 | - F07 7 | - F08 8 | - F09 9 | - F10 10 | - F12 11 | - F13 12 | - F15 13 | - F16 14 | - F17 15 | - F20 16 | - F21 17 | - F22 18 | - F23 19 | - F25 20 | - F26 21 | - F28 22 | - F30 23 | - F31 24 | - F34 25 | - F35 26 | - F37 27 | - F39 28 | 29 | num_features: 30 | - F14 31 | - F04 32 | - F11 33 | - F18 34 | - F19 35 | - F24 36 | - F27 37 | - F29 38 | - F32 39 | - F33 40 | - F36 41 | - F38 42 | 43 | drop_features: 44 | - ID 45 | 46 | sparse_features: 47 | - F01 48 | - F02 49 | - F03 50 | - F05 51 | - F07 52 | - F08 53 | - F09 54 | - F10 55 | - F12 56 | - F13 57 | - F15 58 | - F16 59 | - F17 60 | - F20 61 | - F21 62 | - F22 63 | - F23 64 | - F25 65 | - F26 66 | - F28 67 | - F30 68 | - F31 69 | - F34 70 | - F35 71 | - F37 72 | - F39 73 | 74 | dense_features: 75 | - F14 76 | - F04 77 | - F11 78 | - F18 79 | - F19 80 | - F24 81 | - F27 82 | - F29 83 | - F32 84 | - F33 85 | - F36 86 | - F38 87 | -------------------------------------------------------------------------------- /config/models/autoint.yaml: -------------------------------------------------------------------------------- 1 | name: autoint 2 | 3 | device: cuda:0 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: min 13 | batch_size: 4096 14 | epochs: 3 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-autoint-0.4-seed414 -------------------------------------------------------------------------------- /config/models/catboost.yaml: -------------------------------------------------------------------------------- 1 | name: catboost 2 | 3 | params: 4 | iterations: 30000 5 | task_type: GPU 6 | loss_function: Logloss 7 | eval_metric: AUC 8 | simple_ctr: FeatureFreq 9 | combinations_ctr: FeatureFreq 10 | max_ctr_complexity: 4 11 | learning_rate: 0.3 12 | od_type: Iter 13 | allow_writing_files: False 14 | 15 | path: res/models/ 16 | results: 5fold-ctr-catboost 17 | early_stopping_rounds: 100 18 | seed: 42 19 | verbose_eval: 1000 -------------------------------------------------------------------------------- /config/models/fibinet.yaml: -------------------------------------------------------------------------------- 1 | name: fibinet 2 | 3 | device: cuda:0 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 
0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: max 13 | batch_size: 4096 14 | epochs: 3 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-fibinet-0.4-seed414 -------------------------------------------------------------------------------- /config/models/lightgbm.yaml: -------------------------------------------------------------------------------- 1 | name: lightgbm 2 | 3 | params: 4 | boosting_type: gbdt 5 | objective: binary 6 | metric: auc 7 | learning_rate: 0.05 8 | bagging_seed: 602 9 | num_leaves: 256 10 | max_depth: -1 11 | min_child_weight: 0.03 12 | bagging_fraction: 0.4 13 | feature_fraction: 0.3 14 | lambda_l1: 0.4 15 | lambda_l2: 0.6 16 | num_threads: -1 17 | 18 | path: res/models/ 19 | results: 5fold-ctr-lightgbm-0.4-seed1119 20 | early_stopping_rounds: 100 21 | num_boost_round: 10000 22 | verbose_eval: 100 23 | seed: 42 -------------------------------------------------------------------------------- /config/models/wdl.yaml: -------------------------------------------------------------------------------- 1 | name: wdl 2 | 3 | device: cuda:0 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: min 13 | batch_size: 4096 14 | epochs: 5 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-wdl-0.4-seed42 -------------------------------------------------------------------------------- /config/models/xdeepfm.yaml: -------------------------------------------------------------------------------- 1 | name: xdeepfm 2 | 3 | device: cuda 4 | seed: 42 5 | l2_reg_linear: 0.0001 6 | l2_reg_embedding: 0.0001 7 | dnn_activation: prelu 8 | lr: 0.0001 9 | dnn_dropout: 0.5 10 | verbose: 1 11 | patience: 2 12 | mode: min 13 | batch_size: 4096 14 | epochs: 5 15 | gpus: 16 | - 0 17 | - 1 18 | 19 | path: res/models/ 20 | results: 5fold-ctr-dien-0.4-seed414 -------------------------------------------------------------------------------- /config/models/xgboost.yaml: -------------------------------------------------------------------------------- 1 | name: xgboost 2 | 3 | params: 4 | eta: 0.03 5 | subsample: 0.8 6 | colsample_bytree: 0.8 7 | alpha: 0.01 8 | lambda: 0.01 9 | seed: 42 10 | method: hist 11 | device: cuda 12 | objective: binary:logistic 13 | eval_metric: auc 14 | n_jobs: -1 15 | 16 | path: res/models/ 17 | results: 5fold-ctr-xgboost-0.4-seed1119 18 | early_stopping_rounds: 100 19 | num_boost_round: 10000 20 | verbose_eval: 100 21 | seed: 42 -------------------------------------------------------------------------------- /config/predict.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - generator: features 5 | - models: lightgbm 6 | - override hydra/hydra_logging: disabled 7 | - override hydra/job_logging: disabled 8 | 9 | hydra: 10 | run: 11 | dir: . 12 | output_subdir: null 13 | 14 | output: 15 | path: output 16 | submission: sample_submission 17 | name: 5fold-ctr-lightgbm-0.4-seed1119 18 | 19 | seed: 42 20 | -------------------------------------------------------------------------------- /config/sampling.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - models: lightgbm 5 | - override hydra/hydra_logging: disabled 6 | - override hydra/job_logging: disabled 7 | 8 | hydra: 9 | run: 10 | dir: . 
11 | output_subdir: null 12 | 13 | output: 14 | path: output 15 | submission: sample_submission.csv 16 | name: train_sample 17 | 18 | -------------------------------------------------------------------------------- /config/train.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - data: dataset 4 | - generator: features 5 | - models: catboost 6 | - override hydra/hydra_logging: disabled 7 | - override hydra/job_logging: disabled 8 | 9 | hydra: 10 | run: 11 | dir: . 12 | output_subdir: null 13 | 14 | output: 15 | path: output 16 | submission: sample_submission.csv 17 | name: 5fold-ctr-catboost.csv 18 | 19 | seed: 42 20 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: ctr 2 | channels: 3 | - defaults 4 | - conda-forge 5 | - pytorch 6 | dependencies: 7 | - pip 8 | - python=3.10.13 9 | - pip: 10 | - black==24.1.1 11 | - category-encoders==2.6.3 12 | - deepctr-torch==0.2.9 13 | - flake8==7.0.0 14 | - isort==5.13.2 15 | - jupyter==1.0.0 16 | - openpyxl==3.0.10 17 | - catboost==1.2.2 18 | - hydra-core==1.3.0 19 | - holidays==0.24 20 | - lightgbm==3.3.2 21 | - matplotlib==3.5.3 22 | - seaborn==0.11.0 23 | - numpy==1.24.3 24 | - pandas==2.1.4 25 | - plotly==5.18.0 26 | - polars==0.19.19 27 | - prettytable==3.8.0 28 | - pyarrow==16.0.0 29 | - pytorch-tabnet==4.1.0 30 | - pytorch_optimizer==2.12.0 31 | - scikit-learn==1.2.2 32 | - scipy==1.10.1 33 | - torch==2.3.1 34 | - torchaudio==2.3.1 35 | - torchvision==0.18.1 36 | - tqdm==4.65.0 37 | - xgboost==2.0.1 38 | - wandb==0.17.2 -------------------------------------------------------------------------------- /input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-wook/web-ctr-prediction/362c7c940f87b1c6f204764680cbb1bd8139f307/input/.gitkeep -------------------------------------------------------------------------------- /libs/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import * -------------------------------------------------------------------------------- /libs/data/dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import joblib 4 | import pandas as pd 5 | from category_encoders import CountEncoder 6 | from omegaconf import DictConfig 7 | from sklearn.preprocessing import QuantileTransformer 8 | 9 | from libs.generator import FeatureEngineering, LabelEncoder 10 | 11 | 12 | class DataStorage: 13 | def __init__(self, cfg: DictConfig): 14 | self.cfg = cfg 15 | 16 | def _categorize_train_features(self, train_x: pd.DataFrame) -> pd.DataFrame: 17 | """Categorical encoding for train data 18 | Args: 19 | config: config 20 | train: dataframe 21 | Returns: 22 | dataframe 23 | """ 24 | le = LabelEncoder() 25 | train_x[[*self.cfg.generator.cat_features]] = le.fit_transform(train_x[[*self.cfg.generator.cat_features]]) 26 | joblib.dump(le, Path(self.cfg.data.meta) / "label_encoder.pkl") 27 | 28 | return train_x 29 | 30 | def _categorize_test_features(self, test_x: pd.DataFrame) -> pd.DataFrame: 31 | """Categorical encoding for test data 32 | Args: 33 | config: config 34 | test: dataframe 35 | Returns: 36 | dataframe 37 | """ 38 | 39 | le = joblib.load(Path(self.cfg.data.meta) / "label_encoder.pkl") 40 | 
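        # Reuse the encoder fitted on the training data; categories unseen at fit time map to label 0
        # (see LabelEncoder in libs/generator/encoder.py).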
test_x[[*self.cfg.generator.cat_features]] = le.transform(test_x[[*self.cfg.generator.cat_features]]) 41 | 42 | return test_x 43 | 44 | def _count_train_features(self, train_x: pd.DataFrame) -> pd.DataFrame: 45 | """Categorical encoding for train data 46 | Args: 47 | config: config 48 | train: dataframe 49 | Returns: 50 | dataframe 51 | """ 52 | cnt = CountEncoder() 53 | train_enc = cnt.fit_transform(train_x[[*self.cfg.generator.cat_features]]) 54 | train_x = train_x.join(train_enc.add_suffix("_count")) 55 | joblib.dump(cnt, Path(self.cfg.data.meta) / "count_encoder.pkl") 56 | 57 | return train_x 58 | 59 | def _count_test_features(self, test_x: pd.DataFrame) -> pd.DataFrame: 60 | """Categorical encoding for test data 61 | Args: 62 | config: config 63 | test: dataframe 64 | Returns: 65 | dataframe 66 | """ 67 | 68 | cnt = joblib.load(Path(self.cfg.data.meta) / "count_encoder.pkl") 69 | test_enc = cnt.transform(test_x[[*self.cfg.generator.cat_features]]) 70 | test_x = test_x.join(test_enc.add_suffix("_count")) 71 | 72 | return test_x 73 | 74 | def _numerical_train_scaling(self, train: pd.DataFrame) -> pd.DataFrame: 75 | """Numerical scaling 76 | Args: 77 | config: config 78 | train: dataframe 79 | test: dataframe 80 | Returns: 81 | dataframe 82 | """ 83 | scaler = QuantileTransformer(n_quantiles=100, output_distribution="normal") 84 | train[[*self.cfg.generator.num_features]] = scaler.fit_transform(train[[*self.cfg.generator.num_features]]) 85 | joblib.dump(scaler, Path(self.cfg.data.meta) / "rankgauss.pkl") 86 | 87 | return train 88 | 89 | def _numerical_test_scaling(self, test: pd.DataFrame) -> pd.DataFrame: 90 | """Numerical scaling 91 | Args: 92 | config: config 93 | test: dataframe 94 | Returns: 95 | dataframe 96 | """ 97 | scaler = joblib.load(Path(self.cfg.data.meta) / "rankgauss.pkl") 98 | test[[*self.cfg.generator.num_features]] = scaler.transform(test[[*self.cfg.generator.num_features]]) 99 | 100 | return test 101 | 102 | def load_train_dataset(self) -> pd.DataFrame: 103 | train = pd.read_parquet(Path(self.cfg.data.path) / f"{self.cfg.data.train}.parquet") 104 | 105 | feature_engineering = FeatureEngineering(self.cfg) 106 | 107 | if self.cfg.models.name == "lightgbm": 108 | train = self._categorize_train_features(train) 109 | train = feature_engineering.convert_categorical_features(train) 110 | train = self._count_train_features(train) 111 | 112 | elif self.cfg.models.name == "catboost": 113 | train = self._categorize_train_features(train) 114 | train = feature_engineering.reduce_mem_usage(train) 115 | train = self._count_train_features(train) 116 | 117 | else: 118 | train = self._categorize_train_features(train) 119 | train = self._numerical_train_scaling(train) 120 | train = train.fillna(0) 121 | 122 | train = feature_engineering.reduce_mem_usage(train) 123 | 124 | train_x = train.drop(columns=[*self.cfg.generator.drop_features, self.cfg.data.target]) 125 | train_y = train[self.cfg.data.target] 126 | 127 | return train_x, train_y 128 | 129 | def load_test_dataset(self) -> pd.DataFrame: 130 | test = pd.read_parquet(Path(self.cfg.data.path) / "test.parquet") 131 | 132 | feature_engineering = FeatureEngineering(self.cfg) 133 | 134 | if self.cfg.models.name == "lightgbm": 135 | test = self._categorize_test_features(test) 136 | test = feature_engineering.convert_categorical_features(test) 137 | test = self._count_test_features(test) 138 | 139 | elif self.cfg.models.name == "catboost": 140 | test = self._categorize_test_features(test) 141 | test = self._count_test_features(test) 142 | 
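        # Deep CTR models (WDL, xDeepFM, AutoInt, FiBiNET): label-encode the categorical features,
        # apply Gauss Rank scaling to the numeric features, and fill the remaining NaNs with 0.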
143 | else: 144 | test = self._categorize_test_features(test) 145 | test = self._numerical_test_scaling(test) 146 | test = test.fillna(0) 147 | 148 | test = feature_engineering.reduce_mem_usage(test) 149 | 150 | test_x = test.drop(columns=self.cfg.generator.drop_features) 151 | 152 | return test_x 153 | -------------------------------------------------------------------------------- /libs/generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoder import * 2 | from .features import * 3 | -------------------------------------------------------------------------------- /libs/generator/encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator 4 | from typing_extensions import Self 5 | 6 | NAN_INT = 7535805 7 | 8 | 9 | class LabelEncoder(BaseEstimator): 10 | """Label Encoder that groups infrequent values into one label. 11 | 12 | Attributes: 13 | min_obs (int): minimum number of observation to assign a label. 14 | label_encoders (list of dict): label encoders for columns 15 | label_maxes (list of int): maximum of labels for columns 16 | """ 17 | 18 | def __init__(self, min_obs: int = 10): 19 | """Initialize the OneHotEncoder class object. 20 | 21 | Args: 22 | min_obs (int): minimum number of observation to assign a label. 23 | """ 24 | 25 | self.min_obs = min_obs 26 | self.is_fitted = False 27 | 28 | def __repr__(self): 29 | return ("LabelEncoder(min_obs={})").format(self.min_obs) 30 | 31 | def _get_label_encoder_and_max(self, x: pd.Series) -> tuple[dict, int]: 32 | """Return a mapping from values and its maximum of a column to integer labels. 33 | 34 | Args: 35 | x (pandas.Series): a categorical column to encode. 36 | 37 | Returns: 38 | (tuple): 39 | - (dict): mapping from values of features to integers 40 | - (int): maximum label 41 | """ 42 | 43 | # NaN cannot be used as a key for dict. Impute it with a random 44 | # integer. 45 | label_count = x.fillna(NAN_INT).value_counts() 46 | n_uniq = label_count.shape[0] 47 | 48 | label_count = label_count[label_count >= self.min_obs] 49 | n_uniq_new = label_count.shape[0] 50 | 51 | # If every label appears more than min_obs, new label starts from 0. 52 | # Otherwise, new label starts from 1 and 0 is used for all old labels 53 | # that appear less than min_obs. 54 | offset = 0 if n_uniq == n_uniq_new else 1 55 | 56 | label_encoder = pd.Series(np.arange(n_uniq_new) + offset, index=label_count.index) 57 | max_label = label_encoder.max() 58 | label_encoder = label_encoder.to_dict() 59 | 60 | return label_encoder, max_label 61 | 62 | def _transform_col(self, x: pd.Series, i: int) -> pd.Series: 63 | """Encode one categorical column into labels. 64 | 65 | Args: 66 | x (pandas.Series): a categorical column to encode 67 | i (int): column index 68 | 69 | Returns: 70 | (pandas.Series): a column with labels. 
71 | """ 72 | return x.fillna(NAN_INT).map(self.label_encoders[i]).fillna(0).astype(int) 73 | 74 | def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> Self: 75 | self.label_encoders = [None] * X.shape[1] 76 | self.label_maxes = [None] * X.shape[1] 77 | 78 | for i, col in enumerate(X.columns): 79 | ( 80 | self.label_encoders[i], 81 | self.label_maxes[i], 82 | ) = self._get_label_encoder_and_max(X[col]) 83 | 84 | self.is_fitted = True 85 | return self 86 | 87 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 88 | """Encode categorical columns into label encoded columns 89 | 90 | Args: 91 | X (pandas.DataFrame): categorical columns to encode 92 | 93 | Returns: 94 | (pandas.DataFrame): label encoded columns 95 | """ 96 | 97 | assert self.is_fitted, "fit() or fit_transform() must be called before transform()." 98 | 99 | X = X.copy() 100 | for i, col in enumerate(X.columns): 101 | X.loc[:, col] = self._transform_col(X[col], i) 102 | 103 | return X 104 | 105 | def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame: 106 | """Encode categorical columns into label encoded columns 107 | 108 | Args: 109 | X (pandas.DataFrame): categorical columns to encode 110 | 111 | Returns: 112 | (pandas.DataFrame): label encoded columns 113 | """ 114 | 115 | self.label_encoders = [None] * X.shape[1] 116 | self.label_maxes = [None] * X.shape[1] 117 | 118 | X = X.copy() 119 | for i, col in enumerate(X.columns): 120 | ( 121 | self.label_encoders[i], 122 | self.label_maxes[i], 123 | ) = self._get_label_encoder_and_max(X[col]) 124 | 125 | X.loc[:, col] = X[col].fillna(NAN_INT).map(self.label_encoders[i]).fillna(0).astype(int) 126 | 127 | self.is_fitted = True 128 | return X 129 | -------------------------------------------------------------------------------- /libs/generator/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from omegaconf import DictConfig 4 | from tqdm import tqdm 5 | 6 | 7 | class FeatureEngineering: 8 | def __init__(self, cfg: DictConfig): 9 | self.cfg = cfg 10 | 11 | def convert_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame: 12 | with tqdm(total=len(self.cfg.generator.cat_features), desc="Convert features") as pbar: 13 | # Convert to category type 14 | for col in self.cfg.generator.cat_features: 15 | df[col] = df[col].astype("category") 16 | pbar.update(1) 17 | 18 | return df 19 | 20 | def reduce_mem_usage(self, df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame: 21 | """ 22 | Iterate through all the columns of a dataframe and modify the data type to reduce memory usage. 
23 | """ 24 | numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] 25 | start_mem = df.memory_usage().sum() / 1024**2 26 | 27 | for col in df.columns: 28 | col_type = df[col].dtypes 29 | if col_type in numerics: 30 | c_min = df[col].min() 31 | c_max = df[col].max() 32 | 33 | if str(col_type)[:3] == "int": 34 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 35 | df[col] = df[col].astype(np.int8) 36 | 37 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 38 | df[col] = df[col].astype(np.int16) 39 | 40 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 41 | df[col] = df[col].astype(np.int32) 42 | 43 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 44 | df[col] = df[col].astype(np.int64) 45 | else: 46 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 47 | df[col] = df[col].astype(np.float16) 48 | 49 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 50 | df[col] = df[col].astype(np.float32) 51 | 52 | else: 53 | df[col] = df[col].astype(np.float64) 54 | 55 | end_mem = df.memory_usage().sum() / 1024**2 56 | 57 | if verbose: 58 | print( 59 | f"Mem. usage decreased to {end_mem:5.2f} Mb " 60 | + f"({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)" 61 | ) 62 | 63 | return df 64 | -------------------------------------------------------------------------------- /libs/models/__init__.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | 3 | from .base import * 4 | from .boosting import * 5 | from .boosting import CatBoostTrainer, LightGBMTrainer, XGBoostTrainer 6 | from .dlrm import AutoIntTrainer, FiBiNetTranier, WDLTrainer, XDeepFMTrainer 7 | 8 | BulidModel = ( 9 | CatBoostTrainer 10 | | LightGBMTrainer 11 | | XGBoostTrainer 12 | | WDLTrainer 13 | | FiBiNetTranier 14 | | XDeepFMTrainer 15 | | AutoIntTrainer 16 | ) 17 | 18 | 19 | def build_model(cfg: DictConfig) -> BulidModel: 20 | model_type = { 21 | "lightgbm": LightGBMTrainer(cfg), 22 | "xgboost": XGBoostTrainer(cfg), 23 | "catboost": CatBoostTrainer(cfg), 24 | "wdl": WDLTrainer(cfg), 25 | "fibinet": FiBiNetTranier(cfg), 26 | "xdeepfm": XDeepFMTrainer(cfg), 27 | "autoint": AutoIntTrainer(cfg), 28 | } 29 | 30 | if trainer := model_type.get(cfg.models.name): 31 | return trainer 32 | 33 | else: 34 | raise NotImplementedError(f"Model '{cfg.models.name}' is not implemented.") 35 | -------------------------------------------------------------------------------- /libs/models/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import gc 4 | from abc import ABC, abstractmethod 5 | from dataclasses import dataclass 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | import joblib 10 | import lightgbm as lgb 11 | import numpy as np 12 | import pandas as pd 13 | import wandb 14 | import xgboost as xgb 15 | from catboost import CatBoostClassifier 16 | from deepctr_torch.models import WDL, AutoInt, FiBiNET, xDeepFM 17 | from omegaconf import DictConfig 18 | from sklearn.metrics import roc_auc_score 19 | from sklearn.model_selection import StratifiedKFold 20 | from typing_extensions import Self 21 | 22 | 23 | @dataclass 24 | class ModelResult: 25 | oof_preds: np.ndarray 26 | models: dict[str, Any] 27 | 28 | 29 | class BaseModel(ABC): 30 | def __init__(self, cfg: DictConfig): 31 | self.cfg = cfg 32 | 33 | @abstractmethod 34 | 
def _fit( 35 | self, 36 | X_train: pd.DataFrame | np.ndarray, 37 | y_train: pd.Series | np.ndarray, 38 | X_valid: pd.DataFrame | np.ndarray | None = None, 39 | y_valid: pd.Series | np.ndarray | None = None, 40 | ): 41 | raise NotImplementedError 42 | 43 | def save_model(self, save_dir: Path) -> None: 44 | joblib.dump(self.result, save_dir / f"{self.cfg.models.results}.pkl") 45 | 46 | def fit( 47 | self, 48 | X_train: pd.DataFrame | np.ndarray, 49 | y_train: pd.Series | np.ndarray, 50 | X_valid: pd.DataFrame | np.ndarray | None = None, 51 | y_valid: pd.Series | np.ndarray | None = None, 52 | ) -> Any: 53 | model = self._fit(X_train, y_train, X_valid, y_valid) 54 | 55 | return model 56 | 57 | def _predict(self, model: Any, X: pd.DataFrame | np.ndarray) -> np.ndarray: 58 | if isinstance(model, xgb.Booster): 59 | return model.predict(xgb.DMatrix(X)) 60 | 61 | elif isinstance(model, CatBoostClassifier): 62 | return model.predict_proba(X)[:, 1] 63 | 64 | elif isinstance(model, WDL | xDeepFM | AutoInt | FiBiNET): 65 | feature_names = [*self.cfg.generator.sparse_features, *self.cfg.generator.dense_features] 66 | valid_model_input = {name: X[name] for name in feature_names} 67 | 68 | return model.predict(valid_model_input, batch_size=512).flatten() 69 | 70 | elif isinstance(model, lgb.Booster): 71 | return model.predict(X) 72 | 73 | else: 74 | raise ValueError("Model not supported") 75 | 76 | def run_cv_training(self, X: pd.DataFrame, y: pd.Series) -> Self: 77 | oof_preds = np.zeros(X.shape[0]) 78 | models = {} 79 | kfold = StratifiedKFold(n_splits=self.cfg.data.n_splits, shuffle=True, random_state=self.cfg.data.seed) 80 | 81 | for fold, (train_idx, valid_idx) in enumerate(iterable=kfold.split(X, y), start=1): 82 | with wandb.init(project="competition", name=f"{self.cfg.models.name}-fold-{fold}", dir="never"): 83 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 84 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 85 | 86 | model = self.fit(X_train, y_train, X_valid, y_valid) 87 | oof_preds[valid_idx] = self._predict(model, X_valid) 88 | 89 | models[f"fold_{fold}"] = model 90 | 91 | del model, X_train, X_valid, y_train, y_valid 92 | gc.collect() 93 | 94 | self.result = ModelResult(oof_preds=oof_preds, models=models) 95 | 96 | print(f"CV Score: {roc_auc_score(y, oof_preds):.6f}") 97 | 98 | del oof_preds, y, models 99 | gc.collect() 100 | 101 | return self 102 | -------------------------------------------------------------------------------- /libs/models/boosting.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | import pandas as pd 6 | import wandb.integration.catboost as wandb_cb 7 | import wandb.integration.lightgbm as wandb_lgb 8 | import wandb.integration.xgboost as wandb_xgb 9 | import xgboost as xgb 10 | from catboost import CatBoostClassifier, Pool 11 | from omegaconf import DictConfig, OmegaConf 12 | 13 | from libs.models import BaseModel 14 | 15 | 16 | class XGBoostTrainer(BaseModel): 17 | def __init__(self, cfg: DictConfig): 18 | super().__init__(cfg) 19 | 20 | def _fit( 21 | self, 22 | X_train: pd.DataFrame | np.ndarray, 23 | y_train: pd.Series | np.ndarray, 24 | X_valid: pd.DataFrame | np.ndarray | None = None, 25 | y_valid: pd.Series | np.ndarray | None = None, 26 | ) -> xgb.Booster: 27 | dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True) 28 | dvalid = xgb.DMatrix(X_valid, y_valid, enable_categorical=True) 29 | 30 | params = 
OmegaConf.to_container(self.cfg.models.params) 31 | params["seed"] = self.cfg.models.seed 32 | 33 | model = xgb.train( 34 | params=params, 35 | dtrain=dtrain, 36 | evals=[(dtrain, "train"), (dvalid, "eval")], 37 | num_boost_round=self.cfg.models.num_boost_round, 38 | early_stopping_rounds=self.cfg.models.early_stopping_rounds, 39 | verbose_eval=self.cfg.models.verbose_eval, 40 | callbacks=[wandb_xgb.WandbCallback()], 41 | ) 42 | 43 | return model 44 | 45 | 46 | class CatBoostTrainer(BaseModel): 47 | def __init__(self, cfg: DictConfig): 48 | super().__init__(cfg) 49 | 50 | def _fit( 51 | self, 52 | X_train: pd.DataFrame | np.ndarray, 53 | y_train: pd.Series | np.ndarray, 54 | X_valid: pd.DataFrame | np.ndarray | None = None, 55 | y_valid: pd.Series | np.ndarray | None = None, 56 | ) -> CatBoostClassifier: 57 | train_set = Pool(X_train, y_train) 58 | valid_set = Pool(X_valid, y_valid) 59 | 60 | params = OmegaConf.to_container(self.cfg.models.params) 61 | model = CatBoostClassifier(random_state=self.cfg.models.seed, **params) 62 | 63 | model.fit( 64 | train_set, 65 | eval_set=valid_set, 66 | verbose_eval=self.cfg.models.verbose_eval, 67 | early_stopping_rounds=self.cfg.models.early_stopping_rounds, 68 | callbacks=[wandb_cb.WandbCallback()], 69 | ) 70 | 71 | wandb_cb.log_summary(model) 72 | 73 | return model 74 | 75 | 76 | class LightGBMTrainer(BaseModel): 77 | def __init__(self, cfg: DictConfig): 78 | super().__init__(cfg) 79 | 80 | def _fit( 81 | self, 82 | X_train: pd.DataFrame | np.ndarray, 83 | y_train: pd.Series | np.ndarray, 84 | X_valid: pd.DataFrame | np.ndarray | None = None, 85 | y_valid: pd.Series | np.ndarray | None = None, 86 | ) -> lgb.Booster: 87 | train_set = lgb.Dataset(X_train, y_train, categorical_feature=self.cfg.generator.cat_features) 88 | valid_set = lgb.Dataset(X_valid, y_valid, categorical_feature=self.cfg.generator.cat_features) 89 | 90 | params = OmegaConf.to_container(self.cfg.models.params) 91 | params["seed"] = self.cfg.models.seed 92 | 93 | model = lgb.train( 94 | params=params, 95 | train_set=train_set, 96 | valid_sets=[train_set, valid_set], 97 | num_boost_round=self.cfg.models.num_boost_round, 98 | categorical_feature=[*self.cfg.generator.cat_features], 99 | callbacks=[ 100 | lgb.log_evaluation(self.cfg.models.verbose_eval), 101 | lgb.early_stopping(self.cfg.models.early_stopping_rounds), 102 | wandb_lgb.wandb_callback(), 103 | ], 104 | ) 105 | 106 | wandb_lgb.log_summary(model) 107 | 108 | return model 109 | -------------------------------------------------------------------------------- /libs/models/dlrm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from deepctr_torch.callbacks import EarlyStopping 4 | from deepctr_torch.inputs import DenseFeat, SparseFeat, get_feature_names 5 | from deepctr_torch.models import WDL, AutoInt, FiBiNET, xDeepFM 6 | from omegaconf import DictConfig 7 | from pytorch_optimizer import MADGRAD 8 | 9 | from libs.models import BaseModel 10 | 11 | 12 | class WDLTrainer(BaseModel): 13 | def __init__(self, cfg: DictConfig): 14 | super().__init__(cfg) 15 | 16 | def _fit( 17 | self, 18 | X_train: pd.DataFrame | np.ndarray, 19 | y_train: pd.Series | np.ndarray, 20 | X_valid: pd.DataFrame | np.ndarray | None = None, 21 | y_valid: pd.Series | np.ndarray | None = None, 22 | ) -> WDL: 23 | 24 | feature_columns = [ 25 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 26 | for feat in self.cfg.generator.sparse_features 27 | ] + 
[DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 28 | 29 | feature_names = get_feature_names(feature_columns) 30 | 31 | train_model_input = {name: X_train[name] for name in feature_names} 32 | valid_model_input = {name: X_valid[name] for name in feature_names} 33 | 34 | model = WDL( 35 | dnn_feature_columns=feature_columns, 36 | linear_feature_columns=feature_columns, 37 | device=self.cfg.models.device, 38 | gpus=self.cfg.models.gpus, 39 | seed=self.cfg.models.seed, 40 | l2_reg_linear=self.cfg.models.l2_reg_linear, 41 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 42 | dnn_activation=self.cfg.models.dnn_activation, 43 | dnn_dropout=self.cfg.models.dnn_dropout, 44 | dnn_use_bn=True, 45 | ) 46 | 47 | model.compile( 48 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 49 | "binary_crossentropy", 50 | metrics=["binary_crossentropy", "auc"], 51 | ) 52 | 53 | es = EarlyStopping( 54 | monitor="val_auc", 55 | min_delta=0, 56 | verbose=self.cfg.models.verbose, 57 | patience=self.cfg.models.patience, 58 | mode=self.cfg.models.mode, 59 | ) 60 | 61 | model.fit( 62 | train_model_input, 63 | y_train.values, 64 | batch_size=self.cfg.models.batch_size, 65 | epochs=self.cfg.models.epochs, 66 | verbose=self.cfg.models.verbose, 67 | validation_data=(valid_model_input, y_valid.values), 68 | callbacks=[es], 69 | ) 70 | 71 | return model 72 | 73 | 74 | class XDeepFMTrainer(BaseModel): 75 | def __init__(self, cfg: DictConfig): 76 | super().__init__(cfg) 77 | 78 | def _fit( 79 | self, 80 | X_train: pd.DataFrame | np.ndarray, 81 | y_train: pd.Series | np.ndarray, 82 | X_valid: pd.DataFrame | np.ndarray | None = None, 83 | y_valid: pd.Series | np.ndarray | None = None, 84 | ) -> xDeepFM: 85 | 86 | feature_columns = [ 87 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 88 | for feat in self.cfg.generator.sparse_features 89 | ] + [DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 90 | 91 | feature_names = get_feature_names(feature_columns) 92 | 93 | train_model_input = {name: X_train[name] for name in feature_names} 94 | valid_model_input = {name: X_valid[name] for name in feature_names} 95 | 96 | model = xDeepFM( 97 | dnn_feature_columns=feature_columns, 98 | linear_feature_columns=feature_columns, 99 | device=self.cfg.models.device, 100 | gpus=self.cfg.models.gpus, 101 | seed=self.cfg.models.seed, 102 | l2_reg_linear=self.cfg.models.l2_reg_linear, 103 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 104 | dnn_activation=self.cfg.models.dnn_activation, 105 | dnn_dropout=self.cfg.models.dnn_dropout, 106 | dnn_use_bn=True, 107 | ) 108 | 109 | model.compile( 110 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 111 | "binary_crossentropy", 112 | metrics=["binary_crossentropy", "auc"], 113 | ) 114 | 115 | es = EarlyStopping( 116 | monitor="val_auc", 117 | min_delta=0, 118 | verbose=self.cfg.models.verbose, 119 | patience=self.cfg.models.patience, 120 | mode=self.cfg.models.mode, 121 | ) 122 | 123 | model.fit( 124 | train_model_input, 125 | y_train.values, 126 | batch_size=self.cfg.models.batch_size, 127 | epochs=self.cfg.models.epochs, 128 | verbose=self.cfg.models.verbose, 129 | validation_data=(valid_model_input, y_valid.values), 130 | callbacks=[es], 131 | ) 132 | 133 | return model 134 | 135 | 136 | class FiBiNetTranier(BaseModel): 137 | def __init__(self, cfg: DictConfig): 138 | super().__init__(cfg) 139 | 140 | def _fit( 141 | self, 142 | X_train: pd.DataFrame | np.ndarray, 143 | y_train: pd.Series | np.ndarray, 144 | X_valid: 
pd.DataFrame | np.ndarray | None = None, 145 | y_valid: pd.Series | np.ndarray | None = None, 146 | ) -> FiBiNET: 147 | 148 | feature_columns = [ 149 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 150 | for feat in self.cfg.generator.sparse_features 151 | ] + [DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 152 | 153 | feature_names = get_feature_names(feature_columns) 154 | 155 | train_model_input = {name: X_train[name] for name in feature_names} 156 | valid_model_input = {name: X_valid[name] for name in feature_names} 157 | 158 | model = FiBiNET( 159 | dnn_feature_columns=feature_columns, 160 | linear_feature_columns=feature_columns, 161 | device=self.cfg.models.device, 162 | gpus=self.cfg.models.gpus, 163 | seed=self.cfg.models.seed, 164 | l2_reg_linear=self.cfg.models.l2_reg_linear, 165 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 166 | dnn_activation=self.cfg.models.dnn_activation, 167 | dnn_dropout=self.cfg.models.dnn_dropout, 168 | ) 169 | 170 | model.compile( 171 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 172 | "binary_crossentropy", 173 | metrics=["binary_crossentropy", "auc"], 174 | ) 175 | 176 | es = EarlyStopping( 177 | monitor="val_auc", 178 | min_delta=0, 179 | verbose=self.cfg.models.verbose, 180 | patience=self.cfg.models.patience, 181 | mode=self.cfg.models.mode, 182 | ) 183 | 184 | model.fit( 185 | train_model_input, 186 | y_train.values, 187 | batch_size=self.cfg.models.batch_size, 188 | epochs=self.cfg.models.epochs, 189 | verbose=self.cfg.models.verbose, 190 | validation_data=(valid_model_input, y_valid.values), 191 | callbacks=[es], 192 | ) 193 | 194 | return model 195 | 196 | 197 | class AutoIntTrainer(BaseModel): 198 | def __init__(self, cfg: DictConfig): 199 | super().__init__(cfg) 200 | 201 | def _fit( 202 | self, 203 | X_train: pd.DataFrame | np.ndarray, 204 | y_train: pd.Series | np.ndarray, 205 | X_valid: pd.DataFrame | np.ndarray | None = None, 206 | y_valid: pd.Series | np.ndarray | None = None, 207 | ) -> AutoInt: 208 | 209 | feature_columns = [ 210 | SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=16) 211 | for feat in self.cfg.generator.sparse_features 212 | ] + [DenseFeat(feat, 1) for feat in self.cfg.generator.dense_features] 213 | 214 | feature_names = get_feature_names(feature_columns) 215 | 216 | train_model_input = {name: X_train[name] for name in feature_names} 217 | valid_model_input = {name: X_valid[name] for name in feature_names} 218 | 219 | model = AutoInt( 220 | dnn_feature_columns=feature_columns, 221 | linear_feature_columns=feature_columns, 222 | device=self.cfg.models.device, 223 | gpus=self.cfg.models.gpus, 224 | seed=self.cfg.models.seed, 225 | l2_reg_embedding=self.cfg.models.l2_reg_embedding, 226 | dnn_activation=self.cfg.models.dnn_activation, 227 | dnn_dropout=self.cfg.models.dnn_dropout, 228 | dnn_use_bn=True, 229 | ) 230 | 231 | model.compile( 232 | MADGRAD(model.parameters(), lr=self.cfg.models.lr), 233 | "binary_crossentropy", 234 | metrics=["binary_crossentropy", "auc"], 235 | ) 236 | 237 | es = EarlyStopping( 238 | monitor="val_auc", 239 | min_delta=0, 240 | verbose=self.cfg.models.verbose, 241 | patience=self.cfg.models.patience, 242 | mode=self.cfg.models.mode, 243 | ) 244 | 245 | model.fit( 246 | train_model_input, 247 | y_train.values, 248 | batch_size=self.cfg.models.batch_size, 249 | epochs=self.cfg.models.epochs, 250 | verbose=self.cfg.models.verbose, 251 | validation_data=(valid_model_input, y_valid.values), 252 | callbacks=[es], 
253 | ) 254 | 255 | return model 256 | -------------------------------------------------------------------------------- /libs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utility import * 2 | -------------------------------------------------------------------------------- /libs/utils/utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | def seed_everything(seed: int = 42) -> None: 9 | os.environ["PYTHONHASHSEED"] = str(seed) 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | torch.cuda.manual_seed(seed) 14 | torch.cuda.manual_seed_all(seed) 15 | torch.backends.cudnn.deterministic = True 16 | torch.backends.cudnn.benchmark = False 17 | -------------------------------------------------------------------------------- /notebook/eda.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | train = pd.read_parquet("../input/web-ctr-prediction/train.parquet") 8 | 9 | # %% 10 | print(train.shape) 11 | 12 | # %% 13 | train["Click"].value_counts(normalize=True) 14 | 15 | # %% 16 | sns.countplot(data=train, x="Click") 17 | plt.show() 18 | # %% 19 | train.head() 20 | # %% 21 | train.info() 22 | 23 | # %% 24 | cat_features = train.dtypes[train.dtypes == "object"].index.tolist() 25 | cat_features 26 | # %% 27 | num_features = train.dtypes[train.dtypes != "object"].index.tolist() 28 | # %% 29 | num_features 30 | # %% 31 | sns.histplot(data=train, x="F04", bins=100) 32 | plt.show() 33 | # %% 34 | for col in num_features: 35 | sns.histplot(data=train, x=col, bins=100) 36 | plt.show() 37 | 38 | # %% 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | multi_line_output = 3 4 | include_trailing_comma = true 5 | force_grid_wrap = 0 6 | use_parentheses = true 7 | ensure_newline_before_comments = true 8 | line_length = 120 9 | 10 | [tool.black] 11 | line-length = 120 12 | target-version = ['py36', 'py37', 'py38'] 13 | exclude = ''' 14 | \.git 15 | | \.mypy_cache 16 | | \.tox 17 | | venv 18 | | \.venv 19 | | _build 20 | | buck-out 21 | | build 22 | | dist 23 | | ^.*\b(migrations)\b.*$ 24 | ''' -------------------------------------------------------------------------------- /res/meta/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-wook/web-ctr-prediction/362c7c940f87b1c6f204764680cbb1bd8139f307/res/meta/.gitkeep -------------------------------------------------------------------------------- /res/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-wook/web-ctr-prediction/362c7c940f87b1c6f204764680cbb1bd8139f307/res/models/.gitkeep -------------------------------------------------------------------------------- /scripts/covert_to_parquet.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import hydra 6 | import pandas as pd 7 | from omegaconf import DictConfig 8 | from tqdm import tqdm 9 | 10 | 11 | 
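# Convert the raw competition CSVs to parquet: train.csv is read in 1,000,000-row chunks and
# concatenated before being written out, and test.csv is converted directly.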
@hydra.main(config_path="../config/", config_name="train", version_base="1.2.0") 12 | def _main(cfg: DictConfig): 13 | train = pd.DataFrame() 14 | 15 | for chunk in tqdm(pd.read_csv(Path(cfg.data.path) / "train.csv", chunksize=1000000)): 16 | train = pd.concat([train, chunk]) 17 | 18 | train.to_parquet(Path(cfg.data.path) / "train.parquet") 19 | 20 | test = pd.read_csv(Path(cfg.data.path) / "test.csv") 21 | test.to_parquet(Path(cfg.data.path) / "test.parquet") 22 | 23 | 24 | if __name__ == "__main__": 25 | _main() 26 | -------------------------------------------------------------------------------- /scripts/ensemble.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import hydra 6 | import numpy as np 7 | import pandas as pd 8 | from omegaconf import DictConfig 9 | from scipy.stats import rankdata 10 | from tqdm import tqdm 11 | 12 | 13 | def ensemble_predictions(predictions: list[np.ndarray], weights: list[float], method: str = "linear") -> np.ndarray: 14 | assert np.isclose(np.sum(weights), 1.0) 15 | if method == "linear": 16 | res = np.average(predictions, weights=weights, axis=0) 17 | 18 | elif method == "harmonic": 19 | res = np.average([1 / p for p in predictions], weights=weights, axis=0) 20 | return 1 / res 21 | 22 | elif method == "geometric": 23 | numerator = np.average([np.log(p) for p in predictions], weights=weights, axis=0) 24 | res = np.exp(numerator / sum(weights)) 25 | return res 26 | 27 | elif method == "rank": 28 | res = np.average([rankdata(p) for p in predictions], weights=weights, axis=0) 29 | return res / (len(res) + 1) 30 | 31 | elif method == "sigmoid": 32 | logit_values = np.log(predictions / (1 - predictions)) 33 | result = np.average(logit_values, weights=weights, axis=0) 34 | return 1 / (1 + np.exp(-result)) 35 | 36 | else: 37 | raise ValueError(f"Unknown ensemble method: {method}") 38 | 39 | return res 40 | 41 | 42 | def calculate_sigmoid_preds(values: list[np.ndarray]) -> np.ndarray: 43 | """ 44 | Calculate the sigmoid result of the ensemble predictions. 
45 |     :param values: list of predictions, each containing probabilities in (0, 1)
46 |     :return: ensemble prediction, computed as the element-wise sigmoid of
47 |         the mean logit across all predictions (no per-model weights are applied)
48 |     """
49 |     values = np.array(values)
50 | 
51 |     logit_values = np.log(values / (1 - values))
52 |     result = np.mean(logit_values, axis=0)
53 | 
54 |     return 1 / (1 + np.exp(-result))
55 | 
56 | 
57 | @hydra.main(config_path="../config/", config_name="ensemble", version_base="1.3.1")
58 | def _main(cfg: DictConfig):
59 |     # Load submission file
60 |     submit = pd.read_csv(Path(cfg.data.path) / f"{cfg.data.submit}.csv")
61 | 
62 |     # Load the per-model prediction files listed in cfg.preds
63 |     preds = [
64 |         pd.read_csv(Path(cfg.output.path) / f"{pred}.csv")[cfg.data.target].to_numpy()
65 |         for pred in tqdm(cfg.preds, desc="Loading predictions", colour="red", total=len(cfg.preds))
66 |     ]
67 | 
68 |     # Blend the predictions in logit space
69 |     submit[cfg.data.target] = calculate_sigmoid_preds(preds)
70 | 
71 |     # Save the ensembled submission
72 |     submit.to_csv(Path(cfg.output.path) / f"{cfg.output.name}.csv", index=False)
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     _main()
77 | 
--------------------------------------------------------------------------------
/scripts/predict.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from pathlib import Path
4 | 
5 | import hydra
6 | import joblib
7 | import lightgbm as lgb
8 | import numpy as np
9 | import pandas as pd
10 | import xgboost as xgb
11 | from catboost import CatBoostClassifier
12 | from deepctr_torch.models import WDL, AutoInt, FiBiNET, xDeepFM
13 | from omegaconf import DictConfig
14 | from tqdm import tqdm
15 | 
16 | from libs.data import DataStorage
17 | 
18 | 
19 | def inference_models(cfg: DictConfig, test_x: pd.DataFrame | dict[str, pd.Series]) -> np.ndarray:
20 |     """Average the saved cross-validation models' predicted click probabilities on the test set.
21 |     Args:
22 |         cfg: Hydra config pointing to the saved fold models and feature lists
23 |         test_x: test dataframe (or dict of feature arrays for the DeepCTR models)
24 |     Returns:
25 |         click probabilities averaged over the cross-validation folds
26 |     """
27 |     # load the saved cross-validation models
28 |     results = joblib.load(Path(cfg.models.path) / f"{cfg.models.results}.pkl")
29 |     folds = len(results.models)
30 |     preds = np.zeros((test_x.shape[0],))
31 | 
32 |     for model in tqdm(results.models.values(), total=folds, desc="Predicting models", colour="blue"):
33 |         if isinstance(model, lgb.Booster):
34 |             preds += model.predict(test_x) / folds
35 | 
36 |         elif isinstance(model, xgb.Booster):
37 |             preds += model.predict(xgb.DMatrix(test_x)) / folds
38 | 
39 |         elif isinstance(model, (WDL, xDeepFM, AutoInt, FiBiNET)):
40 |             test_model_input = {
41 |                 name: test_x[name] for name in [*cfg.generator.sparse_features, *cfg.generator.dense_features]
42 |             }
43 |             preds += model.predict(test_model_input, batch_size=1024).flatten() / folds
44 | 
45 |         elif isinstance(model, CatBoostClassifier):
46 |             preds += model.predict_proba(test_x)[:, 1] / folds
47 | 
48 |         else:
49 |             raise ValueError(f"Model {model} not supported")
50 | 
51 |     return preds
52 | 
53 | 
54 | @hydra.main(config_path="../config/", config_name="predict", version_base="1.3.1")
55 | def _main(cfg: DictConfig):
56 | 
57 |     # load test dataset
58 |     data_storage = DataStorage(cfg)
59 |     test_x = data_storage.load_test_dataset()
60 | 
61 |     # load submit dataset
62 |     submit = pd.read_csv(Path(cfg.data.path) / f"{cfg.data.submit}.csv")
63 | 
64 |     # predict
65 |     preds = inference_models(cfg, test_x)
66 |     submit[cfg.data.target] = preds
67 |     submit.to_csv(Path(cfg.output.path) / f"{cfg.models.results}.csv", index=False)
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     _main()
72 | 
--------------------------------------------------------------------------------
/scripts/sampling.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import gc
4 | from pathlib import Path
5 | 
6 | import hydra
7 | import pandas as pd
8 | import pyarrow.parquet as pq
9 | from omegaconf import DictConfig
10 | from tqdm import tqdm
11 | 
12 | 
13 | def negative_sampling_train_dataset(cfg: DictConfig) -> pd.DataFrame:
14 |     pfile = pq.ParquetFile(Path(cfg.data.path) / "train.parquet")
15 | 
16 |     negative = pd.DataFrame()
17 |     positive = pd.DataFrame()
18 |     chunksize = 10**7
19 | 
20 |     for chunk in tqdm(pfile.iter_batches(batch_size=chunksize), desc="Sampling data", leave=False):
21 |         chunk = chunk.to_pandas()
22 |         positive_sample = chunk[chunk["Click"] == 1]
23 |         negative_sample = chunk[chunk["Click"] == 0]
24 |         negative = pd.concat([negative, negative_sample], axis=0, ignore_index=True)
25 |         positive = pd.concat([positive, positive_sample], axis=0, ignore_index=True)
26 | 
27 |     # keep every clicked row and only a cfg.data.sampling fraction of the non-clicked rows
28 |     negative_sample = negative.sample(frac=cfg.data.sampling, replace=False, random_state=cfg.data.seed)
29 |     train = pd.concat([negative_sample, positive], axis=0, ignore_index=True)
30 |     del negative, positive
31 | 
32 |     return train
33 | 
34 | 
35 | @hydra.main(config_path="../config/", config_name="sampling", version_base="1.2.0")
36 | def _main(cfg: DictConfig):
37 |     # load dataset
38 |     train = negative_sampling_train_dataset(cfg)
39 | 
40 |     # save dataset
41 |     train.to_parquet(Path(cfg.data.path) / f"{cfg.data.train}.parquet")
42 | 
43 |     del train
44 |     gc.collect()
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     _main()
49 | 
--------------------------------------------------------------------------------
/scripts/shell/cb_experiment.sh:
--------------------------------------------------------------------------------
1 | MODEL_NAME="catboost"
2 | SAMPLING=0.45
3 | 
4 | for seed in 517 1119
5 | do
6 |     python -m scripts.train \
7 |         data.train=train_sample_${SAMPLING}_seed${seed} \
8 |         models=${MODEL_NAME} \
9 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
10 | 
11 |     python -m scripts.predict \
12 |         models=${MODEL_NAME} \
13 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
14 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
15 | done
--------------------------------------------------------------------------------
/scripts/shell/fibinet_experiment.sh:
--------------------------------------------------------------------------------
1 | MODEL_NAME="fibinet"
2 | SAMPLING=0.45
3 | 
4 | for seed in 517 1119
5 | do
6 |     python -m scripts.train \
7 |         data.train=train_sample_${SAMPLING}_seed${seed} \
8 |         models=${MODEL_NAME} \
9 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
10 | 
11 |     python -m scripts.predict \
12 |         models=${MODEL_NAME} \
13 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
14 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
15 | done
16 | 
17 | SAMPLING=0.4
18 | 
19 | for seed in 414 602
20 | do
21 |     python -m scripts.train \
22 |         data.train=train_sample_${SAMPLING}_seed${seed} \
23 |         models=${MODEL_NAME} \
24 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
25 | 
26 |     python -m scripts.predict \
27 |         models=${MODEL_NAME} \
28 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
29 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
30 | done
--------------------------------------------------------------------------------
/scripts/shell/lgb_experiment.sh:
--------------------------------------------------------------------------------
1 | MODEL_NAME="lightgbm"
2 | SAMPLING=0.45
3 | 
4 | for seed in 517 1119
5 | do
6 |     python -m scripts.train \
7 |         data.train=train_sample_${SAMPLING}_seed${seed} \
8 |         models=${MODEL_NAME} \
9 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
10 | 
11 |     python -m scripts.predict \
12 |         models=${MODEL_NAME} \
13 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
14 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
15 | done
16 | 
17 | SAMPLING=0.4
18 | 
19 | for seed in 414 602
20 | do
21 |     python -m scripts.train \
22 |         data.train=train_sample_${SAMPLING}_seed${seed} \
23 |         models=${MODEL_NAME} \
24 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
25 | 
26 |     python -m scripts.predict \
27 |         models=${MODEL_NAME} \
28 |         models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \
29 |         output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed}
30 | done
31 | 
--------------------------------------------------------------------------------
/scripts/shell/run.sh:
--------------------------------------------------------------------------------
1 | python -m scripts.covert_to_parquet
2 | sh scripts/shell/sampling_dataset.sh
3 | sh scripts/shell/lgb_experiment.sh
4 | sh scripts/shell/cb_experiment.sh
5 | sh scripts/shell/xdeepfm_experiment.sh
6 | sh scripts/shell/fibinet_experiment.sh
7 | python -m scripts.ensemble
8 | 
--------------------------------------------------------------------------------
/scripts/shell/sampling_dataset.sh:
--------------------------------------------------------------------------------
1 | for sampling in 0.4 0.45
2 | do
3 |     for seed in 517 1119
4 |     do
5 |         python -m scripts.sampling \
6 | 
data.seed=${seed} \ 7 | data.sampling=${sampling} \ 8 | data.train=train_sample_${sampling}_seed${seed} 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /scripts/shell/wdl_experiment.sh: -------------------------------------------------------------------------------- 1 | MODEL_NAME="wdl" 2 | SAMPLING=0.45 3 | 4 | for seed in 517 1119 5 | do 6 | python -m scripts.train \ 7 | data.train=train_sample_${SAMPLING}_seed${seed} \ 8 | models=${MODEL_NAME} \ 9 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 10 | 11 | python -m scripts.predict \ 12 | models=${MODEL_NAME} \ 13 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \ 14 | output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 15 | done -------------------------------------------------------------------------------- /scripts/shell/xdeepfm_experiment.sh: -------------------------------------------------------------------------------- 1 | export PYTHONHASHSEED=0 2 | 3 | MODEL_NAME="xdeepfm" 4 | SAMPLING=0.45 5 | 6 | for seed in 517 1119 7 | do 8 | python -m scripts.train \ 9 | data.train=train_sample_${SAMPLING}_seed${seed} \ 10 | models=${MODEL_NAME} \ 11 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 12 | 13 | python -m scripts.predict \ 14 | models=${MODEL_NAME} \ 15 | models.results=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} \ 16 | output.name=5fold-ctr-${MODEL_NAME}-${SAMPLING}-seed${seed} 17 | done -------------------------------------------------------------------------------- /scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from pathlib import Path 5 | 6 | import hydra 7 | from omegaconf import DictConfig 8 | 9 | from libs.data import DataStorage 10 | from libs.models import build_model 11 | 12 | 13 | @hydra.main(config_path="../config/", config_name="train", version_base="1.2.0") 14 | def _main(cfg: DictConfig): 15 | with warnings.catch_warnings(): 16 | warnings.filterwarnings("ignore", category=UserWarning) 17 | 18 | # load dataset 19 | data_storage = DataStorage(cfg) 20 | train_x, train_y = data_storage.load_train_dataset() 21 | 22 | # choose trainer 23 | trainer = build_model(cfg) 24 | 25 | # train model 26 | trainer.run_cv_training(train_x, train_y) 27 | 28 | # save model 29 | trainer.save_model(Path(cfg.models.path)) 30 | 31 | 32 | if __name__ == "__main__": 33 | _main() 34 | --------------------------------------------------------------------------------
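
A minimal, self-contained sketch (not a file in this repository) of the logit-space blending that calculate_sigmoid_preds in scripts/ensemble.py performs: each model's probabilities are mapped to logits, the logits are averaged, and the mean is pushed back through the sigmoid. The toy arrays below are illustrative assumptions, not competition data.

import numpy as np

# Two illustrative sets of model probabilities for three test rows.
preds_a = np.array([0.10, 0.80, 0.55])
preds_b = np.array([0.20, 0.70, 0.60])

# sigmoid(mean(logit(p))), the same rule calculate_sigmoid_preds applies.
values = np.stack([preds_a, preds_b])
logits = np.log(values / (1 - values))
blended = 1 / (1 + np.exp(-logits.mean(axis=0)))

print(blended)  # approximately [0.143, 0.753, 0.575]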
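
Similarly, a small sketch (again not a repository file) of the down-sampling rule in scripts/sampling.py: every clicked row is kept and only a cfg.data.sampling fraction of the non-clicked rows is drawn. The tiny frame, the F01 column, and the 0.45 fraction below are illustrative; only the "Click" label convention matches the real dataset.

import pandas as pd

# Toy frame using the dataset's "Click" label convention (values are made up).
df = pd.DataFrame({"Click": [1, 0, 0, 0, 0, 0, 0, 0, 1, 0], "F01": range(10)})

positive = df[df["Click"] == 1]  # keep every click
negative = df[df["Click"] == 0].sample(frac=0.45, replace=False, random_state=42)  # sample ~45% of non-clicks

train_sample = pd.concat([negative, positive], ignore_index=True)
print(len(positive), len(negative), len(train_sample))  # 2 clicks plus a 45% sample of the 8 non-clicks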