├── .github └── workflows │ ├── pythonpackage.yml │ ├── pythonpublish.yml │ └── weekly_test.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── conf.py ├── image │ └── mlflow.png ├── index.rst ├── installation.rst ├── make.bat ├── requirements.txt └── source │ ├── reference │ ├── ensemble.rst │ ├── experiment.rst │ ├── feature_store.rst │ ├── features.rst │ ├── hyper_parameters.rst │ ├── index.rst │ ├── util.rst │ └── validation.rst │ └── tutorial │ ├── experiment.rst │ ├── experiment_advanced.rst │ ├── feature_store.rst │ └── index.rst ├── examples ├── kaggle-bnp-paribas │ └── kaggle_bnp_paribas.py ├── kaggle-days-tokyo │ └── kaggle_days_tokyo.py ├── kaggle-plasticc │ └── kaggle_plasticc.py └── wine-quality │ └── wine-quality.py ├── nyaggle ├── __init__.py ├── ensemble │ ├── __init__.py │ ├── averaging.py │ ├── common.py │ └── stacking.py ├── environment.py ├── experiment │ ├── __init__.py │ ├── auto_prep.py │ ├── experiment.py │ ├── hyperparameter_tuner.py │ └── run.py ├── feature │ ├── __init__.py │ ├── base.py │ ├── category_encoder │ │ ├── __init__.py │ │ └── target_encoder.py │ ├── groupby.py │ └── nlp │ │ ├── __init__.py │ │ └── bert.py ├── feature_store │ ├── __init__.py │ └── feature_store.py ├── hyper_parameters │ ├── __init__.py │ ├── catboost.py │ ├── lightgbm.py │ ├── parameters.py │ └── xgboost.py ├── testing │ ├── __init__.py │ └── util.py ├── util │ ├── __init__.py │ ├── plot_importance.py │ ├── submission.py │ └── traits.py ├── validation │ ├── __init__.py │ ├── adversarial_validate.py │ ├── cross_validate.py │ └── split.py └── version.py ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── ensemble ├── test_averaging.py └── test_stacking.py ├── experiment ├── test_experiment.py ├── test_hyperparameter_tuner.py └── test_run.py ├── feature ├── category_encoder │ └── test_target_encoder.py ├── nlp │ └── test_bert.py └── test_groupby.py ├── feature_store └── test_feature_store.py └── validation ├── test_adversarial_validate.py ├── test_cross_validate.py └── test_split.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: ['3.8', '3.9', '3.10', '3.11'] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - uses: actions/cache@v1 21 | with: 22 | path: ~/.cache/pip 23 | key: > 24 | ${{ runner.os }}-pip- 25 | ${{ hashFiles('**/requirements.txt') }}- 26 | ${{ hashFiles('**/requirements-dev.txt') }} 27 | restore-keys: | 28 | ${{ runner.os }}-pip- 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt -r requirements-dev.txt 33 | - name: Install MeCab 34 | run: | 35 | sudo apt install mecab libmecab-dev mecab-ipadic-utf8 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | export PYTHONPATH=./ 45 | pytest --verbose --color=yes 46 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/weekly_test.yml: -------------------------------------------------------------------------------- 1 | name: weekly_test 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * 0" 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | strategy: 12 | max-parallel: 4 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11'] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - uses: actions/cache@v1 23 | with: 24 | path: ~/.cache/pip 25 | key: > 26 | ${{ runner.os }}-pip- 27 | ${{ hashFiles('**/requirements.txt') }}- 28 | ${{ hashFiles('**/requirements-dev.txt') }} 29 | restore-keys: | 30 | ${{ runner.os }}-pip- 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -r requirements.txt -r requirements-dev.txt 35 | - name: Install MeCab 36 | run: | 37 | sudo apt install mecab libmecab-dev mecab-ipadic-utf8 38 | - name: Lint with flake8 39 | run: | 40 | # stop the build if there are Python syntax errors or undefined names 41 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 42 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 43 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 44 | - name: Test with pytest 45 | run: | 46 | export PYTHONPATH=./ 47 | pytest --verbose --color=yes 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | nyaggle.egg-info/ 4 | .idea/ 5 | __pycache__/ 6 | .pytest_cache/ 7 | mlruns/ 8 | catboost_info 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 169 | #.idea/ 170 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements.txt 24 | - requirements: docs/requirements.txt 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 nyanp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
include *.txt README.md
recursive-include docs *.txt
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# nyaggle

![GitHub Actions CI Status](https://github.com/nyanp/nyaggle/workflows/Python%20package/badge.svg)
![GitHub Actions CI Status](https://github.com/nyanp/nyaggle/workflows/weekly_test/badge.svg)
![Python Versions](https://img.shields.io/pypi/pyversions/nyaggle.svg?logo=python&logoColor=white)
![Documentation Status](https://readthedocs.org/projects/nyaggle/badge/?version=latest)

[**Documentation**](https://nyaggle.readthedocs.io/en/latest/index.html)
| [**Slide (Japanese)**](https://docs.google.com/presentation/d/1jv3J7DISw8phZT4z9rqjM-azdrQ4L4wWJN5P-gKL6fA/edit?usp=sharing)

**nyaggle** is a utility library for Kaggle and offline competitions.
It is particularly focused on experiment tracking, feature engineering, and validation.

- **nyaggle.ensemble** - Averaging & stacking
- **nyaggle.experiment** - Experiment tracking
- **nyaggle.feature_store** - Lightweight feature storage using feather-format
- **nyaggle.features** - sklearn-compatible features
- **nyaggle.hyper_parameters** - Collection of GBDT hyper-parameters used in past Kaggle competitions
- **nyaggle.validation** - Adversarial validation & sklearn-compatible CV splitters

## Installation

You can install nyaggle via pip:

```bash
pip install nyaggle
```

## Examples

### Experiment Tracking

`run_experiment()` is a high-level API for experiments with cross validation.
It outputs parameters, metrics, out-of-fold predictions, test predictions,
feature importance, and submission.csv under the specified directory.

To enable mlflow tracking, include the optional `with_mlflow=True` parameter.

```python
from sklearn.model_selection import train_test_split

from nyaggle.experiment import run_experiment
from nyaggle.testing import make_classification_df

X, y = make_classification_df()
X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': 1000,
    'max_depth': 8
}

result = run_experiment(params,
                        X_train,
                        y_train,
                        X_test)

# You can get all the outputs needed in data science competitions with a single API call

print(result.test_prediction)  # Test prediction in numpy array
print(result.oof_prediction)   # Out-of-fold prediction in numpy array
print(result.models)           # Trained models for each fold
print(result.importance)       # Feature importance for each fold
print(result.metrics)          # Evaluation metrics for each fold
print(result.time)             # Elapsed time
print(result.submission_df)    # The output dataframe saved as submission.csv

# ...and all outputs have been saved under the logging directory (default: output/yyyymmdd_HHMMSS).
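
# If you prefer a different output location, run_experiment also accepts a
# logging_directory argument, where "{time}" is expanded to the current
# timestamp (a sketch based on the bundled example scripts; the path below
# is just an example):
result = run_experiment(params,
                        X_train,
                        y_train,
                        X_test,
                        logging_directory='output/my-experiment-{time}')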

# You can use it with mlflow and track your experiments through mlflow-ui
result = run_experiment(params,
                        X_train,
                        y_train,
                        X_test,
                        with_mlflow=True)
```

nyaggle also has a low-level API with an interface similar to
[mlflow tracking](https://www.mlflow.org/docs/latest/tracking.html) and [wandb](https://www.wandb.com/).

```python
from nyaggle.experiment import Experiment

with Experiment(logging_directory='./output/') as exp:
    # log key-value pair as a parameter
    exp.log_param('lr', 0.01)
    exp.log_param('optimizer', 'adam')

    # log text
    exp.log('blah blah blah')

    # log metric
    exp.log_metric('CV', 0.85)

    # log numpy ndarray, pandas dataframe and any artifacts
    exp.log_numpy('predicted', predicted)
    exp.log_dataframe('submission', sub, file_format='csv')
    exp.log_artifact('path-to-your-file')
```

### Feature Engineering

#### Target Encoding with K-Fold

```python
import pandas as pd

from sklearn.model_selection import KFold
from nyaggle.feature.category_encoder import TargetEncoder


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
all = pd.concat([train, test]).copy()

cat_cols = [c for c in train.columns if train[c].dtype == object]
target_col = 'y'

kf = KFold(5)

# Target encoding with K-fold
te = TargetEncoder(kf.split(train))

# use fit/fit_transform on training data, then apply transform to test data
train.loc[:, cat_cols] = te.fit_transform(train[cat_cols], train[target_col])
test.loc[:, cat_cols] = te.transform(test[cat_cols])

# ... or just call fit_transform on the concatenated data
all.loc[:, cat_cols] = te.fit_transform(all[cat_cols], all[target_col])
```

#### Text Vectorization using BERT

You need to install pytorch in your virtual environment to use BertSentenceVectorizer.
MeCab and mecab-python3 are also required if you use the Japanese BERT model.

```python
import pandas as pd
from nyaggle.feature.nlp import BertSentenceVectorizer


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
all = pd.concat([train, test]).copy()

text_cols = ['body']
target_col = 'y'
group_col = 'user_id'


# extract BERT-based sentence vectors
bv = BertSentenceVectorizer(text_columns=text_cols)

text_vector = bv.fit_transform(train)


# BERT + SVD, with cuda
bv = BertSentenceVectorizer(text_columns=text_cols, use_cuda=True, n_components=40)

text_vector_svd = bv.fit_transform(train)

# Japanese BERT
bv = BertSentenceVectorizer(text_columns=text_cols, lang='jp')

japanese_text_vector = bv.fit_transform(train)
```


### Adversarial Validation

```python
import pandas as pd
from nyaggle.validation import adversarial_validate

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

auc, importance = adversarial_validate(train, test, importance_type='gain')

```

### Validation Splitters

nyaggle provides a set of validation splitters that are compatible with sklearn.

```python
import pandas as pd
from sklearn.model_selection import cross_validate, KFold
from nyaggle.validation import TimeSeriesSplit, Take, Skip, Nth

train = pd.read_csv('train.csv', parse_dates=['dt'])

# time-series split
ts = TimeSeriesSplit(train['dt'])
ts.add_fold(train_interval=('2019-01-01', '2019-01-10'), test_interval=('2019-01-10', '2019-01-20'))
ts.add_fold(train_interval=('2019-01-06', '2019-01-15'), test_interval=('2019-01-15', '2019-01-25'))

cross_validate(..., cv=ts)

# take the first 3 folds out of 10
cross_validate(..., cv=Take(3, KFold(10)))

# skip the first 3 folds, and evaluate the remaining 7 folds
cross_validate(..., cv=Skip(3, KFold(10)))

# evaluate the 1st fold
cross_validate(..., cv=Nth(1, ts))

```

### Other Awesome Repositories

Here is a list of awesome repositories that provide general utility functions for data science competitions.
Please let me know if you have another one :)

- [jeongyoonlee/Kaggler](https://github.com/jeongyoonlee/Kaggler)
- [mxbi/mlcrate](https://github.com/mxbi/mlcrate)
- [analokmaus/kuma_utils](https://github.com/analokmaus/kuma_utils)
- [Far0n/kaggletils](https://github.com/Far0n/kaggletils)
- [MLWave/Kaggle-Ensemble-Guide](https://github.com/MLWave/Kaggle-Ensemble-Guide)
- [rushter/heamy](https://github.com/rushter/heamy)
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-------------------------------------------------------------------------------- /docs/conf.py: --------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
12 | # 13 | import os 14 | import sys 15 | import sphinx_rtd_theme 16 | 17 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 18 | 19 | import nyaggle 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'nyaggle' 24 | copyright = '2019, nyanp' 25 | author = 'nyanp' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = nyaggle.__version__ 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.napoleon', 39 | 'sphinx_autodoc_typehints', 40 | 'sphinx.ext.viewcode' 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # List of patterns, relative to source directory, that match files and 47 | # directories to ignore when looking for source files. 48 | # This pattern also affects html_static_path and html_extra_path. 49 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 50 | 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 57 | html_static_path = ['_static'] 58 | 59 | # The master toctree document. 60 | master_doc = 'index' 61 | 62 | # The theme to use for HTML and HTML Help pages. See the documentation for 63 | # a list of builtin themes. 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | # Theme options are theme-specific and customize the look and feel of a theme 67 | # further. For a list of options available for each theme, see the 68 | # documentation. 69 | 70 | # Add any paths that contain custom themes here, relative to this directory. 71 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 72 | -------------------------------------------------------------------------------- /docs/image/mlflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyanp/nyaggle/636532292d7ce3468cd47a3337bc50d620f0d23b/docs/image/mlflow.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. nyaggle documentation master file, created by 2 | sphinx-quickstart on Thu Dec 26 08:09:20 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to nyaggle's documentation! 7 | =================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | installation 14 | source/tutorial/index 15 | source/reference/index 16 | 17 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | =================================== 3 | 4 | You can install nyaggle via pip: 5 | 6 | 7 | .. 
code-block:: bash

    pip install nyaggle    # Install core parts of nyaggle


nyaggle does not install the following packages by default:

- catboost
- lightgbm
- xgboost
- mlflow
- pytorch


Modules that depend on these packages won't work until you also install them.
For example, ``run_experiment`` with the ``algorithm_type='xgb'``, ``'lgbm'`` and ``'cat'`` options won't work
until you also install xgboost, lightgbm and catboost respectively.

If you want to install everything required in nyaggle, this command can be used:

.. code-block:: bash

    pip install nyaggle[all]    # Install everything


If you use the :code:`lang=ja` option in :code:`BertSentenceVectorizer`,
you also need to install MeCab and the mecab-python3 package in your environment.
-------------------------------------------------------------------------------- /docs/make.bat: --------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
sphinx-autodoc-typehints
-------------------------------------------------------------------------------- /docs/source/reference/ensemble.rst: --------------------------------------------------------------------------------
nyaggle.ensemble
-----------------------

.. automodule:: nyaggle.ensemble
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/experiment.rst: --------------------------------------------------------------------------------
nyaggle.experiment
-----------------------

.. automodule:: nyaggle.experiment
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/feature_store.rst: --------------------------------------------------------------------------------
nyaggle.feature_store
---------------------------

.. automodule:: nyaggle.feature_store
    :members:
    :imported-members:

-------------------------------------------------------------------------------- /docs/source/reference/features.rst: --------------------------------------------------------------------------------
nyaggle.feature
----------------------------------------

.. automodule:: nyaggle.feature.category_encoder
    :members:
    :imported-members:
    :inherited-members:

.. automodule:: nyaggle.feature.nlp
    :members:
    :imported-members:


.. automodule:: nyaggle.feature.groupby
    :members:
-------------------------------------------------------------------------------- /docs/source/reference/hyper_parameters.rst: --------------------------------------------------------------------------------
nyaggle.hyper_parameters
--------------------------

.. automodule:: nyaggle.hyper_parameters
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/index.rst: --------------------------------------------------------------------------------
API Reference
=============

.. toctree::
    :maxdepth: 1

    ensemble
    experiment
    feature_store
    features
    hyper_parameters
    util
    validation
-------------------------------------------------------------------------------- /docs/source/reference/util.rst: --------------------------------------------------------------------------------
nyaggle.util
-----------------------

.. automodule:: nyaggle.util
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/validation.rst: --------------------------------------------------------------------------------
nyaggle.validation
--------------------------

.. automodule:: nyaggle.validation
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/tutorial/experiment.rst: --------------------------------------------------------------------------------
Tracking your machine learning experiments with run_experiment
===============================================================

Concept
-------------------------------


In a typical tabular data competition, you probably evaluate your ideas repeatedly
by cross-validation, logging the parameters and results to track your experiments.

``nyaggle.experiment.run_experiment`` is an API for such situations.
If you are using LightGBM as your model, the code will be quite simple:


.. code-block:: python

    import pandas as pd
    from nyaggle.experiment import run_experiment

    INPUT_DIR = '../input'
    target_column = 'target'

    X_train = pd.read_csv(f'{INPUT_DIR}/train.csv')
    X_test = pd.read_csv(f'{INPUT_DIR}/test.csv')
    sample_df = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')  # OPTIONAL

    y = X_train[target_column]
    X_train = X_train.drop(target_column, axis=1)

    lightgbm_params = {
        'max_depth': 8
    }

    result = run_experiment(lightgbm_params,
                            X_train,
                            y,
                            X_test,
                            sample_submission=sample_df)


The ``run_experiment`` API performs cross-validation and stores artifacts to the logging directory.
You will see the output files stored as follows:

::

    output
    └── 20200130123456          # yyyymmddHHMMSS
        ├── params.json         # Parameters
        ├── metrics.json        # Metrics (single fold & overall CV score)
        ├── oof_prediction.npy  # Out-of-fold prediction
        ├── test_prediction.npy # Test prediction
        ├── 20200130123456.csv  # Submission csv file
        ├── importances.png     # Feature importance plot
        ├── log.txt             # Log file
        └── models              # The trained models for each fold
            ├── fold1
            ├── fold2
            ├── fold3
            ├── fold4
            └── fold5



.. hint::
    The default validation strategy is a 5-fold CV. You can change this behavior by passing the ``cv`` parameter
    (see the API reference for details).


If you want to use XGBoost, CatBoost or other sklearn estimators,
specify the type of algorithm:


.. code-block:: python

    # CatBoost
    catboost_params = {
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'depth': 8,
        'task_type': 'GPU'
    }
    result = run_experiment(catboost_params,
                            X_train,
                            y,
                            X_test,
                            algorithm_type='cat')

    # XGBoost
    xgboost_params = {
        'objective': 'reg:linear',
        'max_depth': 8
    }
    result = run_experiment(xgboost_params,
                            X_train,
                            y,
                            X_test,
                            algorithm_type='xgb')

    # sklearn estimator
    from sklearn.linear_model import Ridge
    ridge_params = {
        'alpha': 1.0
    }
    result = run_experiment(ridge_params,
                            X_train,
                            y,
                            X_test,
                            algorithm_type=Ridge)



.. hint::
    The parameters will be passed to the constructor of the sklearn API (e.g. ``LGBMClassifier``).


Collaborating with mlflow
------------------------------

If you want a GUI dashboard to manage your experiments, you can use ``run_experiment``
with mlflow by just setting ``with_mlflow=True`` (you need to install mlflow beforehand).


.. code-block:: python

    result = run_experiment(params,
                            X_train,
                            y,
                            X_test,
                            with_mlflow=True)



In the same directory as the executed script, run

.. code-block:: bash

    mlflow ui



and view it at http://localhost:5000 .
On this page, you can see the list of experiments with CV scores and parameters.


.. image:: ../../image/mlflow.png


If you want to customize the logging behavior, you can call ``run_experiment`` in
the context of an mlflow run. If there is an active run, ``run_experiment`` will use the
currently active run instead of creating a new one.


.. code-block:: python

    mlflow.set_tracking_uri('gs://ok-i-want-to-use-gcs')

    with mlflow.start_run(run_name='your-favorite-run-name'):
        mlflow.log_param('something-you-want-to-log', 42)

        result = run_experiment(params,
                                X_train,
                                y,
                                X_test,
                                with_mlflow=True)





What does ``run_experiment`` not do?
-------------------------------------

``run_experiment`` can be considered a cross-validation API with logging functionality.
Therefore, you have to choose model parameters and perform feature engineering yourself.
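One exception is hyperparameter search for GBDT models: as the bundled example
scripts do, you can pass ``with_auto_hpo=True`` to let ``run_experiment`` tune
the given parameters before training. A minimal sketch:

.. code-block:: python

    result = run_experiment(lightgbm_params,
                            X_train,
                            y,
                            X_test,
                            with_auto_hpo=True)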
-------------------------------------------------------------------------------- /docs/source/tutorial/experiment_advanced.rst: --------------------------------------------------------------------------------
Advanced usage
==============================

Using low-level experiment API
-------------------------------

While nyaggle provides ``run_experiment`` as a high-level API,
the ``Experiment`` class can be used as a low-level API that provides primitive functionality for logging experiments.

It is useful when you want to track something other than CV, or need to implement your own CV logic.


.. code-block:: python

    from nyaggle.experiment import Experiment


    with Experiment(logging_directory='./output/') as exp:
        # log key-value pair as a parameter
        exp.log_param('lr', 0.01)
        exp.log_param('optimizer', 'adam')

        # log text
        exp.log('blah blah blah')

        # log metric
        exp.log_metric('CV', 0.85)

        # log numpy ndarray
        exp.log_numpy('predicted', predicted)

        # log pandas dataframe
        exp.log_dataframe('submission', sub, file_format='csv')

        # log any file
        exp.log_artifact('path-to-your-file')


    # you can continue logging from an existing result
    with Experiment.continue_from('./output') as exp:
        ...


If you are familiar with mlflow tracking, you may notice that these APIs are similar to mlflow.
``Experiment`` can be treated as a thin wrapper around it if you pass ``with_mlflow=True`` to the constructor.


.. code-block:: python

    from nyaggle.experiment import Experiment

    with Experiment(logging_directory='./output/', with_mlflow=True) as exp:
        # log as you want, and you can see the result in the mlflow ui
        ...



Logging extra parameters to run_experiment
-------------------------------------------

By using the ``inherit_experiment`` parameter, you can mix any additional logging with the results ``run_experiment`` will create.
In the following example, nyaggle records the result of ``run_experiment`` under the same experiment as
the parameter and metric written outside of the function.

.. code-block:: python

    from nyaggle.experiment import Experiment, run_experiment

    with Experiment(logging_directory='./output/') as exp:

        exp.log_param('my extra param', 'bar')

        run_experiment(..., inherit_experiment=exp)

        exp.log_metric('my extra metrics', 0.999)


Tracking seed averaging experiments
---------------------------------------

If you train a bunch of models with different seeds to ensemble them, tracking the individual models with mlflow
will fill the GUI with their results and make them difficult to manage.
The nested-run functionality of mlflow is useful for displaying multiple models together as one result.

.. code-block:: python

    import mlflow
    from nyaggle.ensemble import averaging
    from nyaggle.util import make_submission_df

    mlflow.start_run()
    base_logging_dir = './seed-avg/'
    results = []

    for i in range(3):
        mlflow.start_run(nested=True)  # use nested runs to place each experiment under the parent run
        params['seed'] = i

        result = run_experiment(params,
                                X_train,
                                y_train,
                                X_test,
                                logging_directory=base_logging_dir + f'seed_{i}',
                                with_mlflow=True)
        results.append(result)

        mlflow.end_run()


    ensemble = averaging([result.test_prediction for result in results])
    sub = make_submission_df(ensemble.test_prediction, pd.read_csv('sample_submission.csv'))
    sub.to_csv('ensemble_sub.csv', index=False)
-------------------------------------------------------------------------------- /docs/source/tutorial/feature_store.rst: --------------------------------------------------------------------------------
Feature management using feature_store
=======================================

Concept
-------------------------------

Feature engineering is one of the most important parts of Kaggle.
If you do a lot of feature engineering, it is time-consuming to calculate
features each time you build a model.

Many skilled Kagglers save their features to local disk as binary (npy, pickle or feather)
to manage their features [1]_ [2]_ [3]_ [4]_.

``feature_store`` provides simple helper APIs for feature management.


.. code-block:: python

    import pandas as pd
    import nyaggle.feature_store as fs

    def make_feature_1(df: pd.DataFrame) -> pd.DataFrame:
        return ...

    def make_feature_2(df: pd.DataFrame) -> pd.DataFrame:
        return ...

    # feature 1
    feature_1 = make_feature_1(df)

    # feature 2
    feature_2 = make_feature_2(df)

    # the name can be str or int
    fs.save_feature(feature_1, "my_feature_1")
    fs.save_feature(feature_2, 42, '../my_favorite_feature_store')  # change the directory where the feature is saved


``save_feature`` stores the dataframe in feather format under the feature directory (``./features`` by default).
If you want to load a feature, just call ``load_feature`` with its name.

.. code-block:: python

    feature_1_restored = fs.load_feature("my_feature_1")
    feature_2_restored = fs.load_feature(42)


To merge all features into the main dataframe, call ``load_features`` with the main dataframe you want to merge with.


.. code-block:: python

    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    base_df = pd.concat([train, test])

    df_with_features = fs.load_features(base_df, ["my_feature_1", "magic_1", "leaky_1"])


.. note::
    ``load_features`` assumes that the stored feature values are concatenated in the
    order [train, test].


If you don't like separating your feature engineering code into an independent module,
the ``cached_feature`` decorator provides cache functionality.
A function with this decorator automatically saves its return value using ``save_feature`` on the first call,
and returns the result of ``load_feature`` on subsequent calls instead of executing the function body.

.. code-block:: python

    import pandas as pd
    import nyaggle.feature_store as fs

    @fs.cached_feature("my_feature_1")
    def make_feature_1(df: pd.DataFrame) -> pd.DataFrame:
        ...
        return result

    # saves automatically to features/my_feature_1.f
    feature_1 = make_feature_1(df)

    # loads from the saved binary instead of calling make_feature_1
    feature_1 = make_feature_1(df)


.. note::
    The function decorated by ``cached_feature`` must return a pandas DataFrame.


Use with ``run_experiment``
-------------------------------

If you pass the ``feature_list`` and ``feature_directory`` parameters to the ``run_experiment`` API,
nyaggle will combine the specified features with the given dataframe before performing cross-validation.

The list of features is logged as a parameter (and can of course be seen in the mlflow ui),
which makes your experiment cycle much simpler.

.. code-block:: python

    import pandas as pd
    import nyaggle.feature_store as fs
    from nyaggle.experiment import run_experiment

    run_experiment(params,
                   X_train,
                   y,
                   X_test,
                   feature_list=["my_feature_1", "magic_1", "leaky_1"],
                   feature_directory="../my_features")




Reference
-------------------------------


.. [1] https://www.kaggle.com/c/avito-demand-prediction/discussion/59881
.. [2] https://github.com/flowlight0/talkingdata-adtracking-fraud-detection
.. [3] https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/55581
.. [4] https://amalog.hateblo.jp/entry/kaggle-feature-management
-------------------------------------------------------------------------------- /docs/source/tutorial/index.rst: --------------------------------------------------------------------------------
Tutorial
========

..
toctree:: 5 | :maxdepth: 1 6 | 7 | experiment 8 | feature_store 9 | experiment_advanced -------------------------------------------------------------------------------- /examples/kaggle-bnp-paribas/kaggle_bnp_paribas.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | from sklearn.metrics import log_loss 5 | from nyaggle.experiment import run_experiment 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-g', '--gpu', action='store_true', default=False) 12 | args = parser.parse_args() 13 | 14 | X_train = pd.read_csv('train.csv', index_col='ID') 15 | X_test = pd.read_csv('test.csv', index_col='ID') 16 | y_train = X_train['target'] 17 | X_train = X_train.drop('target', axis=1) 18 | 19 | cat_params = { 20 | 'eval_metric': 'Logloss', 21 | 'loss_function': 'Logloss', 22 | 'metric_period': 10, 23 | 'depth': 8, 24 | 'task_type': 'GPU' if args.gpu else 'CPU' 25 | } 26 | 27 | result = run_experiment(cat_params, X_train, y_train, X_test, logging_directory='bnp-paribas-{time}', 28 | eval_func=log_loss, 29 | algorithm_type='cat', 30 | sample_submission=pd.read_csv('sample_submission.csv'), 31 | with_mlflow=True) 32 | -------------------------------------------------------------------------------- /examples/kaggle-days-tokyo/kaggle_days_tokyo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import mean_squared_error 3 | from sklearn.model_selection import GroupKFold 4 | 5 | from nyaggle.experiment import run_experiment 6 | from nyaggle.feature.category_encoder import TargetEncoder 7 | 8 | lgb_params = { 9 | "objective": "rmse", 10 | "n_estimators": 2000, 11 | "max_depth": 10, 12 | "colsample_bytree": 0.8 13 | } 14 | 15 | X_train = pd.read_csv('train.csv') 16 | X_test = pd.read_csv('test.csv') 17 | y_train = X_train['age'] 18 | X_train = X_train.drop('age', axis=1) 19 | 20 | te_cols = [c for c in X_train.columns if X_train[c].dtype.name == 'object' and c not in ['user_id', 'ts']] 21 | te = TargetEncoder(cv=GroupKFold(5), cols=te_cols, groups=X_train['user_id']).fit(X_train, y_train) 22 | 23 | 24 | def transform(te: TargetEncoder, df: pd.DataFrame, y: pd.Series): 25 | df.drop('ts', axis=1, inplace=True) 26 | 27 | if y is not None: 28 | df = te.fit_transform(df, y) 29 | y = y.groupby(df['user_id']).min() 30 | else: 31 | df = te.transform(df) 32 | 33 | df = df.groupby('user_id').agg(['mean', 'min', 'max']) 34 | df.columns = [e[0] + '_' + e[1] for e in df.columns] 35 | return df, y 36 | 37 | 38 | X_train, y_train = transform(te, X_train, y_train) 39 | X_test, _ = transform(te, X_test, None) 40 | 41 | # generated submission.csv scores 11.61445 in private LB (35th) 42 | run_experiment(logging_directory='baseline_kaggledays_tokyo', 43 | model_params=lgb_params, 44 | X_train=X_train, 45 | y=y_train, 46 | X_test=X_test, 47 | eval_func=mean_squared_error, 48 | type_of_target='continuous', 49 | if_exists='replace', 50 | with_auto_hpo=True, 51 | sample_submission=pd.read_csv('sample_submission.csv')) 52 | -------------------------------------------------------------------------------- /examples/kaggle-plasticc/kaggle_plasticc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.model_selection import StratifiedKFold 4 | from nyaggle.experiment import run_experiment 5 | 6 | 7 | meta = 
pd.read_csv('training_set_metadata.csv') 8 | 9 | is_extra = meta.hostgal_photoz > 0.0 10 | meta_extra = meta[is_extra] 11 | meta_inner = meta[~is_extra] 12 | 13 | lgb_param_extra = { 14 | 'objective': 'multiclass', 15 | 'metric': 'multi_logloss', 16 | 'num_class': 9 17 | } 18 | 19 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) 20 | 21 | result_extra = run_experiment(lgb_param_extra, 22 | meta_extra.drop('target', axis=1), 23 | meta_extra['target'], 24 | logging_directory='plasticc-{time}', 25 | cv=skf, 26 | type_of_target='multiclass') 27 | -------------------------------------------------------------------------------- /examples/wine-quality/wine-quality.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | 4 | from nyaggle.experiment import run_experiment 5 | 6 | 7 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 8 | 9 | data = pd.read_csv(csv_url, sep=';') 10 | 11 | X = data.drop('quality', axis=1) 12 | y = data['quality'] 13 | 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 15 | 16 | 17 | params = { 18 | 'max_depth': 4, 19 | 'n_estimators': 1000, 20 | 'reg_alpha': 0.1 21 | } 22 | 23 | result = run_experiment(params, 24 | X_train, 25 | y_train, 26 | X_test, 27 | './wine-quality-{time}', 28 | type_of_target='continuous', 29 | with_mlflow=True, 30 | with_auto_hpo=True) 31 | -------------------------------------------------------------------------------- /nyaggle/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.version import __version__ 2 | -------------------------------------------------------------------------------- /nyaggle/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.ensemble.averaging import averaging, averaging_opt 2 | from nyaggle.ensemble.stacking import stacking -------------------------------------------------------------------------------- /nyaggle/ensemble/averaging.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Tuple 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import scipy.stats as stats 6 | from scipy.optimize import minimize 7 | 8 | from nyaggle.ensemble.common import EnsembleResult 9 | 10 | 11 | def averaging(test_predictions: List[np.ndarray], 12 | oof_predictions: Optional[List[np.ndarray]] = None, 13 | y: Optional[pd.Series] = None, 14 | weights: Optional[List[float]] = None, 15 | eval_func: Optional[Callable] = None, 16 | rank_averaging: bool = False) -> EnsembleResult: 17 | """ 18 | Perform averaging on model predictions. 19 | 20 | Args: 21 | test_predictions: 22 | List of predicted values on test data. 23 | oof_predictions: 24 | List of predicted values on out-of-fold training data. 25 | y: 26 | Target value 27 | weights: 28 | Weights for each predictions 29 | eval_func: 30 | Evaluation metric used for calculating result score. Used only if ``oof_predictions`` and ``y`` are given. 31 | rank_averaging: 32 | If ``True``, predictions will be converted to rank before averaging. 33 | Returns: 34 | Namedtuple with following members 35 | 36 | * test_prediction: 37 | numpy array, Average prediction on test data. 38 | * oof_prediction: 39 | numpy array, Average prediction on Out-of-Fold validation data. 
``None`` if ``oof_predictions`` = ``None``. 40 | * score: 41 | float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``. 42 | """ 43 | if weights is None: 44 | weights = np.ones((len(test_predictions))) / len(test_predictions) 45 | 46 | if rank_averaging: 47 | test_predictions, oof_predictions = _to_rank(test_predictions, oof_predictions) 48 | 49 | def _weighted_average(predictions: List[np.ndarray], weights: List[float]): 50 | if len(predictions) != len(weights): 51 | raise ValueError('len(predictions) != len(weights)') 52 | average = np.zeros_like(predictions[0]) 53 | 54 | for i, weight in enumerate(weights): 55 | if predictions[i].shape != average.shape: 56 | raise ValueError('predictions[{}].shape != predictions[0].shape'.format(i)) 57 | average += predictions[i] * weight 58 | 59 | return average 60 | 61 | average_test = _weighted_average(test_predictions, weights) 62 | if oof_predictions is not None: 63 | average_oof = _weighted_average(oof_predictions, weights) 64 | score = eval_func(y, average_oof) if eval_func is not None else None 65 | else: 66 | average_oof = None 67 | score = None 68 | 69 | return EnsembleResult(average_test, average_oof, score) 70 | 71 | 72 | def averaging_opt(test_predictions: List[np.ndarray], 73 | oof_predictions: Optional[List[np.ndarray]], 74 | y: Optional[pd.Series], 75 | eval_func: Optional[Callable[[np.ndarray, np.ndarray], float]], 76 | higher_is_better: bool, 77 | weight_bounds: Tuple[float, float] = (0.0, 1.0), 78 | rank_averaging: bool = False, 79 | method: Optional[str] = None) -> EnsembleResult: 80 | """ 81 | Perform averaging with optimal weights using scipy.optimize. 82 | 83 | Args: 84 | test_predictions: 85 | List of predicted values on test data. 86 | oof_predictions: 87 | List of predicted values on out-of-fold training data. 88 | y: 89 | Target value 90 | eval_func: 91 | Evaluation metric f(y_true, y_pred) used for calculating result score. 92 | Used only if ``oof_predictions`` and ``y`` are given. 93 | higher_is_better: 94 | Determine the direction of optimize ``eval_func``. 95 | weight_bounds: 96 | Specify lower/upper bounds of each weight. 97 | rank_averaging: 98 | If ``True``, predictions will be converted to rank before averaging. 99 | method: 100 | Type of solver. If ``None``, SLSQP will be used. 101 | Returns: 102 | Namedtuple with following members 103 | 104 | * test_prediction: 105 | numpy array, Average prediction on test data. 106 | * oof_prediction: 107 | numpy array, Average prediction on Out-of-Fold validation data. ``None`` if ``oof_predictions`` = ``None``. 108 | * score: 109 | float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``. 
110 | """ 111 | 112 | def _minimize(weights): 113 | prediction = np.zeros_like(oof_predictions[0]) 114 | for weight, oof in zip(weights, oof_predictions): 115 | prediction += weight * oof 116 | oof_score = eval_func(y, prediction) 117 | 118 | return -oof_score if higher_is_better else oof_score 119 | 120 | weights = np.ones((len(test_predictions))) / len(test_predictions) 121 | 122 | if rank_averaging: 123 | test_predictions, oof_predictions = _to_rank(test_predictions, oof_predictions) 124 | 125 | method = method or 'SLSQP' 126 | 127 | if method in ['COBYLA', 'SLSQP', 'trust-constr']: 128 | cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)}) 129 | else: 130 | cons = None 131 | 132 | bounds = [weight_bounds] * len(test_predictions) 133 | 134 | result = minimize(_minimize, weights, method=method, constraints=cons, bounds=bounds) 135 | 136 | return averaging(test_predictions, oof_predictions, y, result['x'], eval_func) 137 | 138 | 139 | def _to_rank(test_predictions: List[np.ndarray], oof_predictions: Optional[List[np.ndarray]]): 140 | if oof_predictions is not None: 141 | oof_predictions = [stats.rankdata(oof) / len(oof) for oof in oof_predictions] 142 | test_predictions = [stats.rankdata(test) / len(test) for test in test_predictions] 143 | 144 | return test_predictions, oof_predictions 145 | -------------------------------------------------------------------------------- /nyaggle/ensemble/common.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | EnsembleResult = namedtuple('EnsembleResult', ['test_prediction', 'oof_prediction', 'score']) 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /nyaggle/ensemble/stacking.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, List, Union, Optional 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sklearn.utils.multiclass as multiclass 6 | from category_encoders.utils import convert_input, convert_input_vector 7 | from sklearn.base import BaseEstimator 8 | from sklearn.linear_model import LogisticRegression, Ridge 9 | from sklearn.model_selection import BaseCrossValidator, GridSearchCV 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.preprocessing import StandardScaler 12 | 13 | from nyaggle.ensemble.common import EnsembleResult 14 | from nyaggle.validation import cross_validate 15 | 16 | 17 | def stacking(test_predictions: List[np.ndarray], 18 | oof_predictions: List[np.ndarray], 19 | y: pd.Series, 20 | estimator: Optional[BaseEstimator] = None, 21 | cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, 22 | groups: Optional[pd.Series] = None, 23 | type_of_target: str = 'auto', 24 | eval_func: Optional[Callable] = None) -> EnsembleResult: 25 | """ 26 | Perform stacking on predictions. 27 | 28 | Args: 29 | test_predictions: 30 | List of predicted values on test data. 31 | oof_predictions: 32 | List of predicted values on out-of-fold training data. 33 | y: 34 | Target value 35 | estimator: 36 | Estimator used for the 2nd-level model. 37 | If ``None``, the default estimator (auto-tuned linear model) will be used. 38 | cv: 39 | int, cross-validation generator or an iterable which determines the cross-validation splitting strategy. 
40 | 
41 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
42 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
43 |             - CV splitter (the instance of ``BaseCrossValidator``),
44 |             - An iterable yielding (train, test) splits as arrays of indices.
45 |         groups:
46 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
47 |         type_of_target:
48 |             The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
49 |             Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
50 |         eval_func:
51 |             Evaluation metric used for calculating result score. Used only if ``oof_predictions`` and ``y`` are given.
52 |     Returns:
53 |         Namedtuple with the following members
54 | 
55 |         * test_prediction:
56 |             numpy array, Average prediction on test data.
57 |         * oof_prediction:
58 |             numpy array, Average prediction on Out-of-Fold validation data. ``None`` if ``oof_predictions`` = ``None``.
59 |         * score:
60 |             float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``.
61 |     """
62 |     assert len(oof_predictions) == len(test_predictions), "Number of oof and test predictions should be the same"
63 | 
64 |     def _stack(predictions):
65 |         if predictions[0].ndim == 1:
66 |             predictions = [p.reshape(len(p), -1) for p in predictions]
67 |         return np.hstack(predictions)
68 | 
69 |     X_train = convert_input(_stack(oof_predictions))
70 |     y = convert_input_vector(y, X_train.index)
71 |     X_test = convert_input(_stack(test_predictions))
72 | 
73 |     assert len(X_train) == len(y)
74 | 
75 |     if type_of_target == 'auto':
76 |         type_of_target = multiclass.type_of_target(y)
77 | 
78 |     if estimator is None:
79 |         # if estimator is None, tuned linear estimator is used
80 |         if type_of_target == 'continuous':
81 |             estimator = make_pipeline(StandardScaler(), Ridge(random_state=0))
82 |             param_grid = {
83 |                 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10],
84 |             }
85 |         else:
86 |             estimator = LogisticRegression(random_state=0, solver='liblinear')
87 |             param_grid = {
88 |                 'penalty': ['l1', 'l2'],
89 |                 'C': [0.001, 0.01, 0.1, 1, 10],
90 |             }
91 |         grid_search = GridSearchCV(estimator, param_grid, cv=cv)
92 |         grid_search.fit(X_train, y, groups=groups)
93 |         estimator = grid_search.best_estimator_
94 | 
95 |     result = cross_validate(estimator, X_train, y, X_test, cv=cv, groups=groups, eval_func=eval_func, type_of_target=type_of_target)
96 |     score = result.scores[-1] if result.scores else None
97 | 
98 |     return EnsembleResult(result.test_prediction, result.oof_prediction, score)
99 | 
--------------------------------------------------------------------------------
/nyaggle/environment.py:
--------------------------------------------------------------------------------
1 | # pytorch
2 | 
3 | try:
4 |     import torch
5 |     _has_torch = True
6 | except ImportError:
7 |     _has_torch = False
8 | 
9 | 
10 | def requires_torch():
11 |     if not _has_torch:
12 |         raise ImportError('You need to install pytorch before using this API.')
13 | 
14 | 
15 | # mlflow
16 | 
17 | try:
18 |     import mlflow
19 |     _has_mlflow = True
20 | except ImportError:
21 |     _has_mlflow = False
22 | 
23 | 
24 | def requires_mlflow():
25 |     if not _has_mlflow:
26 |         raise ImportError('You need to install mlflow before using this API.')
27 | 
28 | 
29 | # lightgbm
30 | 
31 | 
32 | try:
33 |     import lightgbm
34 |     _has_lightgbm = True
35 | except ImportError:
36 |     _has_lightgbm = False
37 | 
38 | 
39 | def requires_lightgbm():
40 |     if not _has_lightgbm:
41 |         raise ImportError('You need to install lightgbm before using this API.')
42 | 
43 | 
44 | # catboost
45 | 
46 | 
47 | try:
48 |     import catboost
49 |     _has_catboost = True
50 |     # TODO check catboost version >= 0.17
51 | except ImportError:
52 |     _has_catboost = False
53 | 
54 | 
55 | def requires_catboost():
56 |     if not _has_catboost:
57 |         raise ImportError('You need to install catboost before using this API.')
58 | 
59 | 
60 | # xgboost
61 | 
62 | 
63 | try:
64 |     import xgboost
65 |     _has_xgboost = True
66 | except ImportError:
67 |     _has_xgboost = False
68 | 
69 | 
70 | def requires_xgboost():
71 |     if not _has_xgboost:
72 |         raise ImportError('You need to install xgboost before using this API.')
73 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/__init__.py:
--------------------------------------------------------------------------------
1 | from nyaggle.experiment.experiment import Experiment, add_leaderboard_score
2 | from nyaggle.experiment.run import autoprep_gbdt, run_experiment, find_best_lgbm_parameter
3 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/auto_prep.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | from pandas.api.types import is_integer_dtype, is_categorical_dtype
6 | from sklearn.preprocessing import LabelEncoder
7 | 
8 | 
9 | def autoprep_gbdt(algorithm_type: str, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
10 |                   categorical_feature_to_treat: Optional[List[str]] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
11 |     if categorical_feature_to_treat is None:
12 |         categorical_feature_to_treat = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
13 | 
14 |     # LightGBM:
15 |     # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
16 |     # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
17 |     #
18 |     # CatBoost:
19 |     # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
20 |     # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
21 |     #
22 |     # XGBoost:
23 |     # All categorical columns should be encoded beforehand.
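    #
    # A minimal usage sketch (hedged; 'city' is a hypothetical object column,
    # and 'xgb' exercises the label-encoding branch below):
    #
    #   X_tr = pd.DataFrame({'city': ['tokyo', 'osaka'], 'v': [1, 2]})
    #   X_te = pd.DataFrame({'city': ['tokyo', None], 'v': [3, 4]})
    #   X_tr, X_te = autoprep_gbdt('xgb', X_tr, X_te)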
24 | 
25 |     if algorithm_type == 'lgbm':
26 |         # LightGBM can handle categorical dtype natively
27 |         categorical_feature_to_treat = [c for c in categorical_feature_to_treat if not is_categorical_dtype(X_train[c])]
28 | 
29 |     if algorithm_type == 'cat' and len(categorical_feature_to_treat) > 0:
30 |         X_train = X_train.copy()
31 |         X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
32 |         for c in categorical_feature_to_treat:
33 |             X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
34 | 
35 |     if algorithm_type in ('lgbm', 'xgb') and len(categorical_feature_to_treat) > 0:
36 |         assert X_test is not None, "X_test is required when categorical variables are label-encoded"
37 |         X_train = X_train.copy()
38 |         X_test = X_test.copy()
39 | 
40 |         for c in categorical_feature_to_treat:
41 |             X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
42 |             le = LabelEncoder()
43 |             concat = np.concatenate([X_train[c].values, X_test[c].values])
44 |             concat = le.fit_transform(concat)
45 |             X_train[c] = concat[:len(X_train)]
46 |             X_test[c] = concat[len(X_train):]
47 | 
48 |     return X_train, X_test
49 | 
50 | 
51 | def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]:
52 |     if is_categorical_dtype(strain):
53 |         return strain.cat.codes, stest.cat.codes
54 |     elif is_integer_dtype(strain.dtype):
55 |         fillval = min(strain.min(), stest.min()) - 1
56 |         return strain.fillna(fillval), stest.fillna(fillval)
57 |     else:
58 |         return strain.astype(str), stest.astype(str)
59 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/experiment.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numbers
3 | import os
4 | import shutil
5 | import uuid
6 | import warnings
7 | from logging import getLogger, FileHandler, DEBUG, Logger
8 | from typing import Dict, Optional
9 | 
10 | import numpy as np
11 | import pandas as pd
12 | 
13 | from nyaggle.environment import requires_mlflow
14 | 
15 | MLFLOW_KEY_LENGTH_LIMIT = 250
16 | MLFLOW_VALUE_LENGTH_LIMIT = 250
17 | 
18 | 
19 | def _sanitize_mlflow_param(param, limit):
20 |     if len(str(param)) > limit:
21 |         warnings.warn('Length of param exceeds limit {}. It will be truncated. value: {}'.format(limit, param))
22 |         param = str(param)[:limit]
23 |     return param
24 | 
25 | 
26 | def _check_directory(directory: str, if_exists: str) -> str:
27 |     if os.path.exists(directory):
28 |         if if_exists == 'error':
29 |             raise ValueError('directory {} already exists.'.format(directory))
30 |         elif if_exists == 'replace':
31 |             warnings.warn(
32 |                 'directory {} already exists. It will be replaced by the new result'.format(directory))
33 | 
34 |             existing_run_id = _try_to_get_existing_mlflow_run_id(directory)
35 |             if existing_run_id is not None:
36 |                 requires_mlflow()
37 |                 import mlflow
38 |                 mlflow.delete_run(existing_run_id)
39 | 
40 |             shutil.rmtree(directory, ignore_errors=True)
41 |         elif if_exists == 'rename':
42 |             postfix_index = 1
43 | 
44 |             while os.path.exists(directory + '_' + str(postfix_index)):
45 |                 postfix_index += 1
46 | 
47 |             directory += '_' + str(postfix_index)
48 |             warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
49 |     return directory
50 | 
51 | 
52 | def _sanitize(v):
53 |     return v if isinstance(v, numbers.Number) else str(v)
54 | 
55 | 
56 | def _try_to_get_existing_mlflow_run_id(logging_directory: str) -> Optional[str]:
57 |     mlflow_path = os.path.join(logging_directory, 'mlflow.json')
58 |     if os.path.exists(mlflow_path):
59 |         with open(mlflow_path, 'r') as f:
60 |             mlflow_metadata = json.load(f)
61 |             return mlflow_metadata['run_id']
62 |     return None
63 | 
64 | 
65 | class Experiment(object):
66 |     """Minimal experiment logger for Kaggle
67 | 
68 |     This module provides minimal functionality for tracking experiments.
69 |     The output files are laid out as follows:
70 | 
71 |     .. code-block:: none
72 | 
73 |         <logging_directory>/
74 |             log.txt       <== Output of log
75 |             metrics.json  <== Output of log_metric(s), format: name,score
76 |             params.json   <== Output of log_param(s), format: key,value
77 |             mlflow.json   <== mlflow's run_id, experiment_id and artifact_uri (logged if with_mlflow=True)
78 | 
79 | 
80 |     You can add numpy array and pandas dataframe under the directory through ``log_numpy`` and ``log_dataframe``.
81 | 
82 |     Args:
83 |         logging_directory:
84 |             Path to directory where output is stored.
85 |         custom_logger:
86 |             A custom logger to be used instead of default logger.
87 |         with_mlflow:
88 |             If True, `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`_ is used.
89 |             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
90 |             Note that all output files are located in both ``logging_directory`` and
91 |             mlflow's directory (``mlruns`` by default).
92 |         if_exists:
93 |             How to behave if the logging directory already exists.
94 | 
95 |             - error: Raise a ValueError.
96 |             - replace: Delete logging directory before logging.
97 |             - append: Append to existing experiment.
98 |             - rename: Rename current directory by adding "_1", "_2"... suffix
99 |     Example:
100 |         >>> import numpy as np
101 |         >>> import pandas as pd
102 |         >>> from nyaggle.experiment import Experiment
103 |         >>>
104 |         >>> with Experiment(logging_directory='./output/') as exp:
105 |         >>>     # log key-value pair as a parameter
106 |         >>>     exp.log_param('lr', 0.01)
107 |         >>>     exp.log_param('optimizer', 'adam')
108 |         >>>
109 |         >>>     # log text
110 |         >>>     exp.log('blah blah blah')
111 |         >>>
112 |         >>>     # log metric
113 |         >>>     exp.log_metric('CV', 0.85)
114 |         >>>
115 |         >>>     # log dictionary with flattening keys
116 |         >>>     exp.log_dict('params', {'X': 3, 'Y': {'Z': 'foobar'}})
117 |         >>>
118 |         >>>     # log numpy ndarray, pandas dataframe and any artifacts
119 |         >>>     exp.log_numpy('predicted', np.zeros(1))
120 |         >>>     exp.log_dataframe('submission', pd.DataFrame(), file_format='csv')
121 |         >>>     exp.log_artifact('path-to-your-file')
122 |     """
123 | 
124 |     def __init__(self,
125 |                  logging_directory: str,
126 |                  custom_logger: Optional[Logger] = None,
127 |                  with_mlflow: bool = False,
128 |                  if_exists: str = 'error'
129 |                  ):
130 |         logging_directory = _check_directory(logging_directory, if_exists)
131 |         os.makedirs(logging_directory, exist_ok=True)
132 | 
133 |         self.logging_directory = logging_directory
134 |         self.with_mlflow = with_mlflow
135 | 
136 |         if custom_logger is not None:
137 |             self.logger = custom_logger
138 |             self.is_custom = True
139 |         else:
140 |             self.logger = getLogger(str(uuid.uuid4()))
141 |             self.log_path = os.path.join(logging_directory, 'log.txt')
142 |             self.logger.addHandler(FileHandler(self.log_path))
143 |             self.logger.setLevel(DEBUG)
144 |             self.is_custom = False
145 |         self.metrics = self._load_dict('metrics.json')
146 |         self.params = self._load_dict('params.json')
147 |         self.inherit_existing_run = False
148 | 
149 |         if self.with_mlflow:
150 |             requires_mlflow()
151 |             self.mlflow_run_id = _try_to_get_existing_mlflow_run_id(logging_directory)
152 |             if self.mlflow_run_id is not None:
153 |                 self.mlflow_run_name = None
154 |             else:
155 |                 self.mlflow_run_name = logging_directory
156 | 
157 |     def __enter__(self):
158 |         self.start()
159 |         return self
160 | 
161 |     def __exit__(self, ex_type, ex_value, trace):
162 |         self.stop()
163 | 
164 |     @classmethod
165 |     def continue_from(cls, logging_directory: str, with_mlflow: bool = False):
166 |         return cls(logging_directory=logging_directory, if_exists='append', with_mlflow=with_mlflow)
167 | 
168 |     def start(self):
169 |         """
170 |         Start a new experiment.
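
        A hedged sketch of the manual lifecycle (equivalent to using the ``with`` block shown above):

        >>> exp = Experiment('./manual_output/')
        >>> exp.start()
        >>> exp.log_metric('CV', 0.9)
        >>> exp.stop()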
171 |         """
172 |         if self.with_mlflow:
173 |             import mlflow
174 | 
175 |             if mlflow.active_run() is not None:
176 |                 active_run = mlflow.active_run()
177 |                 self.inherit_existing_run = True
178 |             else:
179 |                 active_run = mlflow.start_run(run_name=self.mlflow_run_name, run_id=self.mlflow_run_id)
180 |             mlflow_metadata = {
181 |                 'artifact_uri': active_run.info.artifact_uri,
182 |                 'experiment_id': active_run.info.experiment_id,
183 |                 'run_id': active_run.info.run_id
184 |             }
185 |             self.mlflow_run_id = active_run.info.run_id
186 |             with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
187 |                 json.dump(mlflow_metadata, f, indent=4)
188 | 
189 |     def _load_dict(self, filename: str) -> Dict:
190 |         try:
191 |             path = os.path.join(self.logging_directory, filename)
192 |             with open(path, 'r') as f:
193 |                 return json.load(f)
194 |         except IOError:
195 |             self.logger.warning('failed to load file: {}'.format(filename))
196 |             return {}
197 | 
198 |     def _save_dict(self, obj: Dict, filename: str):
199 |         try:
200 |             path = os.path.join(self.logging_directory, filename)
201 |             with open(path, 'w') as f:
202 |                 json.dump(obj, f, indent=2)
203 |         except IOError:
204 |             self.logger.warning('failed to save file: {}'.format(filename))
205 | 
206 |     def stop(self):
207 |         """
208 |         Stop current experiment.
209 |         """
210 |         self._save_dict(self.metrics, 'metrics.json')
211 |         self._save_dict(self.params, 'params.json')
212 | 
213 |         if not self.is_custom:
214 |             for h in self.logger.handlers:
215 |                 h.close()
216 | 
217 |         if self.with_mlflow:
218 |             import mlflow
219 |             from mlflow.exceptions import MlflowException
220 | 
221 |             try:
222 |                 mlflow.log_artifact(self.log_path)
223 |                 mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
224 |                 mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))
225 |             except MlflowException as e:
226 |                 warnings.warn('Error in saving artifacts to mlflow. The result may not be saved: {}'.format(e))
227 |             if not self.inherit_existing_run:
228 |                 mlflow.end_run()
229 | 
230 |     def get_logger(self) -> Logger:
231 |         """
232 |         Get logger used in this experiment.
233 | 
234 |         Returns:
235 |             logger object
236 |         """
237 |         return self.logger
238 | 
239 |     def get_run(self):
240 |         """
241 |         Get mlflow's currently active run, or None if ``with_mlflow = False``.
242 | 
243 |         Returns:
244 |             active Run
245 |         """
246 |         if not self.with_mlflow:
247 |             return None
248 | 
249 |         import mlflow
250 |         return mlflow.active_run()
251 | 
252 |     def log(self, text: str):
253 |         """
254 |         Logs a message on the logger for the experiment.
255 | 
256 |         Args:
257 |             text:
258 |                 The message to be written.
259 |         """
260 |         self.logger.info(text)
261 | 
262 |     def log_param(self, key, value):
263 |         """
264 |         Logs a key-value pair for the experiment.
265 | 
266 |         Args:
267 |             key: parameter name
268 |             value: parameter value
269 |         """
270 |         key = _sanitize(key)
271 |         value = _sanitize(value)
272 |         self.params[key] = value
273 | 
274 |         if self.with_mlflow:
275 |             import mlflow
276 |             from mlflow.exceptions import MlflowException
277 | 
278 |             key_mlflow = _sanitize_mlflow_param(key, MLFLOW_KEY_LENGTH_LIMIT)
279 |             value_mlflow = _sanitize_mlflow_param(value, MLFLOW_VALUE_LENGTH_LIMIT)
280 | 
281 |             try:
282 |                 mlflow.log_param(key_mlflow, value_mlflow)
283 |             except MlflowException as e:
284 |                 warnings.warn('Error in logging parameter {} to mlflow. Skipped. {}'.format(key, e))
285 | 
286 |     def log_params(self, params: Dict):
287 |         """
288 |         Logs a batch of params for the experiment.
289 | 
290 |         Args:
291 |             params: dictionary of parameters
292 |         """
293 |         for k, v in params.items():
294 |             self.log_param(k, v)
295 | 
296 |     def log_dict(self, name: str, value: Dict, separator: str = '.'):
297 |         """
298 |         Logs a dictionary as parameters in flattened format.
299 | 
300 |         Args:
301 |             name: Parameter name
302 |             value: Parameter value
303 |             separator: Separating character used to concatenate keys
304 |         Examples:
305 |             >>> with Experiment('./') as e:
306 |             >>>     e.log_dict('a', {'b': 1, 'c': 'd'})
307 |             >>> print(e.params)
308 |             { 'a.b': 1, 'a.c': 'd' }
309 |         """
310 | 
311 |         if value is None:
312 |             self.log_param(name, value)
313 |             return
314 | 
315 |         def _flatten(d: Dict, prefix: str, separator: str) -> Dict:
316 |             items = []
317 |             for k, v in d.items():
318 |                 child_key = prefix + separator + str(k) if prefix else str(k)
319 |                 if isinstance(v, Dict) and v:
320 |                     items.extend(_flatten(v, child_key, separator).items())
321 |                 else:
322 |                     items.append((child_key, v))
323 |             return dict(items)
324 | 
325 |         value = _flatten(value, name, separator)
326 |         self.log_params(value)
327 | 
328 |     def log_metric(self, name: str, score: float):
329 |         """
330 |         Log a metric under the logging directory.
331 | 
332 |         Args:
333 |             name:
334 |                 Metric name.
335 |             score:
336 |                 Metric value.
337 |         """
338 |         name = _sanitize(name)
339 |         score = _sanitize(score)
340 |         self.metrics[name] = score
341 | 
342 |         if self.with_mlflow:
343 |             import mlflow
344 |             from mlflow.exceptions import MlflowException
345 | 
346 |             try:
347 |                 mlflow.log_metric(name, score)
348 |             except MlflowException as e:
349 |                 warnings.warn('Error in logging metric {} to mlflow. Skipped. {}'.format(name, e))
350 | 
351 |     def log_metrics(self, metrics: Dict):
352 |         """
353 |         Log a batch of metrics under the logging directory.
354 | 
355 |         Args:
356 |             metrics: dictionary of metrics.
357 |         """
358 |         for k, v in metrics.items():
359 |             self.log_metric(k, v)
360 | 
361 |     def log_numpy(self, name: str, array: np.ndarray):
362 |         """
363 |         Log a numpy ndarray under the logging directory.
364 | 
365 |         Args:
366 |             name:
367 |                 Name of the file. A .npy extension will be appended to the file name if it does not already have one.
368 |             array:
369 |                 Array data to be saved.
370 |         """
371 |         path = os.path.join(self.logging_directory, name)
372 |         np.save(path, array)
373 | 
374 |         if self.with_mlflow:
375 |             import mlflow
376 |             mlflow.log_artifact(path + '.npy')
377 | 
378 |     def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'):
379 |         """
380 |         Log a pandas dataframe under the logging directory.
381 | 
382 |         Args:
383 |             name:
384 |                 Name of the file. A ``.f`` or ``.csv`` extension will be appended to the file name
385 |                 if it does not already have one.
386 |             df:
387 |                 A dataframe to be saved.
388 |             file_format:
389 |                 The format of the output file. ``csv`` and ``feather`` are supported.
390 |         """
391 |         path = os.path.join(self.logging_directory, name)
392 |         if file_format == 'feather':
393 |             if not path.endswith('.f'):
394 |                 path += '.f'
395 |             df.to_feather(path)
396 |         elif file_format == 'csv':
397 |             if not path.endswith('.csv'):
398 |                 path += '.csv'
399 |             df.to_csv(path, index=False)
400 |         else:
401 |             raise RuntimeError('format not supported')
402 | 
403 |         if self.with_mlflow:
404 |             import mlflow
405 |             mlflow.log_artifact(path)
406 | 
407 |     def log_artifact(self, src_file_path: str):
408 |         """
409 |         Make a copy of the file under the logging directory.
410 | 
411 |         Args:
412 |             src_file_path:
413 |                 Path of the file. If path is not a child of the logging directory, the file will be copied.
414 |                 If ``with_mlflow`` is True, ``mlflow.log_artifact`` will be called (then another copy will be made).
415 |         """
416 |         logging_path = os.path.abspath(self.logging_directory)
417 |         src_file_path = os.path.abspath(src_file_path)
418 | 
419 |         if os.path.commonpath([logging_path]) != os.path.commonpath([logging_path, src_file_path]):
420 |             # the file is outside the logging directory; copy it there (shutil keeps the base name)
421 |             shutil.copy(src_file_path, self.logging_directory)
422 | 
423 |         if self.with_mlflow:
424 |             import mlflow
425 |             mlflow.log_artifact(src_file_path)
426 | 
427 | 
428 | def add_leaderboard_score(logging_directory: str, score: float):
429 |     """
430 |     Record leaderboard score to the existing experiment directory.
431 | 
432 |     Args:
433 |         logging_directory:
434 |             The experiment directory to which the score is added
435 |         score:
436 |             Leaderboard score
437 |     """
438 |     with Experiment.continue_from(logging_directory) as e:
439 |         e.log_metric('LB', score)
440 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/hyperparameter_tuner.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import warnings
3 | from typing import Dict, Iterable, Optional, Union
4 | 
5 | import optuna.integration.lightgbm as optuna_lgb
6 | import pandas as pd
7 | import sklearn.utils.multiclass as multiclass
8 | from sklearn.model_selection import BaseCrossValidator
9 | 
10 | from nyaggle.validation.split import check_cv
11 | 
12 | 
13 | def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
14 |                              cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
15 |                              groups: Optional[pd.Series] = None,
16 |                              time_budget: Optional[int] = None,
17 |                              type_of_target: str = 'auto') -> Dict:
18 |     """
19 |     Search hyperparameters for LightGBM using optuna.
20 | 
21 |     Args:
22 |         base_param:
23 |             Base parameters passed to lgb.train.
24 |         X:
25 |             Training data.
26 |         y:
27 |             Target
28 |         cv:
29 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
30 |         groups:
31 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
32 |         time_budget:
33 |             Time budget for tuning (in seconds).
34 |         type_of_target:
35 |             The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
36 |             Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
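
    Example:
        A hedged sketch on a tiny synthetic regression task (``num_leaves`` is just
        an illustrative seed parameter; actual tuning may take a while):

        >>> import pandas as pd
        >>> X = pd.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> y = pd.Series(range(100), dtype=float)
        >>> best_params = find_best_lgbm_parameter({'num_leaves': 31}, X, y, time_budget=60)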
37 | 
38 |     Returns:
39 |         The best parameters found
40 |     """
41 |     cv = check_cv(cv, y)
42 | 
43 |     if type_of_target == 'auto':
44 |         type_of_target = multiclass.type_of_target(y)
45 | 
46 |     train_index, test_index = next(cv.split(X, y, groups))
47 | 
48 |     dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
49 |     dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])
50 | 
51 |     params = copy.deepcopy(base_param)
52 |     if 'early_stopping_rounds' not in params:
53 |         params['early_stopping_rounds'] = 100
54 | 
55 |     if params.get('feature_pre_filter'):
56 |         warnings.warn("feature_pre_filter will be set to False to tune min_data_in_leaf.")
57 |         params['feature_pre_filter'] = False
58 | 
59 |     if not any([p in params for p in ('num_iterations', 'num_iteration',
60 |                                       'num_trees', 'num_tree',
61 |                                       'num_rounds', 'num_round')]):
62 |         params['num_iterations'] = params.get('n_estimators', 10000)
63 | 
64 |     if 'objective' not in params:
65 |         tot_to_objective = {
66 |             'binary': 'binary',
67 |             'continuous': 'regression',
68 |             'multiclass': 'multiclass'
69 |         }
70 |         params['objective'] = tot_to_objective[type_of_target]
71 | 
72 |     if 'metric' not in params and 'objective' in params:
73 |         if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse', 'l2_root',
74 |                                    'root_mean_squared_error', 'rmse']:
75 |             params['metric'] = 'l2'
76 |         if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
77 |             params['metric'] = 'l1'
78 |         if params['objective'] in ['binary']:
79 |             params['metric'] = 'binary_logloss'
80 |         if params['objective'] in ['multiclass']:
81 |             params['metric'] = 'multi_logloss'
82 | 
83 |     if not any([p in params for p in ('verbose', 'verbosity')]):
84 |         params['verbosity'] = -1
85 | 
86 |     model = optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
87 |                              time_budget=time_budget)
88 | 
89 |     return model.params
90 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import time
4 | from collections import namedtuple
5 | from datetime import datetime
6 | from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
7 | 
8 | import pandas as pd
9 | import sklearn.utils.multiclass as multiclass
10 | from sklearn.base import BaseEstimator
11 | from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
12 | from sklearn.model_selection import BaseCrossValidator
13 | 
14 | from nyaggle.environment import requires_catboost, requires_lightgbm, requires_xgboost
15 | from nyaggle.experiment.auto_prep import autoprep_gbdt
16 | from nyaggle.experiment.experiment import Experiment
17 | from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter
18 | from nyaggle.feature_store import load_features
19 | from nyaggle.util import plot_importance, is_gbdt_instance, make_submission_df
20 | from nyaggle.validation.cross_validate import cross_validate
21 | from nyaggle.validation.split import check_cv
22 | 
23 | ExperimentResult = namedtuple('ExperimentResult',
24 |                               [
25 |                                   'oof_prediction',
26 |                                   'test_prediction',
27 |                                   'metrics',
28 |                                   'models',
29 |                                   'importance',
30 |                                   'time',
31 |                                   'submission_df'
32 |                               ])
33 | 
34 | 
35 | class ExperimentProxy(object):
36 |     __slots__ = ["_obj", "__weakref__"]
37 | 
38 |     def __init__(self, obj):
39 |         object.__setattr__(self, "_obj", obj)
40 | 
41 |     def __getattribute__(self, name):
42 |         return getattr(object.__getattribute__(self, "_obj"), name)
43 | 
44 |     def __setattr__(self, name, value):
45 |         setattr(object.__getattribute__(self, "_obj"), name, value)
46 | 
47 |     def __enter__(self):
48 |         return self
49 | 
50 |     def __exit__(self, ex_type, ex_value, trace):
51 |         pass
52 | 
53 | 
54 | def run_experiment(model_params: Dict[str, Any],
55 |                    X_train: pd.DataFrame, y: pd.Series,
56 |                    X_test: Optional[pd.DataFrame] = None,
57 |                    logging_directory: str = 'output/{time}',
58 |                    if_exists: str = 'error',
59 |                    eval_func: Optional[Callable] = None,
60 |                    algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
61 |                    fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
62 |                    cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
63 |                    groups: Optional[pd.Series] = None,
64 |                    categorical_feature: Optional[List[str]] = None,
65 |                    sample_submission: Optional[pd.DataFrame] = None,
66 |                    submission_filename: Optional[str] = None,
67 |                    type_of_target: str = 'auto',
68 |                    feature_list: Optional[List[Union[int, str]]] = None,
69 |                    feature_directory: Optional[str] = None,
70 |                    inherit_experiment: Optional[Experiment] = None,
71 |                    with_auto_hpo: bool = False,
72 |                    with_auto_prep: bool = False,
73 |                    with_mlflow: bool = False
74 |                    ):
75 |     """
76 |     Evaluate metrics by cross-validation and store the results
77 |     (log, oof prediction, test prediction, feature importance plot and submission file)
78 |     under the directory specified.
79 | 
80 |     One of the following estimators is used (automatically dispatched by ``type_of_target(y)`` and ``algorithm_type``).
81 | 
82 |     * LGBMClassifier / LGBMRegressor
83 |     * CatBoostClassifier / CatBoostRegressor
84 |     * XGBClassifier / XGBRegressor
85 | 
86 | 
87 |     The output files are laid out as follows:
88 | 
89 |     .. code-block:: none
90 | 
91 |         <logging_directory>/
92 |             log.txt             <== Logging file
93 |             importance.png      <== Feature importance plot generated by nyaggle.util.plot_importance
94 |             oof_prediction.npy  <== Out of fold prediction in numpy array format
95 |             test_prediction.npy <== Test prediction in numpy array format
96 |             submission.csv      <== Submission csv file
97 |             metrics.json        <== Metrics
98 |             params.json         <== Parameters
99 |             models/
100 |                 fold1           <== The trained model in fold 1
101 |                 ...
102 | 
103 |     Args:
104 |         model_params:
105 |             Parameters passed to the constructor of the classifier/regressor object (i.e. LGBMRegressor).
106 |         X_train:
107 |             Training data. Categorical feature should be casted to pandas categorical type or encoded to integer.
108 |         y:
109 |             Target
110 |         X_test:
111 |             Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
112 |         logging_directory:
113 |             Path to directory where output of experiment is stored.
114 |         if_exists:
115 |             How to behave if the logging directory already exists.
116 | 
117 |             - error: Raise a ValueError.
118 |             - replace: Delete logging directory before logging.
119 |             - append: Append to existing experiment.
120 |             - rename: Rename current directory by adding "_1", "_2"... suffix
121 |         fit_params:
122 |             Parameters passed to the fit method of the estimator. If a dict is passed, the same parameters
123 |             (except ``eval_set``) are passed for each fold. If a callable is passed,
124 |             the return value of ``fit_params(fold_id, train_index, test_index)`` will be used for each fold.
125 |         eval_func:
126 |             Function used for logging and calculation of returning scores.
127 |             This parameter isn't passed to GBDT, so you should set objective and eval_metric separately if needed.
128 |             If ``eval_func`` is None, ``roc_auc_score`` or ``mean_squared_error`` is used by default.
129 |         algorithm_type:
130 |             Type of gradient boosting library used. "lgbm" (lightgbm), "cat" (catboost) or "xgb" (xgboost).
131 |         cv:
132 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
133 | 
134 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
135 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
136 |             - CV splitter (the instance of ``BaseCrossValidator``),
137 |             - An iterable yielding (train, test) splits as arrays of indices.
138 |         groups:
139 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
140 |         sample_submission:
141 |             A sample dataframe aligned with test data (Usually in Kaggle, it is available as sample_submission.csv).
142 |             The submission file will be created with the same schema as this dataframe.
143 |         submission_filename:
144 |             The name of the submission file created under the logging directory. If ``None``, the basename of the
145 |             logging directory will be used as a filename.
146 |         categorical_feature:
147 |             List of categorical column names. If ``None``, categorical columns are automatically determined by dtype.
148 |         type_of_target:
149 |             The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
150 |             Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
151 |         feature_list:
152 |             The list of feature ids saved through nyaggle.feature_store module.
153 |         feature_directory:
154 |             The location of features stored. Only used if feature_list is not empty.
155 |         inherit_experiment:
156 |             An experiment object which is used to log results. If not ``None``, all logs in this function are treated
157 |             as a part of this experiment.
158 |         with_auto_prep:
159 |             If True, the input datasets will be copied and automatic preprocessing will be performed on them.
160 |             For example, if ``algorithm_type = 'cat'``, all missing values in categorical features will be filled.
161 |         with_auto_hpo:
162 |             If True, model parameters will be automatically updated using optuna (only available in lightgbm).
163 |         with_mlflow:
164 |             If True, `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`_ is used.
165 |             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
166 |             Note that all output files are located in both ``logging_directory`` and
167 |             mlflow's directory (``mlruns`` by default).
168 |     Returns:
169 |         Namedtuple with the following members
170 | 
171 |         * oof_prediction:
172 |             numpy array, shape (len(X_train),) Predicted value on Out-of-Fold validation data.
173 |         * test_prediction:
174 |             numpy array, shape (len(X_test),) Predicted value on test data. ``None`` if X_test is ``None``
175 |         * metrics:
176 |             list of float, shape(nfolds+1) ``metrics[i]`` denotes validation score in i-th fold.
177 |             ``metrics[-1]`` is overall score.
178 |         * models:
179 |             list of objects, shape(nfolds) Trained models for each fold.
180 |         * importance:
181 |             list of pd.DataFrame, feature importance for each fold (type="gain").
182 |         * time:
183 |             Training time in seconds.
184 |         * submission_df:
185 |             The dataframe saved as submission.csv
186 |     """
187 |     start_time = time.time()
188 |     cv = check_cv(cv, y)
189 | 
190 |     if feature_list:
191 |         X = pd.concat([X_train, X_test]) if X_test is not None else X_train
192 |         X.reset_index(drop=True, inplace=True)
193 |         X = load_features(X, feature_list, directory=feature_directory)
194 |         ntrain = len(X_train)
195 |         X_train, X_test = X.iloc[:ntrain, :], X.iloc[ntrain:, :].reset_index(drop=True)
196 | 
197 |     _check_input(X_train, y, X_test)
198 | 
199 |     if categorical_feature is None:
200 |         categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
201 | 
202 |     if type_of_target == 'auto':
203 |         type_of_target = multiclass.type_of_target(y)
204 |     model_type, eval_func, cat_param_name = _dispatch_models(algorithm_type, type_of_target, eval_func)
205 | 
206 |     if with_auto_prep:
207 |         assert algorithm_type in ('cat', 'xgb', 'lgbm'), "with_auto_prep is only supported for gbdt"
208 |         X_train, X_test = autoprep_gbdt(algorithm_type, X_train, X_test, categorical_feature)
209 | 
210 |     logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))
211 | 
212 |     if inherit_experiment is not None:
213 |         experiment = ExperimentProxy(inherit_experiment)
214 |     else:
215 |         experiment = Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow)
216 | 
217 |     with experiment as exp:
218 |         exp.log('Algorithm: {}'.format(algorithm_type))
219 |         exp.log('Experiment: {}'.format(exp.logging_directory))
220 |         exp.log('Params: {}'.format(model_params))
221 |         exp.log('Features: {}'.format(list(X_train.columns)))
222 |         exp.log_param('algorithm_type', algorithm_type)
223 |         exp.log_param('num_features', X_train.shape[1])
224 |         if callable(fit_params):
225 |             exp.log_param('fit_params', str(fit_params))
226 |         else:
227 |             exp.log_dict('fit_params', fit_params)
228 |         exp.log_dict('model_params', model_params)
229 |         if feature_list is not None:
230 |             exp.log_param('features', feature_list)
231 | 
232 |         if with_auto_hpo:
233 |             assert algorithm_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
234 |             model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
235 |                                                     type_of_target=type_of_target)
236 |             exp.log_param('model_params_tuned', model_params)
237 | 
238 |         exp.log('Categorical: {}'.format(categorical_feature))
239 | 
240 |         models = [model_type(**model_params) for _ in range(cv.get_n_splits())]
241 | 
242 |         if fit_params is None:
243 |             fit_params = {}
244 |         if cat_param_name is not None and not callable(fit_params) and cat_param_name not in fit_params:
245 |             fit_params[cat_param_name] = categorical_feature
246 | 
247 |         if isinstance(fit_params, Dict):
248 |             exp.log_params(fit_params)
249 | 
250 |         result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
251 |                                 logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params,
252 |                                 type_of_target=type_of_target)
253 | 
254 |         # save oof
255 |         exp.log_numpy('oof_prediction', result.oof_prediction)
256 |         exp.log_numpy('test_prediction', result.test_prediction)
257 | 
258 |         for i in range(cv.get_n_splits()):
259 |             exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
260 |         exp.log_metric('Overall', result.scores[-1])
261 | 
262 |         # save importance plot
263 |         if result.importance:
264 |             importance = pd.concat(result.importance)
265 |             plot_file_path = os.path.join(exp.logging_directory, 'importance.png')
266 |             plot_importance(importance, plot_file_path)
267 |             exp.log_artifact(plot_file_path)
268 | 
269 |         # save trained model
270 |         for i, model in enumerate(models):
271 |             _save_model(model, exp.logging_directory, i + 1, exp)
272 | 
273 |         # save submission.csv
274 |         submit_df = None
275 |         if X_test is not None:
276 |             submit_df = make_submission_df(result.test_prediction, sample_submission, y)
277 |             exp.log_dataframe(submission_filename or os.path.basename(exp.logging_directory), submit_df, 'csv')
278 | 
279 |         elapsed_time = time.time() - start_time
280 | 
281 |         return ExperimentResult(result.oof_prediction, result.test_prediction,
282 |                                 result.scores, models, result.importance, elapsed_time, submit_df)
283 | 
284 | 
285 | def _dispatch_eval_func(target_type: str, custom_eval: Optional[Callable] = None):
286 |     default_eval_func = {
287 |         'binary': roc_auc_score,
288 |         'multiclass': log_loss,
289 |         'continuous': mean_squared_error
290 |     }
291 |     return custom_eval if custom_eval is not None else default_eval_func[target_type]
292 | 
293 | 
294 | def _dispatch_gbdt_class(algorithm_type: str, type_of_target: str):
295 |     is_regression = type_of_target == 'continuous'
296 | 
297 |     if algorithm_type == 'lgbm':
298 |         requires_lightgbm()
299 |         from lightgbm import LGBMClassifier, LGBMRegressor
300 |         return LGBMRegressor if is_regression else LGBMClassifier
301 |     elif algorithm_type == 'cat':
302 |         requires_catboost()
303 |         from catboost import CatBoostClassifier, CatBoostRegressor
304 |         return CatBoostRegressor if is_regression else CatBoostClassifier
305 |     else:
306 |         requires_xgboost()
307 |         assert algorithm_type == 'xgb'
308 |         from xgboost import XGBClassifier, XGBRegressor
309 |         return XGBRegressor if is_regression else XGBClassifier
310 | 
311 | 
312 | def _dispatch_models(algorithm_type: Union[str, Type[BaseEstimator]],
313 |                      target_type: str, custom_eval: Optional[Callable] = None):
314 |     if not isinstance(algorithm_type, str):
315 |         assert issubclass(algorithm_type, BaseEstimator), "algorithm_type should be str or subclass of BaseEstimator"
316 |         return algorithm_type, _dispatch_eval_func(target_type, custom_eval), None
317 | 
318 |     cat_features = {
319 |         'lgbm': 'categorical_feature',
320 |         'cat': 'cat_features',
321 |         'xgb': None
322 |     }
323 | 
324 |     gbdt_class = _dispatch_gbdt_class(algorithm_type, target_type)
325 |     eval_func = _dispatch_eval_func(target_type, custom_eval)
326 | 
327 |     return gbdt_class, eval_func, cat_features[algorithm_type]
328 | 
329 | 
330 | def _save_model(model: BaseEstimator, logging_directory: str, fold: int, exp: Experiment):
331 |     model_dir = os.path.join(logging_directory, 'models')
332 |     os.makedirs(model_dir, exist_ok=True)
333 |     path = os.path.join(model_dir, 'fold{}'.format(fold))
334 | 
335 |     if is_gbdt_instance(model, 'lgbm'):
336 |         model.booster_.save_model(path)
337 |     elif is_gbdt_instance(model, ('xgb', 'cat')):
338 |         model.save_model(path)
339 |     else:
340 |         with open(path, "wb") as f:
341 |             pickle.dump(model, f)
342 | 
343 |     exp.log_artifact(path)
344 | 
345 | 
346 | def _check_input(X_train: pd.DataFrame, y: pd.Series,
347 |                  X_test: Optional[pd.DataFrame] = None):
348 |     assert len(X_train) == len(y), "lengths of X_train and y are different. len(X_train) = {}, len(y) = {}".format(
349 |         len(X_train), len(y)
350 |     )
351 | 
352 |     if X_test is not None:
353 |         assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"
354 | 
--------------------------------------------------------------------------------
/nyaggle/feature/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyanp/nyaggle/636532292d7ce3468cd47a3337bc50d620f0d23b/nyaggle/feature/__init__.py
--------------------------------------------------------------------------------
/nyaggle/feature/base.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin
2 | 
3 | 
4 | class BaseFeaturizer(BaseEstimator, TransformerMixin):
5 |     pass
6 | 
--------------------------------------------------------------------------------
/nyaggle/feature/category_encoder/__init__.py:
--------------------------------------------------------------------------------
1 | from nyaggle.feature.category_encoder.target_encoder import KFoldEncoderWrapper, TargetEncoder
2 | 
--------------------------------------------------------------------------------
/nyaggle/feature/category_encoder/target_encoder.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Iterable, Union
2 | 
3 | import category_encoders as ce
4 | import numpy as np
5 | import pandas as pd
6 | from category_encoders.utils import convert_input, convert_input_vector
7 | from sklearn.base import BaseEstimator, clone
8 | from sklearn.model_selection import BaseCrossValidator
9 | 
10 | from nyaggle.feature.base import BaseFeaturizer
11 | from nyaggle.validation.split import check_cv
12 | 
13 | 
14 | class KFoldEncoderWrapper(BaseFeaturizer):
15 |     """KFold wrapper for sklearn-like interface
16 | 
17 |     This class wraps sklearn's TransformerMixin (an object that has fit/transform/fit_transform methods)
18 |     and calls it in a K-fold manner.
19 | 
20 |     Args:
21 |         base_transformer:
22 |             Transformer object to be wrapped.
23 |         cv:
24 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
25 | 
26 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
27 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
28 |             - CV splitter (the instance of ``BaseCrossValidator``),
29 |             - An iterable yielding (train, test) splits as arrays of indices.
30 |         groups:
31 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
32 |         return_same_type:
33 |             If True, `transform` and `fit_transform` return the same type as X.
34 |             If False, these APIs always return a numpy array, similar to sklearn's API.
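
    Example:
        A minimal sketch wrapping ``category_encoders.TargetEncoder`` on a toy frame
        (the column name and the 2-fold cv are illustrative choices, not defaults):

        >>> import category_encoders as ce
        >>> import pandas as pd
        >>> X = pd.DataFrame({'city': ['a', 'b', 'a', 'b']})
        >>> y = pd.Series([1, 0, 1, 0])
        >>> enc = KFoldEncoderWrapper(ce.TargetEncoder(cols=['city']), cv=2)
        >>> X_encoded = enc.fit_transform(X, y)  # out-of-fold encoded values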
35 |     """
36 | 
37 |     def __init__(self, base_transformer: BaseEstimator,
38 |                  cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, return_same_type: bool = True,
39 |                  groups: Optional[pd.Series] = None):
40 |         self.cv = cv
41 |         self.base_transformer = base_transformer
42 | 
43 |         self.n_splits = None
44 |         self.transformers = None
45 |         self.return_same_type = return_same_type
46 |         self.groups = groups
47 | 
48 |     def _pre_train(self, y):
49 |         self.cv = check_cv(self.cv, y)
50 |         self.n_splits = self.cv.get_n_splits()
51 |         self.transformers = [clone(self.base_transformer) for _ in range(self.n_splits + 1)]
52 | 
53 |     def _fit_train(self, X: pd.DataFrame, y: Optional[pd.Series], **fit_params) -> pd.DataFrame:
54 |         if y is None:
55 |             X_ = self.transformers[-1].transform(X)
56 |             return self._post_transform(X_)
57 | 
58 |         X_ = X.copy()
59 | 
60 |         for i, (train_index, test_index) in enumerate(self.cv.split(X_, y, self.groups)):
61 |             self.transformers[i].fit(X.iloc[train_index], y.iloc[train_index], **fit_params)
62 |             X_.iloc[test_index, :] = self.transformers[i].transform(X.iloc[test_index])
63 |         self.transformers[-1].fit(X, y, **fit_params)
64 | 
65 |         return X_
66 | 
67 |     def _post_fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
68 |         return X
69 | 
70 |     def _post_transform(self, X: pd.DataFrame) -> pd.DataFrame:
71 |         return X
72 | 
73 |     def fit(self, X: pd.DataFrame, y: pd.Series):
74 |         """
75 |         Fit models for each fold.
76 | 
77 |         Args:
78 |             X:
79 |                 Data
80 |             y:
81 |                 Target
82 |         Returns:
83 |             returns the transformer object.
84 |         """
85 |         self._post_fit(self.fit_transform(X, y), y)
86 |         return self
87 | 
88 |     def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]:
89 |         """
90 |         Transform X
91 | 
92 |         Args:
93 |             X: Data
94 | 
95 |         Returns:
96 |             Transformed version of X. It will be pd.DataFrame if X is `pd.DataFrame` and return_same_type is True.
97 |         """
98 |         is_pandas = isinstance(X, pd.DataFrame)
99 |         X_ = self._fit_train(X, None)
100 |         X_ = self._post_transform(X_)
101 |         return X_ if self.return_same_type and is_pandas else X_.values
102 | 
103 |     def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) \
104 |             -> Union[pd.DataFrame, np.ndarray]:
105 |         """
106 |         Fit models for each fold, then transform X
107 | 
108 |         Args:
109 |             X:
110 |                 Data
111 |             y:
112 |                 Target
113 |             fit_params:
114 |                 Additional parameters passed to models
115 | 
116 |         Returns:
117 |             Transformed version of X. It will be pd.DataFrame if X is `pd.DataFrame` and return_same_type is True.
118 |         """
119 |         assert len(X) == len(y)
120 |         self._pre_train(y)
121 | 
122 |         is_pandas = isinstance(X, pd.DataFrame)
123 |         X = convert_input(X)
124 |         y = convert_input_vector(y, X.index)
125 | 
126 |         if y.isnull().sum() > 0:
127 |             # y == null is regarded as test data
128 |             X_ = X.copy()
129 |             X_.loc[~y.isnull(), :] = self._fit_train(X[~y.isnull()], y[~y.isnull()], **fit_params)
130 |             X_.loc[y.isnull(), :] = self._fit_train(X[y.isnull()], None, **fit_params)
131 |         else:
132 |             X_ = self._fit_train(X, y, **fit_params)
133 | 
134 |         X_ = self._post_transform(self._post_fit(X_, y))
135 | 
136 |         return X_ if self.return_same_type and is_pandas else X_.values
137 | 
138 | 
139 | class TargetEncoder(KFoldEncoderWrapper):
140 |     """Target Encoder
141 | 
142 |     KFold version of category_encoders.TargetEncoder in
143 |     https://contrib.scikit-learn.org/categorical-encoding/targetencoder.html.
144 | 
145 |     Args:
146 |         cv:
147 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
148 | 
149 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
150 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
151 |             - CV splitter (the instance of ``BaseCrossValidator``),
152 |             - An iterable yielding (train, test) splits as arrays of indices.
153 |         groups:
154 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
155 |         cols:
156 |             A list of columns to encode, if None, all string columns will be encoded.
157 |         drop_invariant:
158 |             Boolean for whether or not to drop columns with 0 variance.
159 |         handle_missing:
160 |             Options are ‘error’, ‘return_nan’ and ‘value’, defaults to ‘value’, which returns the target mean.
161 |         handle_unknown:
162 |             Options are ‘error’, ‘return_nan’ and ‘value’, defaults to ‘value’, which returns the target mean.
163 |         min_samples_leaf:
164 |             Minimum samples to take category average into account.
165 |         smoothing:
166 |             Smoothing effect to balance categorical average vs prior. Higher value means stronger regularization.
167 |             The value must be strictly bigger than 0.
168 |         return_same_type:
169 |             If True, ``transform`` and ``fit_transform`` return the same type as X.
170 |             If False, these APIs always return a numpy array, similar to sklearn's API.
171 |     """
172 | 
173 |     def __init__(self, cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
174 |                  groups: Optional[pd.Series] = None,
175 |                  cols: List[str] = None,
176 |                  drop_invariant: bool = False, handle_missing: str = 'value', handle_unknown: str = 'value',
177 |                  min_samples_leaf: int = 20, smoothing: float = 10.0, return_same_type: bool = True):
178 |         e = ce.TargetEncoder(cols=cols, drop_invariant=drop_invariant, return_df=True,
179 |                              handle_missing=handle_missing,
180 |                              handle_unknown=handle_unknown,
181 |                              min_samples_leaf=min_samples_leaf, smoothing=smoothing)
182 | 
183 |         super().__init__(e, cv, return_same_type, groups)
184 | 
185 |     def _post_transform(self, X: pd.DataFrame) -> pd.DataFrame:
186 |         cols = self.transformers[0].cols
187 |         for c in cols:
188 |             X[c] = X[c].astype(float)
189 |         return X
190 | 
--------------------------------------------------------------------------------
/nyaggle/feature/groupby.py:
--------------------------------------------------------------------------------
1 | # Modified work:
2 | # -----------------------------------------------------------------------------
3 | # Copyright (c) 2020 Kota Yuhara (@wakamezake)
4 | # -----------------------------------------------------------------------------
5 | 
6 | # Original work of aggregation:
7 | # https://github.com/pfnet-research/xfeat/blob/master/xfeat/helper.py
8 | # -----------------------------------------------------------------------------
9 | # MIT License
10 | #
11 | # Copyright (c) 2020 Preferred Networks, Inc.
12 | #
13 | # Permission is hereby granted, free of charge, to any person obtaining a copy
14 | # of this software and associated documentation files (the "Software"), to deal
15 | # in the Software without restriction, including without limitation the rights
16 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 | # copies of the Software, and to permit persons to whom the Software is
18 | # furnished to do so, subject to the following conditions:
19 | #
20 | # The above copyright notice and this permission notice shall be included in all
21 | # copies or substantial portions of the Software.
22 | #
23 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 | # SOFTWARE.
30 | # -----------------------------------------------------------------------------
31 | 
32 | from inspect import isroutine
33 | from types import FunctionType, LambdaType
34 | from typing import Callable, List, Tuple, Union
35 | 
36 | import pandas as pd
37 | from pandas.core.common import get_callable_name
38 | 
39 | 
40 | def _is_lambda_function(obj):
41 |     """
42 |     Example:
43 |         >>> import numpy as np
44 |         >>> def custom_function(x): return np.sum(x)
45 |         >>> _is_lambda_function(lambda x: np.sum(x))
46 |         True
47 |         >>> _is_lambda_function(np.sum)
48 |         False
49 |         >>> _is_lambda_function(custom_function)
50 |         False
51 |     """
52 |     # It's worth noting that types.LambdaType is an alias for types.FunctionType
53 |     return isinstance(obj, LambdaType) and obj.__name__ == "<lambda>"
54 | 
55 | 
56 | def aggregation(
57 |         input_df: pd.DataFrame,
58 |         group_key: str,
59 |         group_values: List[str],
60 |         agg_methods: List[Union[str, FunctionType]],
61 | ) -> Tuple[pd.DataFrame, List[str]]:
62 |     """
63 |     Aggregate values after grouping table rows by a given key.
64 | 
65 |     Args:
66 |         input_df:
67 |             Input data frame.
68 |         group_key:
69 |             Used to determine the groups for the groupby.
70 |         group_values:
71 |             Used to aggregate values for the groupby.
72 |         agg_methods:
73 |             List of functions or function names, e.g. ['mean', 'max', 'min', numpy.mean].
74 |             Do not use a lambda function, because its name attribute (always ``"<lambda>"``)
75 |             cannot generate a unique string for the new column names.
76 |     Returns:
77 |         Tuple of output dataframe and new column names.
78 |     """
79 |     new_df = input_df.copy()
80 | 
81 |     new_cols = []
82 |     for agg_method in agg_methods:
83 |         if _is_lambda_function(agg_method):
84 |             raise ValueError('Not supported lambda function.')
85 |         elif isinstance(agg_method, str):
86 |             pass
87 |         elif isinstance(agg_method, FunctionType):
88 |             pass
89 |         elif isroutine(agg_method):
90 |             pass
91 |         else:
92 |             raise ValueError('Supported types are: {} or {}.'
93 |                              ' Got {} instead.'.format(str, Callable, type(agg_method)))
94 | 
95 |     for agg_method in agg_methods:
96 |         for col in group_values:
97 |             # only str or FunctionType
98 |             if isinstance(agg_method, str):
99 |                 agg_method_name = agg_method
100 |             else:
101 |                 agg_method_name = get_callable_name(agg_method)
102 |             new_col = "agg_{}_{}_by_{}".format(agg_method_name, col, group_key)
103 | 
104 |             df_agg = (
105 |                 input_df[[col] + [group_key]].groupby(group_key)[[col]].agg(
106 |                     agg_method)
107 |             )
108 |             df_agg.columns = [new_col]
109 |             new_cols.append(new_col)
110 |             new_df = new_df.merge(
111 |                 df_agg, how="left", right_index=True, left_on=group_key
112 |             )
113 | 
114 |     return new_df, new_cols
115 | 
--------------------------------------------------------------------------------
/nyaggle/feature/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from nyaggle.feature.nlp.bert import BertSentenceVectorizer
2 | 
--------------------------------------------------------------------------------
/nyaggle/feature/nlp/bert.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, List, Optional, Union
2 | import transformers
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | from category_encoders.utils import convert_input
7 | from sklearn.decomposition import TruncatedSVD
8 | from tqdm import tqdm
9 | 
10 | from nyaggle.environment import requires_torch
11 | from nyaggle.feature.base import BaseFeaturizer
12 | 
13 | 
14 | class BertSentenceVectorizer(BaseFeaturizer):
15 |     """Sentence Vectorizer using BERT pretrained model.
16 | 
17 |     Extracts a fixed-length feature vector from each English/Japanese variable-length sentence using BERT.
18 | 
19 |     Args:
20 |         lang:
21 |             Type of language. If set to "jp", Japanese BERT model is used (you need to install MeCab).
22 |         n_components:
23 |             Number of components in SVD. If `None`, SVD is not applied.
24 |         text_columns:
25 |             List of processing columns. If `None`, all object columns are regarded as text columns.
26 |         pooling_strategy:
27 |             The pooling algorithm for generating a fixed-length encoding vector. 'reduce_mean' and 'reduce_max' use
28 |             average pooling and max pooling respectively to reduce the vector from (num-words, emb-dim) to (emb_dim).
29 |             'reduce_mean_max' performs 'reduce_mean' and 'reduce_max' separately and concatenates them.
30 |             'cls_token' takes the first element (i.e. [CLS]).
31 |         use_cuda:
32 |             If `True`, inference is performed on GPU.
33 |         tokenizer:
34 |             The custom tokenizer used instead of default tokenizer
35 |         model:
36 |             The custom pretrained model used instead of default BERT model
37 |         return_same_type:
38 |             If True, `transform` and `fit_transform` return the same type as X.
39 |             If False, these APIs always return a numpy array, similar to sklearn's API.
40 |         column_format:
41 |             Name of transformed columns (used if returning type is pd.DataFrame)
42 |     """
43 | 
44 |     def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
45 |                  text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
46 |                  use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
47 |                  model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
48 |         if tokenizer is not None:
49 |             assert model is not None
50 |             self.tokenizer = tokenizer
51 |             self.model = model
52 |         elif lang == 'en':  # skipped when a custom tokenizer/model is given
53 |             pretrained_model_name = 'bert-base-uncased'
54 |             self.tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_model_name)
55 |             self.model = transformers.BertModel.from_pretrained(pretrained_model_name)
56 |         elif lang == 'jp':
57 |             pretrained_model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
58 |             self.tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(pretrained_model_name)
59 |             self.model = transformers.BertModel.from_pretrained(pretrained_model_name)
60 |         else:
61 |             raise ValueError('Specified language type ({}) is invalid.'.format(lang))
62 | 
63 |         self.lang = lang
64 |         self.n_components = n_components
65 |         self.text_columns = text_columns
66 |         self.pooling_strategy = pooling_strategy
67 |         self.use_cuda = use_cuda
68 |         self.return_same_type = return_same_type
69 |         self.svd = {}
70 |         self.column_format = column_format
71 | 
72 |     def _process_text(self, text: str) -> np.ndarray:
73 |         requires_torch()
74 |         import torch
75 | 
76 |         tokens_tensor = torch.tensor([self.tokenizer.encode(text, add_special_tokens=True)])
77 |         if self.use_cuda:
78 |             tokens_tensor = tokens_tensor.to('cuda')
79 |             self.model.to('cuda')
80 | 
81 |         self.model.eval()
82 |         with torch.no_grad():
83 |             outputs = self.model(tokens_tensor)
84 | 
85 |         embedding = outputs.last_hidden_state.cpu().numpy()[0]
86 |         if self.pooling_strategy == 'reduce_mean':
87 |             return np.mean(embedding, axis=0)
88 |         elif self.pooling_strategy == 'reduce_max':
89 |             return np.max(embedding, axis=0)
90 |         elif self.pooling_strategy == 'reduce_mean_max':
91 |             return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
92 |         elif self.pooling_strategy == 'cls_token':
93 |             return embedding[0]
94 |         else:
95 |             raise ValueError("specify valid pooling_strategy: {reduce_mean, reduce_max, reduce_mean_max, cls_token}")
96 | 
97 |     def _fit_one(self, col: str, emb: np.ndarray):
98 |         if not self.n_components or self.n_components >= emb.shape[1]:
99 |             return emb
100 |         self.svd[col] = TruncatedSVD(n_components=self.n_components, algorithm='arpack', random_state=0)
101 |         return self.svd[col].fit(emb)
102 | 
103 |     def _transform_one(self, col: str, emb: np.ndarray):
104 |         if not self.n_components or self.n_components >= emb.shape[1]:
105 |             return emb
106 |         return self.svd[col].transform(emb)
107 | 
108 |     def _fit_transform_one(self, col: str, emb: np.ndarray):
109 |         if not self.n_components or self.n_components >= emb.shape[1]:
110 |             return emb
111 |         self.svd[col] = TruncatedSVD(n_components=self.n_components, algorithm='arpack', random_state=0)
112 |         return self.svd[col].fit_transform(emb)
113 | 
114 |     def _process(self, X: pd.DataFrame, func: Callable[[str, np.ndarray], Any]):
115 |         is_pandas = isinstance(X, pd.DataFrame)
116 |         X = convert_input(X)
117 | 
118 |         tqdm.pandas()
119 |         columns = self.text_columns or [c for c in X.columns if X[c].dtype == object]
120 |         non_text_columns = [c for c in X.columns if c not in columns]
121 | 
122 |         column_names = []
123 |         processed = []
| for c in columns: 125 | emb = np.vstack(X[c].progress_apply(lambda x: self._process_text(x))) 126 | emb = func(c, emb) 127 | processed.append(emb) 128 | column_names += [self.column_format.format(col=c, idx=i) for i in range(emb.shape[1])] 129 | 130 | processed_df = pd.DataFrame(np.hstack(processed), columns=column_names) 131 | 132 | if non_text_columns: 133 | X_ = X[non_text_columns].copy() 134 | X_ = pd.concat([X_, processed_df], axis=1) 135 | else: 136 | X_ = processed_df 137 | 138 | return X_ if self.return_same_type and is_pandas else X_.values 139 | 140 | def fit(self, X: Union[pd.DataFrame, np.ndarray], y=None): 141 | """ 142 | Fit SVD model on training data X. 143 | 144 | Args: 145 | X: 146 | Data 147 | y: 148 | Ignored 149 | """ 150 | self._process(X, self._fit_one) 151 | return self 152 | 153 | def transform(self, X: Union[pd.DataFrame, np.ndarray], y=None): 154 | """ 155 | Perform feature extraction and dimensionality reduction using 156 | BERT pre-trained model and trained SVD model. 157 | 158 | Args: 159 | X: 160 | Data 161 | y: 162 | Ignored 163 | """ 164 | return self._process(X, self._transform_one) 165 | 166 | def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y=None, **fit_params): 167 | """ 168 | Fit SVD model on training data X and perform feature extraction and dimensionality reduction using 169 | BERT pre-trained model and trained SVD model. 170 | 171 | Args: 172 | X: 173 | Data 174 | y: 175 | Ignored 176 | """ 177 | return self._process(X, self._fit_transform_one) 178 | -------------------------------------------------------------------------------- /nyaggle/feature_store/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.feature_store.feature_store import cached_feature, save_feature, load_feature, load_features 2 | -------------------------------------------------------------------------------- /nyaggle/feature_store/feature_store.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import warnings 4 | from typing import List, Optional, Union 5 | 6 | import pandas as pd 7 | import pyarrow 8 | from tqdm import tqdm 9 | 10 | 11 | def validate_train_test_difference(train: pd.Series, test: pd.Series): 12 | # % of nulls 13 | if test.isnull().mean() == 1.0: 14 | raise RuntimeError('Error in feature {}: all values in test data are null'.format(train.name)) 15 | 16 | 17 | def validate_feature(df: pd.DataFrame, y: pd.Series): 18 | if len(y) < len(df): 19 | # assuming that the first part of the dataframe is the train part 20 | train = df.iloc[:len(y), :] 21 | test = df.iloc[len(y):, :] 22 | else: 23 | train = df[~y.isnull()] 24 | test = df[y.isnull()] 25 | 26 | for c in df.columns: 27 | validate_train_test_difference(train[c], test[c]) 28 | 29 | 30 | def save_feature(df: pd.DataFrame, feature_name: Union[int, str], directory: str = './features/', 31 | with_csv_dump: bool = False, create_directory: bool = True, 32 | reference_target_variable: Optional[pd.Series] = None, overwrite: bool = False): 33 | """ 34 | Save a pandas DataFrame in feather format 35 | 36 | Args: 37 | df: 38 | The dataframe to be saved. 39 | feature_name: 40 | The name of the feature. The output file will be ``{feature_name}.f``. 41 | directory: 42 | The directory where the feature will be stored. 43 | with_csv_dump: 44 | If True, the first 1000 rows are also dumped to a csv file for debugging. 45 | create_directory: 46 | If True, create the directory if it does not exist. 
47 | reference_target_variable: 48 | If not None, the feature is validated against this target variable before it is saved. 49 | overwrite: 50 | If False and the file already exists, a RuntimeError will be raised. 51 | """ 52 | if create_directory: 53 | os.makedirs(directory, exist_ok=True) 54 | 55 | if reference_target_variable is not None: 56 | validate_feature(df, reference_target_variable) 57 | 58 | path = os.path.join(directory, str(feature_name) + '.f') 59 | 60 | if not overwrite and os.path.exists(path): 61 | raise RuntimeError('File already exists') 62 | 63 | df.to_feather(path) 64 | 65 | if with_csv_dump: 66 | df.head(1000).to_csv(os.path.join(directory, str(feature_name) + '.csv'), index=False) 67 | 68 | 69 | def load_feature(feature_name: Union[int, str], directory: str = './features/', 70 | ignore_columns: List[str] = None) -> pd.DataFrame: 71 | """ 72 | Load feature as pandas DataFrame. 73 | 74 | Args: 75 | feature_name: 76 | The name of the feature (used in ``save_feature``). 77 | directory: 78 | The directory where the feature is stored. 79 | ignore_columns: 80 | The list of columns that will be dropped from the loaded dataframe. 81 | Returns: 82 | The feature dataframe 83 | """ 84 | path = os.path.join(directory, str(feature_name) + '.f') 85 | 86 | df = pd.read_feather(path) 87 | if ignore_columns: 88 | return df.drop([c for c in ignore_columns if c in df.columns], axis=1) 89 | else: 90 | return df 91 | 92 | 93 | def load_features(base_df: Optional[pd.DataFrame], 94 | feature_names: List[Union[int, str]], directory: str = './features/', 95 | ignore_columns: List[str] = None, create_directory: bool = True, 96 | rename_duplicate: bool = True) -> pd.DataFrame: 97 | """ 98 | Load features and return the concatenated dataframe 99 | 100 | Args: 101 | base_df: 102 | The base dataframe. If not None, the resulting dataframe will consist of the base and loaded feature columns. 103 | feature_names: 104 | The list of feature names to be loaded. 105 | directory: 106 | The directory where the features are stored. 107 | ignore_columns: 108 | The list of columns that will be dropped from the loaded dataframes. 109 | create_directory: 110 | If True, create the directory if it does not exist. 111 | rename_duplicate: 112 | If True, a duplicated column name will be renamed automatically (the feature name is used as a suffix). 113 | If False, duplicated columns are kept as-is. 114 | Returns: 115 | The merged dataframe 116 | """ 117 | if create_directory: 118 | os.makedirs(directory, exist_ok=True) 119 | 120 | dfs = [load_feature(f, directory=directory, ignore_columns=ignore_columns) for f in tqdm(feature_names)] 121 | 122 | if base_df is None: 123 | base_df = dfs[0] 124 | dfs = dfs[1:] 125 | feature_names = feature_names[1:] 126 | 127 | columns = list(base_df.columns) 128 | 129 | for df, feature_name in zip(dfs, feature_names): 130 | if len(df) != len(base_df): 131 | raise RuntimeError('DataFrame lengths are different. 
feature={}'.format(feature_name)) 132 | 133 | for c in df.columns: 134 | if c in columns: 135 | warnings.warn('A feature name {} is duplicated.'.format(c)) 136 | 137 | if rename_duplicate: 138 | while c in columns: 139 | c += '_' + str(feature_name) 140 | warnings.warn('The duplicated name in feature={} will be renamed to {}'.format(feature_name, c)) 141 | columns.append(c) 142 | 143 | concatenated = pd.concat([base_df] + dfs, axis=1) 144 | concatenated.columns = columns 145 | return concatenated 146 | 147 | 148 | def cached_feature(feature_name: Union[int, str], directory: str = './features/', ignore_columns: List[str] = None): 149 | """ 150 | Decorator to wrap a function which returns pd.DataFrame with a memorizing callable that saves dataframe using 151 | ``feature_store.save_feature``. 152 | 153 | Args: 154 | feature_name: 155 | The name of the feature (used in ``save_feature``). 156 | directory: 157 | The directory where the feature is stored. 158 | ignore_columns: 159 | The list of columns that will be dropped from the loaded dataframe. 160 | 161 | Example: 162 | >>> from nyaggle.feature_store import cached_feature 163 | >>> 164 | >>> @cached_feature('x') 165 | >>> def make_feature_x(param) -> pd.DataFrame: 166 | >>> print('called') 167 | >>> ... 168 | >>> return df 169 | >>> 170 | >>> x = make_feature_x(...) # if x.f does not exist, call the function and save result to x.f 171 | "called" 172 | >>> x = make_feature_x(...) # load from file in the second time 173 | """ 174 | 175 | def _decorator(fun): 176 | @functools.wraps(fun) 177 | def _decorated_fun(*args, **kwargs): 178 | try: 179 | return load_feature(feature_name, directory, ignore_columns) 180 | except (pyarrow.ArrowIOError, IOError): 181 | df = fun(*args, **kwargs) 182 | assert isinstance(df, pd.DataFrame), "returning value of @cached_feature should be pd.DataFrame" 183 | save_feature(df, feature_name, directory) 184 | return df 185 | 186 | return _decorated_fun 187 | 188 | return _decorator 189 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.hyper_parameters.parameters import get_hyperparam_byname, list_hyperparams 2 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/catboost.py: -------------------------------------------------------------------------------- 1 | parameters = [ 2 | { 3 | "name": "ieee-2019-17th", 4 | "url": "https://nbviewer.jupyter.org/github/tmheo/IEEE-Fraud-Detection-17th-Place-Solution/blob/master/notebook/IEEE-17th-Place-Solution-CatBoost-Ensemble.ipynb", 5 | "competition": "ieee-fraud-detection", 6 | "rank": 17, 7 | "metric": "auc", 8 | "parameters": { 9 | 'learning_rate': 0.07, 10 | 'eval_metric': 'AUC', 11 | 'loss_function': 'Logloss', 12 | 'metric_period': 500, 13 | 'od_wait': 500, 14 | 'depth': 8, 15 | } 16 | }, 17 | { 18 | "name": "elo-2018-11th", 19 | "url": "https://github.com/kangzhang0709/2019-kaggle-elo-top-11-solution", 20 | "competition": "elo-merchant-category-recommendation", 21 | "rank": 11, 22 | "metric": "rmse", 23 | "parameters": { 24 | 'learning_rate': 0.01, 25 | 'max_depth': 8, 26 | 'bagging_temperature': 0.8, 27 | 'l2_leaf_reg': 45, 28 | 'od_type': 'Iter' 29 | } 30 | }, 31 | { 32 | "name": "plasticc-2018-3rd", 33 | "url": "https://github.com/takashioya/plasticc/blob/master/scripts/train.py", 34 | "competition": "PLAsTiCC-2018", 35 | "rank": 3, 36 | 
"metric": "multi-class log-loss", 37 | "parameters": { 38 | 'learning_rate': 0.1, 39 | 'depth': 3, 40 | 'loss_function': 'MultiClass', 41 | 'colsample_bylevel': 0.7, 42 | } 43 | }, 44 | ] 45 | 46 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/parameters.py: -------------------------------------------------------------------------------- 1 | from more_itertools import first_true 2 | from typing import Dict, List, Union 3 | 4 | from nyaggle.hyper_parameters.catboost import parameters as params_cat 5 | from nyaggle.hyper_parameters.lightgbm import parameters as params_lgb 6 | from nyaggle.hyper_parameters.xgboost import parameters as params_xgb 7 | 8 | 9 | def _get_hyperparam_byname(param_table: List[Dict], name: str, with_metadata: bool): 10 | found = first_true(param_table, pred=lambda x: x['name'] == name) 11 | if found is None: 12 | raise RuntimeError('Hyperparameter {} not found.'.format(name)) 13 | 14 | if with_metadata: 15 | return found 16 | else: 17 | return found['parameters'] 18 | 19 | 20 | def _return(parameter: Union[List[Dict], Dict], with_metadata: bool) -> Union[List[Dict], Dict]: 21 | if with_metadata: 22 | return parameter 23 | 24 | if isinstance(parameter, list): 25 | return [p['parameters'] for p in parameter] 26 | else: 27 | return parameter['parameters'] 28 | 29 | 30 | def _get_table(gbdt_type: str = 'lgbm'): 31 | if gbdt_type == 'lgbm': 32 | return params_lgb 33 | elif gbdt_type == 'cat': 34 | return params_cat 35 | elif gbdt_type == 'xgb': 36 | return params_xgb 37 | raise ValueError('gbdt type should be one of (lgbm, cat, xgb)') 38 | 39 | 40 | def list_hyperparams(gbdt_type: str = 'lgbm', with_metadata: bool = False) -> List[Dict]: 41 | """ 42 | List all hyperparameters 43 | 44 | Args: 45 | gbdt_type: 46 | The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used. 47 | with_metadata: 48 | When set to True, parameters are wrapped by metadata dictionary which contains information about 49 | source URL, competition name etc. 50 | Returns: 51 | A list of hyper-parameters used in Kaggle gold medal solutions 52 | """ 53 | return _return(_get_table(gbdt_type), with_metadata) 54 | 55 | 56 | def get_hyperparam_byname(name: str, gbdt_type: str = 'lgbm', with_metadata: bool = False) -> Dict: 57 | """ 58 | Get a hyperparameter by parameter name 59 | 60 | Args: 61 | name: 62 | The name of parameter (e.g. "ieee-2019-10th"). 63 | gbdt_type: 64 | The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used. 65 | with_metadata: 66 | When set to True, parameters are wrapped by metadata dictionary which contains information about 67 | source URL, competition name etc. 68 | Returns: 69 | A hyperparameter dictionary. 
70 | """ 71 | param_table = _get_table(gbdt_type) 72 | found = first_true(param_table, pred=lambda x: x['name'] == name) 73 | if found is None: 74 | raise RuntimeError('Hyperparameter {} not found.'.format(name)) 75 | 76 | return _return(found, with_metadata) 77 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/xgboost.py: -------------------------------------------------------------------------------- 1 | parameters = [ 2 | { 3 | "name": "ieee-2019-1st", 4 | "url": "https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600", 5 | "competition": "ieee-fraud-detection", 6 | "rank": 1, 7 | "metric": "auc", 8 | "parameters": { 9 | "max_depth": 12, 10 | "learning_rate": 0.02, 11 | "subsample": 0.8, 12 | "colsample_bytree": 0.4, 13 | "missing": -1, 14 | "eval_metric": "auc", 15 | "tree_method": "hist" 16 | } 17 | }, 18 | { 19 | "name": "womens-ml-competition-2019-1st", 20 | "url": "https://github.com/salmatfq/KaggleMarchMadnessFirstPlace/blob/master/win_ncaa_men.R", 21 | "competition": "womens-machine-learning-competition-2019", 22 | "rank": 1, 23 | "metric": "log-loss", 24 | "parameters": { 25 | "eval_metric": "mae", 26 | "booster": "gbtree", 27 | "eta": 0.02, 28 | "subsample": 0.35, 29 | "colsample_bytree": 0.7, 30 | "num_parallel_tree": 10, 31 | "min_child_weight": 40, 32 | "gamma": 10, 33 | "max_depth": 3 34 | } 35 | }, 36 | 37 | # 2018, Santander Value Prediction Challenge 38 | { 39 | "name": "santander-2018-5th", 40 | "url": "https://github.com/vlarine/kaggle/blob/master/santander-value-prediction-challenge/santander.py", 41 | "competition": "santander-value-prediction-challenge", 42 | "rank": 5, 43 | "metric": "rmsle", 44 | "parameters": { 45 | 'colsample_bytree': 0.055, 46 | 'colsample_bylevel': 0.4, 47 | 'gamma': 1.5, 48 | 'learning_rate': 0.01, 49 | 'max_depth': 5, 50 | 'objective': 'reg:linear', 51 | 'booster': 'gbtree', 52 | 'min_child_weight': 10, 53 | 'reg_alpha': 0, 54 | 'reg_lambda': 0, 55 | 'eval_metric': 'rmse', 56 | 'subsample': 0.7, 57 | } 58 | }, 59 | 60 | # 2018, Elo Merchant Category Recommendation 61 | { 62 | "name": "elo-2018-11th", 63 | "url": "https://github.com/kangzhang0709/2019-kaggle-elo-top-11-solution/blob/master/Models/model_xgb.ipynb", 64 | "competition": "elo-merchant-category-recommendation", 65 | "rank": 11, 66 | "metric": "rmse", 67 | "parameters": { 68 | 'objective': 'reg:linear', 69 | 'booster': 'gbtree', 70 | 'learning_rate': 0.01, 71 | 'max_depth': 10, 72 | 'gamma': 1.45, 73 | 'alpha': 0.1, 74 | 'lambda': 0.3, 75 | 'subsample': 0.9, 76 | 'colsample_bytree': 0.054, 77 | 'colsample_bylevel': 0.50 78 | } 79 | }, 80 | 81 | # 2018, DonorsChoose.org Application Screening 82 | { 83 | "name": "donorschoose-2018-1st", 84 | "url": "https://www.kaggle.com/shadowwarrior/1st-place-solution/notebook", 85 | "competition": "donorschoose-application-screening", 86 | "rank": 1, 87 | "metric": "auc", 88 | "parameters": { 89 | 'objective': 'binary:logistic', 90 | 'eval_metric': 'auc', 91 | 'eta': 0.01, 92 | 'max_depth': 7, 93 | 'subsample': 0.8, 94 | 'colsample_bytree': 0.4, 95 | 'min_child_weight': 10, 96 | 'gamma': 2 97 | } 98 | }, 99 | 100 | # 2018, Recruit Restaurant Visitor Forecasting 101 | 102 | # 2017, Instacart Market Basket Analysis 103 | { 104 | "name": "instacart-2017-2nd", 105 | "url": "https://github.com/KazukiOnodera/Instacart/blob/master/py_model/002_xgb_holdout_item_812_1.py", 106 | "competition": "instacart-market-basket-analysis", 107 | "rank": 2, 108 | "metric": "", 109 | "parameters": { 110 | 
'max_depth': 10, 111 | 'eta': 0.02, 112 | 'colsample_bytree': 0.4, 113 | 'subsample': 0.75, 114 | 'eval_metric': 'logloss', 115 | 'objective': 'binary:logistic', 116 | 'tree_method': 'hist' 117 | } 118 | }, 119 | 120 | # 2017, Two Sigma Connect; Rental Listing Inquiries 121 | { 122 | "name": "two-sigma-2017-1st", 123 | "url": "https://github.com/plantsgo/Rental-Listing-Inquiries/blob/master/xgb.py", 124 | "competition": "two-sigma-connect-rental-listing-inquiries", 125 | "rank": 1, 126 | "metric": "multi-class log-loss", 127 | "parameters": { 128 | 'booster': 'gbtree', 129 | 'objective': 'multi:softprob', 130 | 'eval_metric': 'mlogloss', 131 | 'gamma': 1, 132 | 'min_child_weight': 1.5, 133 | 'max_depth': 5, 134 | 'lambda': 10, 135 | 'subsample': 0.7, 136 | 'colsample_bytree': 0.7, 137 | 'colsample_bylevel': 0.7, 138 | 'eta': 0.03, 139 | 'tree_method': 'exact' 140 | } 141 | }, 142 | 143 | # 2016, Santander Product Recommendation 144 | { 145 | "name": "santander-2016-2nd", 146 | "url": "https://github.com/ttvand/Santander-Product-Recommendation/blob/master/First%20level%20learners/xgboost.R", 147 | "competition": "santander-product-recommendation", 148 | "rank": 2, 149 | "metric": "map7", 150 | "parameters": { 151 | "etaC": 10, 152 | "subsample": 1, 153 | "colsample_bytree": 0.5, 154 | "max_depth": 8, 155 | "min_child_weight": 0, 156 | "gamma": 0.1 157 | } 158 | }, 159 | 160 | # 2016, TalkingData Mobile User Demographics 161 | { 162 | "name": "talkingdata-2016-3rd-1", 163 | "url": "https://github.com/chechir/talking_data/blob/master/danijel/xgb/xgb_cv5_train_events.R", 164 | "competition": "talkingdata-mobile-user-demographics", 165 | "rank": 3, 166 | "metric": "multi-class log-loss", 167 | "parameters": { 168 | "booster": 'gbtree', 169 | "objective": 'reg:logistic', 170 | "eval_metric": 'logloss', 171 | "learning_rate": 0.025, 172 | "max_depth": 6, 173 | "subsample": 0.8, 174 | "colsample_bytree": 0.5, 175 | "colsample_bylevel": 0.5 176 | } 177 | }, 178 | { 179 | "name": "talkingdata-2016-3rd-2", 180 | "url": "https://github.com/chechir/talking_data/blob/master/danijel/xgb/xgb_cv5_train_noevents.R", 181 | "competition": "talkingdata-mobile-user-demographics", 182 | "rank": 3, 183 | "metric": "multi-class log-loss", 184 | "parameters": { 185 | "booster": 'gbtree', 186 | "objective": 'reg:logistic', 187 | "eval_metric": 'logloss', 188 | "learning_rate": 0.05, 189 | "max_depth": 2, 190 | "colsample_bytree": 0.8, 191 | "colsample_bylevel": 0.8 192 | } 193 | }, 194 | 195 | # 2016, Allstate Claims Severity 196 | { 197 | "name": "allstate-2016-3rd", 198 | "url": "https://www.kaggle.com/c/allstate-claims-severity/discussion/26447#150319", 199 | "competition": "allstate-claims-severity", 200 | "rank": 3, 201 | "metric": "mae", 202 | "parameters": { 203 | 'colsample_bytree': 0.4, 204 | 'subsample': 0.975, 205 | 'learning_rate': 0.015, 206 | 'gamma': 1.5, 207 | 'lambda': 2, 208 | 'alpha': 2, 209 | 'max_depth': 25, 210 | 'num_parallel_tree': 1, 211 | 'min_child_weight': 50, 212 | 'eval_metric': 'mae', 213 | 'max_delta_step': 0, 214 | } 215 | }, 216 | 217 | # 2016, Bosch Production Line Performance 218 | { 219 | "name": "bosch-2016-1st", 220 | "url": "https://www.kaggle.com/c/bosch-production-line-performance/discussion/25434#144628", 221 | "competition": "bosch-production-line-performance", 222 | "rank": 1, 223 | "metric": "mcc", 224 | "parameters": { 225 | "eval_metric": "auc", 226 | "alpha": 0, 227 | "booster": "gbtree", 228 | "colsample_bytree": 0.6, 229 | "minchildweight": 5, 230 | "subsample": 
0.9, 231 | "eta": 0.03, 232 | "objective": "binary:logistic", 233 | "max_depth": 14, 234 | "lambda": 4 235 | } 236 | }, 237 | ] 238 | -------------------------------------------------------------------------------- /nyaggle/testing/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.testing.util import * 2 | -------------------------------------------------------------------------------- /nyaggle/testing/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import uuid 5 | from contextlib import contextmanager 6 | from typing import Tuple 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.datasets import make_classification, make_regression 11 | 12 | 13 | def make_classification_df(n_samples: int = 1024, 14 | n_num_features: int = 20, 15 | n_cat_features: int = 0, 16 | class_sep: float = 1.0, 17 | n_classes: int = 2, 18 | feature_name: str = 'col_{}', 19 | target_name: str = 'target', 20 | random_state: int = 0, 21 | id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]: 22 | np.random.seed(random_state) 23 | X, y = make_classification(n_samples=n_samples, n_features=n_num_features, class_sep=class_sep, 24 | random_state=random_state, n_classes=n_classes, n_informative=max(n_classes, 2)) 25 | 26 | X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)]) 27 | y = pd.Series(y, name=target_name) 28 | 29 | if id_column is not None: 30 | X[id_column] = range(n_samples) 31 | 32 | for i in range(n_cat_features): 33 | X['cat_{}'.format(i)] = \ 34 | pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category') 35 | 36 | return X, y 37 | 38 | 39 | def make_regression_df(n_samples: int = 1024, 40 | n_num_features: int = 20, 41 | n_cat_features: int = 0, 42 | feature_name: str = 'col_{}', 43 | target_name: str = 'target', 44 | random_state: int = 0, 45 | id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]: 46 | np.random.seed(random_state) 47 | X, y = make_regression(n_samples=n_samples, n_features=n_num_features, 48 | random_state=random_state) 49 | 50 | X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)]) 51 | y = pd.Series(y, name=target_name) 52 | 53 | if id_column is not None: 54 | X[id_column] = range(n_samples) 55 | 56 | for i in range(n_cat_features): 57 | X['cat_{}'.format(i)] = \ 58 | pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype(str).astype('category') 59 | 60 | return X, y 61 | 62 | 63 | 64 | 65 | @contextmanager 66 | def get_temp_directory() -> str: 67 | path = None 68 | try: 69 | path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex) 70 | yield path 71 | finally: 72 | if path: 73 | shutil.rmtree(path, ignore_errors=True) 74 | 75 | -------------------------------------------------------------------------------- /nyaggle/util/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.util.plot_importance import plot_importance 2 | from nyaggle.util.traits import is_instance, is_gbdt_instance 3 | from nyaggle.util.submission import make_submission_df 4 | -------------------------------------------------------------------------------- /nyaggle/util/plot_importance.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | import 
seaborn as sns 6 | 7 | 8 | def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n: int = 100, 9 | figsize: Optional[Tuple[int, int]] = None, 10 | title: Optional[str] = None): 11 | """ 12 | Plot feature importance and optionally write it to an image file 13 | 14 | Args: 15 | importance: 16 | The dataframe which has "feature" and "importance" columns 17 | path: 18 | The file path where the image will be saved (the plot is not saved if None) 19 | top_n: 20 | The number of features to be visualized 21 | figsize: 22 | The size of the figure 23 | title: 24 | The title of the plot 25 | Example: 26 | >>> import pandas as pd 27 | >>> import lightgbm as lgb 28 | >>> from nyaggle.util import plot_importance 29 | >>> from sklearn.datasets import make_classification 30 | 31 | >>> X, y = make_classification() 32 | >>> X = pd.DataFrame(X, columns=['col{}'.format(i) for i in range(X.shape[1])]) 33 | >>> booster = lgb.train({'objective': 'binary'}, lgb.Dataset(X, y)) 34 | >>> importance = pd.DataFrame({ 35 | >>> 'feature': X.columns, 36 | >>> 'importance': booster.feature_importance('gain') 37 | >>> }) 38 | >>> plot_importance(importance, 'importance.png') 39 | """ 40 | importance = importance.groupby('feature')['importance'] \ 41 | .mean() \ 42 | .reset_index() \ 43 | .sort_values(by='importance', ascending=False) 44 | 45 | if len(importance) > top_n: 46 | importance = importance.iloc[:top_n, :] 47 | 48 | if figsize is None: 49 | figsize = (10, 16) 50 | 51 | if title is None: 52 | title = 'Feature Importance' 53 | 54 | plt.figure(figsize=figsize) 55 | sns.barplot(x="importance", y="feature", data=importance) 56 | plt.title(title) 57 | plt.tight_layout() 58 | if path is not None: 59 | plt.savefig(path) 60 | -------------------------------------------------------------------------------- /nyaggle/util/submission.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def make_submission_df(test_prediction: np.ndarray, sample_submission: Optional[pd.DataFrame] = None, 8 | y: Optional[pd.Series] = None) -> pd.DataFrame: 9 | """ 10 | Make a dataframe formatted in Kaggle submission style. 11 | 12 | Args: 13 | test_prediction: 14 | A test prediction to be formatted. 15 | sample_submission: 16 | A sample dataframe aligned with the test data (in Kaggle it is usually provided as sample_submission.csv). 17 | The submission file will be created with the same schema as this dataframe. 18 | y: 19 | The target variable, used for inferring the column names. Ignored if ``sample_submission`` is passed. 
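Example: A minimal sketch of the no-``sample_submission`` path; with a 1-D prediction the id column falls back to ``'id'`` and the target column name is taken from ``y.name``: >>> import numpy as np >>> import pandas as pd >>> make_submission_df(np.array([0.2, 0.8]), y=pd.Series([0, 1, 0], name='target')) id target 0 0 0.2 1 1 0.8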
20 | Returns: 21 | The formatted dataframe 22 | """ 23 | if sample_submission is not None: 24 | submit_df = sample_submission.copy() 25 | 26 | if test_prediction.ndim > 1 and test_prediction.shape[1] > 1: 27 | n_id_cols = submit_df.shape[1] - test_prediction.shape[1] 28 | for i in range(test_prediction.shape[1]): 29 | submit_df.iloc[:, n_id_cols + i] = test_prediction[:, i] 30 | else: 31 | submit_df.iloc[:, -1] = test_prediction 32 | else: 33 | submit_df = pd.DataFrame() 34 | id_col_name = y.index.name if y is not None and y.index.name else 'id' 35 | 36 | submit_df[id_col_name] = np.arange(len(test_prediction)) 37 | 38 | if test_prediction.ndim > 1 and test_prediction.shape[1] > 1: 39 | tgt_col_names = sorted(y.unique()) if y is not None else [str(i) for i in range(test_prediction.shape[1])] 40 | for i, y in enumerate(tgt_col_names): 41 | submit_df[y] = test_prediction[:, i] 42 | else: 43 | tgt_col_name = y.name if y is not None and y.name else 'target' 44 | submit_df[tgt_col_name] = test_prediction 45 | 46 | return submit_df 47 | -------------------------------------------------------------------------------- /nyaggle/util/traits.py: -------------------------------------------------------------------------------- 1 | # Original work of safe_instance: 2 | # https://github.com/slundberg/shap/blob/master/shap/common.py 3 | # ----------------------------------------------------------------------------- 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2018 Scott Lundberg 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | # ----------------------------------------------------------------------------- 26 | 27 | import importlib 28 | from typing import List, Tuple, Union 29 | 30 | 31 | def is_instance(obj, class_path_str: Union[str, List, Tuple]) -> bool: 32 | """ 33 | Acts as a safe version of isinstance without having to explicitly 34 | import packages which may not exist in the users environment. 35 | Checks if obj is an instance of type specified by class_path_str. 
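This is what lets nyaggle check objects against optional backends without importing them eagerly: when the module in ``class_path_str`` cannot be found, the check returns False instead of raising, so e.g. ``is_instance(obj, 'lightgbm.sklearn.LGBMModel')`` is safe even if lightgbm is not installed (``is_gbdt_instance`` below relies on exactly this behavior).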
36 | Parameters 37 | ---------- 38 | obj: Any 39 | Some object you want to test against 40 | class_path_str: str or list 41 | A string or list of strings specifying full class paths 42 | Example: `sklearn.ensemble.RandomForestRegressor` 43 | Returns 44 | ------- 45 | bool: True if isinstance is true and the package exists, False otherwise 46 | """ 47 | if isinstance(class_path_str, str): 48 | class_path_strs = [class_path_str] 49 | elif isinstance(class_path_str, list) or isinstance(class_path_str, tuple): 50 | class_path_strs = class_path_str 51 | else: 52 | class_path_strs = [''] 53 | 54 | # try each module path in order 55 | for class_path_str in class_path_strs: 56 | if "." not in class_path_str: 57 | raise ValueError("class_path_str must be a string or list of strings specifying a full \ 58 | module path to a class. Eg, 'sklearn.ensemble.RandomForestRegressor'") 59 | 60 | # Splits on last occurrence of "." 61 | module_name, class_name = class_path_str.rsplit(".", 1) 62 | 63 | # Check module exists 64 | try: 65 | spec = importlib.util.find_spec(module_name) 66 | except Exception:  # find_spec itself may raise for malformed or partially-missing module paths 67 | spec = None 68 | if spec is None: 69 | continue 70 | 71 | module = importlib.import_module(module_name) 72 | 73 | # Get class 74 | _class = getattr(module, class_name, None) 75 | if _class is None: 76 | continue 77 | 78 | if isinstance(obj, _class): 79 | return True 80 | 81 | return False 82 | 83 | 84 | def is_gbdt_instance(obj, algorithm_type: Union[str, Tuple]) -> bool: 85 | if isinstance(algorithm_type, str): 86 | algorithm_type = (algorithm_type,) 87 | 88 | gbdt_instance_name = { 89 | 'lgbm': 'lightgbm.sklearn.LGBMModel', 90 | 'xgb': 'xgboost.sklearn.XGBModel', 91 | 'cat': 'catboost.core.CatBoost' 92 | } 93 | 94 | return is_instance(obj, tuple(gbdt_instance_name[t] for t in algorithm_type)) 95 | -------------------------------------------------------------------------------- /nyaggle/validation/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.validation.cross_validate import cross_validate 2 | from nyaggle.validation.adversarial_validate import adversarial_validate 3 | from nyaggle.validation.split import \ 4 | check_cv, TimeSeriesSplit, SlidingWindowSplit, Take, Nth, Skip, StratifiedGroupKFold 5 | -------------------------------------------------------------------------------- /nyaggle/validation/adversarial_validate.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from typing import Iterable, List, Optional, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import BaseEstimator 7 | from sklearn.metrics import roc_auc_score 8 | from sklearn.model_selection import KFold, BaseCrossValidator 9 | 10 | from nyaggle.environment import requires_lightgbm 11 | from nyaggle.util import is_instance 12 | from nyaggle.validation.cross_validate import cross_validate 13 | from nyaggle.validation.split import Take 14 | 15 | ADVResult = namedtuple('ADVResult', ['auc', 'importance']) 16 | 17 | 18 | def adversarial_validate(X_train: pd.DataFrame, 19 | X_test: pd.DataFrame, 20 | importance_type: str = 'gain', 21 | estimator: Optional[BaseEstimator] = None, 22 | categorical_feature: List[str] = None, 23 | cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None) -> ADVResult: 24 | """ 25 | Perform adversarial validation between X_train and X_test. 
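The two frames are concatenated and a classifier is trained to separate them (train rows are labeled 1 and test rows 0 in the implementation below). An AUC close to 0.5 means train and test are nearly indistinguishable, while an AUC close to 1.0 indicates a distribution shift; the returned importance then ranks the features driving that shift.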
26 | 27 | Args: 28 | X_train: 29 | Training data 30 | X_test: 31 | Test data 32 | importance_type: 33 | The type of feature importance calculated. 34 | estimator: 35 | The custom estimator. If None, LGBMClassifier is automatically used. 36 | Only LGBMModel or CatBoost instances are supported. 37 | categorical_feature: 38 | List of categorical column names. If ``None``, categorical columns are automatically determined by dtype. 39 | cv: 40 | Cross validation split. If ``None``, the first fold out of 5 folds is used as validation. 41 | Returns: 42 | Namedtuple with following members 43 | 44 | * auc: 45 | float, ROC AUC score of adversarial validation. 46 | * importance: 47 | pandas DataFrame, feature importance of adversarial model (ordered by importance) 48 | 49 | Example: 50 | >>> from sklearn.model_selection import train_test_split 51 | >>> from nyaggle.testing import make_regression_df 52 | >>> from nyaggle.validation import adversarial_validate 53 | 54 | >>> X, y = make_regression_df(n_samples=8) 55 | >>> X_train, X_test, y_train, y_test = train_test_split(X, y) 56 | >>> auc, importance = adversarial_validate(X_train, X_test) 57 | >>> 58 | >>> print(auc) 59 | 0.51078231 60 | >>> importance.head() 61 | feature importance 62 | col_1 231.5827204 63 | col_5 207.1837266 64 | col_7 188.6920685 65 | col_4 174.5668498 66 | col_9 170.6438643 67 | """ 68 | concat = pd.concat([X_train, X_test]).copy().reset_index(drop=True) 69 | y = np.array([1] * len(X_train) + [0] * len(X_test)) 70 | 71 | if estimator is None: 72 | requires_lightgbm() 73 | from lightgbm import LGBMClassifier 74 | estimator = LGBMClassifier(n_estimators=10000, objective='binary', importance_type=importance_type, 75 | random_state=0) 76 | else: 77 | assert is_instance(estimator, ('lightgbm.sklearn.LGBMModel', 'catboost.core.CatBoost')), \ 78 | 'Only CatBoostClassifier or LGBMClassifier is allowed' 79 | 80 | if cv is None: 81 | cv = Take(1, KFold(5, shuffle=True, random_state=0)) 82 | 83 | fit_params = {'verbose': -1} 84 | if categorical_feature: 85 | fit_params['categorical_feature'] = categorical_feature 86 | 87 | result = cross_validate(estimator, concat, y, None, cv=cv, 88 | eval_func=roc_auc_score, fit_params=fit_params, importance_type=importance_type) 89 | 90 | importance = pd.concat(result.importance) 91 | importance = importance.groupby('feature')['importance'].mean().reset_index() 92 | importance.sort_values(by='importance', ascending=False, inplace=True) 93 | importance.reset_index(drop=True, inplace=True) 94 | 95 | return ADVResult(result.scores[-1], importance) 96 | -------------------------------------------------------------------------------- /nyaggle/validation/cross_validate.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | import warnings 4 | from collections import namedtuple 5 | from logging import Logger, getLogger 6 | from typing import Any, Callable, Dict, Iterable, List, Optional, Union 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import sklearn.utils.multiclass as multiclass 11 | from category_encoders.utils import convert_input, convert_input_vector 12 | from sklearn.base import BaseEstimator 13 | from sklearn.model_selection import BaseCrossValidator 14 | 15 | from nyaggle.util.traits import is_gbdt_instance 16 | from nyaggle.validation.split import check_cv 17 | 18 | CVResult = namedtuple('CVResult', ['oof_prediction', 'test_prediction', 'scores', 'importance']) 19 | 20 | 21 | def cross_validate(estimator: Union[BaseEstimator, 
List[BaseEstimator]], 22 | X_train: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], 23 | X_test: Union[pd.DataFrame, np.ndarray] = None, 24 | cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, 25 | groups: Optional[pd.Series] = None, 26 | eval_func: Optional[Callable] = None, logger: Optional[Logger] = None, 27 | on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None, 28 | fit_params: Optional[Union[Dict[str, Any], Callable]] = None, 29 | importance_type: str = 'gain', 30 | early_stopping: bool = True, 31 | type_of_target: str = 'auto') -> CVResult: 32 | """ 33 | Evaluate metrics by cross-validation. It also records out-of-fold prediction and test prediction. 34 | 35 | Args: 36 | estimator: 37 | The object to be used in cross-validation. For list inputs, ``estimator[i]`` is trained on i-th fold. 38 | X_train: 39 | Training data 40 | y: 41 | Target 42 | X_test: 43 | Test data (Optional). If specified, prediction on the test data is performed using ensemble of models. 44 | cv: 45 | int, cross-validation generator or an iterable which determines the cross-validation splitting strategy. 46 | 47 | - None, to use the default ``KFold(5, random_state=0, shuffle=True)``, 48 | - integer, to specify the number of folds in a ``(Stratified)KFold``, 49 | - CV splitter (the instance of ``BaseCrossValidator``), 50 | - An iterable yielding (train, test) splits as arrays of indices. 51 | groups: 52 | Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``). 53 | eval_func: 54 | Function used for logging and returning scores 55 | logger: 56 | logger 57 | on_each_fold: 58 | called for each fold with (idx_fold, model, X_fold, y_fold) 59 | fit_params: 60 | Parameters passed to the fit method of the estimator 61 | importance_type: 62 | The type of feature importance to be used to calculate result. 63 | Used only in ``LGBMClassifier`` and ``LGBMRegressor``. 64 | early_stopping: 65 | If ``True``, ``eval_set`` will be added to ``fit_params`` for each fold. 66 | ``early_stopping_rounds = 100`` will also be appended to fit_params if it does not already have one. 67 | type_of_target: 68 | The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``. 69 | Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported. 70 | Returns: 71 | Namedtuple with following members 72 | 73 | * oof_prediction (numpy array, shape (len(X_train),)): 74 | The predicted value on out-of-fold validation data. 75 | * test_prediction (numpy array, shape (len(X_test),)): 76 | The predicted value on test data. ``None`` if X_test is ``None``. 77 | * scores (list of float, shape (nfolds+1,)): 78 | ``scores[i]`` denotes validation score in i-th fold. 79 | ``scores[-1]`` is the overall score. ``None`` if ``eval_func`` is not specified. 80 | * importance (list of pandas DataFrame, shape (nfolds,)): 81 | ``importance[i]`` denotes feature importance in i-th fold model. 82 | If the estimator is not GBDT, an empty array is returned. 
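Note: ``fit_params`` may also be a callable. As the fold loop below shows, it is then invoked per fold as ``fit_params(fold_index, train_indices, valid_indices)`` and must return the dict of parameters for that fold's ``fit`` call.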
83 | 84 | Example: 85 | >>> from sklearn.datasets import make_regression 86 | >>> from sklearn.linear_model import Ridge 87 | >>> from sklearn.metrics import mean_squared_error 88 | >>> from nyaggle.validation import cross_validate 89 | 90 | >>> X, y = make_regression(n_samples=8) 91 | >>> model = Ridge(alpha=1.0) 92 | >>> pred_oof, pred_test, scores, _ = \ 93 | >>> cross_validate(model, 94 | >>> X_train=X[:3, :], 95 | >>> y=y[:3], 96 | >>> X_test=X[3:, :], 97 | >>> cv=3, 98 | >>> eval_func=mean_squared_error) 99 | >>> print(pred_oof) 100 | [-101.1123267 , 26.79300693, 17.72635528] 101 | >>> print(pred_test) 102 | [-10.65095894 -12.18909059 -23.09906427 -17.68360714 -20.08218267] 103 | >>> print(scores) 104 | [71912.80290003832, 15236.680239881942, 15472.822033121925, 34207.43505768073] 105 | """ 106 | cv = check_cv(cv, y) 107 | n_output_cols = 1 108 | if type_of_target == 'auto': 109 | type_of_target = multiclass.type_of_target(y) 110 | if type_of_target == 'multiclass': 111 | n_output_cols = y.nunique(dropna=True) 112 | 113 | if isinstance(estimator, list): 114 | assert len(estimator) == cv.get_n_splits(), "Number of estimators should be same to nfolds." 115 | 116 | X_train = convert_input(X_train) 117 | y = convert_input_vector(y, X_train.index) 118 | if X_test is not None: 119 | X_test = convert_input(X_test) 120 | 121 | if not isinstance(estimator, list): 122 | estimator = [estimator] * cv.get_n_splits() 123 | 124 | assert len(estimator) == cv.get_n_splits() 125 | 126 | if logger is None: 127 | logger = getLogger(__name__) 128 | 129 | def _predict(model: BaseEstimator, x: pd.DataFrame, _type_of_target: str): 130 | if _type_of_target in ('binary', 'multiclass'): 131 | if hasattr(model, "predict_proba"): 132 | proba = model.predict_proba(x) 133 | elif hasattr(model, "decision_function"): 134 | warnings.warn('Since {} does not have predict_proba method, ' 135 | 'decision_function is used for the prediction instead.'.format(type(model))) 136 | proba = model.decision_function(x) 137 | else: 138 | raise RuntimeError('Estimator in classification problem should have ' 139 | 'either predict_proba or decision_function') 140 | if proba.ndim == 1: 141 | return proba 142 | else: 143 | return proba[:, 1] if proba.shape[1] == 2 else proba 144 | else: 145 | return model.predict(x) 146 | 147 | oof = np.zeros((len(X_train), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_train)) 148 | evaluated = np.full(len(X_train), False) 149 | test = None 150 | if X_test is not None: 151 | test = np.zeros((len(X_test), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_test)) 152 | 153 | scores = [] 154 | eta_all = [] 155 | importance = [] 156 | 157 | for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y, groups)): 158 | start_time = time.time() 159 | 160 | train_x, train_y = X_train.iloc[train_idx], y.iloc[train_idx] 161 | valid_x, valid_y = X_train.iloc[valid_idx], y.iloc[valid_idx] 162 | 163 | if fit_params is None: 164 | fit_params_fold = {} 165 | elif callable(fit_params): 166 | fit_params_fold = fit_params(n, train_idx, valid_idx) 167 | else: 168 | fit_params_fold = copy.copy(fit_params) 169 | 170 | if is_gbdt_instance(estimator[n], ('lgbm', 'cat', 'xgb')): 171 | if early_stopping: 172 | if 'eval_set' not in fit_params_fold: 173 | fit_params_fold['eval_set'] = [(valid_x, valid_y)] 174 | if 'early_stopping_rounds' not in fit_params_fold: 175 | fit_params_fold['early_stopping_rounds'] = 100 176 | 177 | estimator[n].fit(train_x, train_y, **fit_params_fold) 178 | else: 179 | 
estimator[n].fit(train_x, train_y, **fit_params_fold) 180 | 181 | oof[valid_idx] = _predict(estimator[n], valid_x, type_of_target) 182 | evaluated[valid_idx] = True 183 | 184 | if X_test is not None: 185 | test += _predict(estimator[n], X_test, type_of_target) 186 | 187 | if on_each_fold is not None: 188 | on_each_fold(n, estimator[n], train_x, train_y) 189 | 190 | if is_gbdt_instance(estimator[n], ('lgbm', 'cat', 'xgb')): 191 | importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type)) 192 | 193 | if eval_func is not None: 194 | score = eval_func(valid_y, oof[valid_idx]) 195 | scores.append(score) 196 | logger.info('Fold {} score: {}'.format(n, score)) 197 | 198 | elapsed = time.time() - start_time 199 | eta_all.append(elapsed) 200 | logger.debug('{:.3f} sec / fold'.format(elapsed)) 201 | 202 | if eval_func is not None: 203 | score = eval_func(y.loc[evaluated], oof[evaluated]) 204 | scores.append(score) 205 | logger.info('Overall score: {}'.format(score)) 206 | 207 | if X_test is not None: 208 | predicted = test / cv.get_n_splits(X_train, y, groups) 209 | else: 210 | predicted = None 211 | 212 | return CVResult(oof, predicted, scores, importance) 213 | 214 | 215 | def _get_gbdt_importance(gbdt_model: BaseEstimator, features: List[str], 216 | importance_type: str) -> pd.DataFrame: 217 | df = pd.DataFrame() 218 | 219 | df['feature'] = features 220 | 221 | if is_gbdt_instance(gbdt_model, 'cat'): 222 | df['importance'] = gbdt_model.get_feature_importance() 223 | elif is_gbdt_instance(gbdt_model, 'xgb'): 224 | df['importance'] = gbdt_model.feature_importances_ 225 | elif is_gbdt_instance(gbdt_model, 'lgbm'): 226 | df['importance'] = gbdt_model.booster_.feature_importance(importance_type=importance_type) 227 | 228 | return df 229 | -------------------------------------------------------------------------------- /nyaggle/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.6' 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | torch 2 | mlflow 3 | catboost 4 | lightgbm<4.0.0 5 | xgboost 6 | mecab-python3>=1.0.0 7 | flake8 8 | pytest 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders 2 | matplotlib 3 | more-itertools 4 | numpy 5 | optuna>=1.0.0 6 | pandas 7 | pyarrow 8 | seaborn 9 | scikit-learn 10 | tqdm 11 | transformers[ja] 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from codecs import open 2 | from os import path 3 | 4 | from setuptools import find_packages, setup 5 | 6 | 7 | def get_long_description(): 8 | here = path.abspath(path.dirname(__file__)) 9 | 10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | return long_description 13 | 14 | 15 | def get_version(): 16 | version_filepath = path.join(path.dirname(__file__), 'nyaggle', 'version.py') 17 | with open(version_filepath) as f: 18 | for line in f: 19 | if line.startswith('__version__'): 20 | return line.strip().split()[-1][1:-1] 21 | 22 | 23 | setup( 24 | name='nyaggle', 25 | packages=find_packages(), 26 | 27 | version=get_version(), 28 | 29 | license='MIT', 30 
| 31 | install_requires=[ 32 | 'category_encoders', 33 | 'matplotlib', 34 | 'more-itertools', 35 | 'numpy', 36 | 'optuna>=1.0.0', 37 | 'pandas', 38 | 'pyarrow', 39 | 'seaborn', 40 | 'scikit-learn', 41 | 'tqdm', 42 | 'transformers>=2.3.0', 43 | ], 44 | 45 | extras_require={ 46 | 'all': ['catboost>=0.17', 'lightgbm', 'xgboost', 'torch', 'mlflow'] 47 | }, 48 | 49 | author='nyanp', 50 | author_email='Noumi.Taiga@gmail.com', 51 | url='https://github.com/nyanp/nyaggle', 52 | description='Code for Kaggle and Offline Competitions.', 53 | long_description=get_long_description(), 54 | long_description_content_type='text/markdown', 55 | keywords='nyaggle kaggle', 56 | classifiers=[ 57 | 'License :: OSI Approved :: MIT License', 58 | 'Programming Language :: Python :: 3.8', 59 | 'Programming Language :: Python :: 3.9', 60 | 'Programming Language :: Python :: 3.10', 61 | 'Programming Language :: Python :: 3.11' 62 | ] 63 | ) 64 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyanp/nyaggle/636532292d7ce3468cd47a3337bc50d620f0d23b/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import uuid 5 | 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope='function', autouse=True) 10 | def tmpdir_name(): 11 | path = None 12 | try: 13 | path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex) 14 | yield path 15 | finally: 16 | if path: 17 | shutil.rmtree(path, ignore_errors=True) 18 | -------------------------------------------------------------------------------- /tests/ensemble/test_averaging.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as stats 2 | from numpy.testing import assert_array_almost_equal 3 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 4 | from sklearn.linear_model import Ridge, LogisticRegression 5 | from sklearn.metrics import roc_auc_score, mean_squared_error 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.svm import SVC, SVR 8 | from sklearn.utils.multiclass import type_of_target 9 | 10 | from nyaggle.ensemble import averaging, averaging_opt 11 | from nyaggle.testing import make_classification_df, make_regression_df 12 | from nyaggle.validation import cross_validate 13 | 14 | 15 | def _make_1st_stage_preds(X, y, X_test): 16 | if type_of_target(y) == 'continuous': 17 | models = [ 18 | SVR(), 19 | Ridge(random_state=0), 20 | RandomForestRegressor(n_estimators=30, random_state=0) 21 | ] 22 | else: 23 | models = [ 24 | SVC(random_state=0), 25 | LogisticRegression(random_state=0), 26 | RandomForestClassifier(n_estimators=30, random_state=0) 27 | ] 28 | 29 | results = [cross_validate(m, X, y, X_test, cv=5) for m in models] 30 | 31 | return [r.oof_prediction for r in results], [r.test_prediction for r in results] 32 | 33 | 34 | def test_averaging(): 35 | X, y = make_classification_df() 36 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 37 | 38 | _, test = _make_1st_stage_preds(X_train, y_train, X_test) 39 | 40 | result = averaging(test) 41 | 42 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 43 | assert result.score is None 44 | assert 
result.oof_prediction is None 45 | 46 | 47 | def test_averaging_with_oof(): 48 | X, y = make_classification_df() 49 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 50 | 51 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 52 | 53 | result = averaging(test, oof, y_train) 54 | 55 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 56 | assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction) 57 | assert result.score is None 58 | 59 | 60 | def test_averaging_regression(): 61 | X, y = make_regression_df() 62 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 63 | 64 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 65 | 66 | result = averaging(test, oof, y_train) 67 | 68 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 69 | assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction) 70 | assert result.score is None 71 | 72 | 73 | def test_averaging_multiclass(): 74 | X, y = make_classification_df(n_classes=5) 75 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 76 | 77 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 78 | 79 | result = averaging(test, oof, y_train) 80 | 81 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 82 | assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction) 83 | assert result.score is None 84 | 85 | 86 | def test_averaging_with_metrics(): 87 | X, y = make_classification_df() 88 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 89 | 90 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 91 | 92 | result = averaging(test, oof, y_train, eval_func=roc_auc_score) 93 | 94 | assert result.score == roc_auc_score(y_train, result.oof_prediction) 95 | 96 | 97 | def test_weight_averaging(): 98 | X, y = make_classification_df() 99 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 100 | 101 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 102 | 103 | result = averaging(test, oof, y_train, weights=[0.2, 0.4, 0.3]) 104 | 105 | assert_array_almost_equal(0.2 * test[0] + 0.4 * test[1] + 0.3 * test[2], result.test_prediction) 106 | assert_array_almost_equal(0.2 * oof[0] + 0.4 * oof[1] + 0.3 * oof[2], result.oof_prediction) 107 | assert result.score is None 108 | 109 | 110 | def test_rank_averaging(): 111 | X, y = make_classification_df(n_samples=1024) 112 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 113 | 114 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 115 | 116 | result = averaging(test, rank_averaging=True) 117 | 118 | test_rank = [stats.rankdata(t) / len(X_test) for t in test] 119 | 120 | assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction) 121 | assert result.score is None 122 | 123 | 124 | def test_rank_averaging_with_oof(): 125 | X, y = make_classification_df(n_samples=1024) 126 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 127 | 128 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 129 | 130 | result = averaging(test, oof, y_train, rank_averaging=True) 131 | 132 | oof_rank = [stats.rankdata(o) / len(X_train) for o in oof] 133 | test_rank = [stats.rankdata(t) / len(X_test) for t in test] 134 | 135 | assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, 
result.test_prediction) 136 | assert_array_almost_equal((oof_rank[0] + oof_rank[1] + oof_rank[2]) / 3, result.oof_prediction) 137 | assert result.score is None 138 | 139 | 140 | def test_averaging_opt_maximize(): 141 | X, y = make_classification_df(n_samples=1024) 142 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 143 | 144 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 145 | 146 | best_single_model = max(roc_auc_score(y_train, oof[0]), 147 | roc_auc_score(y_train, oof[1]), 148 | roc_auc_score(y_train, oof[2])) 149 | 150 | result = averaging_opt(test, oof, y_train, roc_auc_score, higher_is_better=True) 151 | 152 | assert result.score >= best_single_model 153 | 154 | result_simple_avg = averaging(test, oof, y_train, eval_func=roc_auc_score) 155 | 156 | assert result.score >= result_simple_avg.score 157 | 158 | 159 | def test_averaging_opt_minimize(): 160 | X, y = make_regression_df(n_samples=1024) 161 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 162 | 163 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 164 | 165 | best_single_model = min(mean_squared_error(y_train, oof[0]), 166 | mean_squared_error(y_train, oof[1]), 167 | mean_squared_error(y_train, oof[2])) 168 | 169 | result = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False) 170 | 171 | assert result.score <= best_single_model 172 | 173 | result_simple_avg = averaging(test, oof, y_train, eval_func=mean_squared_error) 174 | 175 | assert result.score <= result_simple_avg.score 176 | 177 | 178 | def test_averaging_opt_minimize_with_method(): 179 | X, y = make_regression_df(n_samples=1024) 180 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 181 | 182 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 183 | 184 | best_single_model = min(mean_squared_error(y_train, oof[0]), 185 | mean_squared_error(y_train, oof[1]), 186 | mean_squared_error(y_train, oof[2])) 187 | 188 | result1 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False) 189 | result2 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False, method='Nelder-Mead') 190 | result3 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False, method='SLSQP') 191 | 192 | assert result1.score != result2.score 193 | assert result1.score == result3.score 194 | 195 | assert result1.score <= best_single_model 196 | assert result2.score <= best_single_model 197 | 198 | 199 | def test_rank_averaging_opt_maximize(): 200 | X, y = make_classification_df(n_samples=1024) 201 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 202 | 203 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 204 | 205 | best_single_model = max(roc_auc_score(y_train, oof[0]), 206 | roc_auc_score(y_train, oof[1]), 207 | roc_auc_score(y_train, oof[2])) 208 | 209 | result = averaging_opt(test, oof, y_train, roc_auc_score, higher_is_better=True, rank_averaging=True) 210 | 211 | assert result.score >= best_single_model 212 | 213 | result_simple_avg = averaging(test, oof, y_train, eval_func=roc_auc_score, rank_averaging=True) 214 | 215 | assert result.score >= result_simple_avg.score 216 | -------------------------------------------------------------------------------- /tests/ensemble/test_stacking.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 2 | from 
--------------------------------------------------------------------------------
/tests/ensemble/test_stacking.py:
--------------------------------------------------------------------------------
1 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
2 | from sklearn.linear_model import Ridge, LogisticRegression
3 | from sklearn.metrics import mean_squared_error, roc_auc_score
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.svm import SVC, SVR
6 | from sklearn.utils.multiclass import type_of_target
7 |
8 | from nyaggle.ensemble import stacking
9 | from nyaggle.testing import make_classification_df, make_regression_df
10 | from nyaggle.validation import cross_validate
11 |
12 |
13 | def _make_1st_stage_preds(X, y, X_test):
14 |     if type_of_target(y) == 'continuous':
15 |         models = [
16 |             SVR(),
17 |             Ridge(random_state=0),
18 |             RandomForestRegressor(n_estimators=30, random_state=0)
19 |         ]
20 |     else:
21 |         models = [
22 |             SVC(random_state=0),
23 |             LogisticRegression(random_state=0),
24 |             RandomForestClassifier(n_estimators=30, random_state=0)
25 |         ]
26 |
27 |     results = [cross_validate(m, X, y, X_test, cv=5) for m in models]
28 |
29 |     return [r.oof_prediction for r in results], [r.test_prediction for r in results]
30 |
31 |
32 | def test_stacking_classification():
33 |     X, y = make_classification_df()
34 |     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
35 |
36 |     oof, test = _make_1st_stage_preds(X_train, y_train, X_test)
37 |
38 |     worst_base_roc = min(roc_auc_score(y_train, _oof) for _oof in oof)
39 |
40 |     result = stacking(test, oof, y_train, eval_func=roc_auc_score)
41 |
42 |     assert roc_auc_score(y_train, result.oof_prediction) > worst_base_roc
43 |
44 |
45 | def test_stacking_regression():
46 |     X, y = make_regression_df()
47 |     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
48 |
49 |     oof, test = _make_1st_stage_preds(X_train, y_train, X_test)
50 |
51 |     worst_base_mse = max(mean_squared_error(y_train, _oof) for _oof in oof)
52 |
53 |     result = stacking(test, oof, y_train, eval_func=mean_squared_error)
54 |
55 |     assert mean_squared_error(y_train, result.oof_prediction) < worst_base_mse
56 |
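For reference, the stacking pattern these tests exercise: concatenate the first-stage out-of-fold predictions into a meta-feature matrix, fit a second-stage model on it, and apply that model to the stacked test predictions. A hedged sketch with plain scikit-learn (the second-stage learner nyaggle uses may differ):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def stack_binary(test, oof, y):
    X_meta_train = np.column_stack(oof)   # (n_train, n_models)
    X_meta_test = np.column_stack(test)   # (n_test, n_models)
    meta = LogisticRegression()
    # out-of-fold predictions of the meta model itself, for honest scoring
    oof_pred = cross_val_predict(meta, X_meta_train, y, cv=5,
                                 method='predict_proba')[:, 1]
    meta.fit(X_meta_train, y)
    return meta.predict_proba(X_meta_test)[:, 1], oof_pred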
--------------------------------------------------------------------------------
/tests/experiment/test_experiment.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pytest
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
8 | from nyaggle.experiment import Experiment
9 |
10 |
11 | def test_log_params(tmpdir_name):
12 |     with Experiment(tmpdir_name) as e:
13 |         e.log_param('x', 1)
14 |         e.log_param('x', 2)
15 |         e.log_params({
16 |             'y': 'ABC',
17 |             'z': None,
18 |         })
19 |
20 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
21 |         params = json.load(f)
22 |
23 |     expected = {
24 |         'x': 2,  # if the key is duplicated, the latter one is stored
25 |         'y': 'ABC',
26 |         'z': 'None'  # all non-numerical values are cast to string before logging
27 |     }
28 |     assert params == expected
29 |
30 |
31 | def test_log_params_empty(tmpdir_name):
32 |     with Experiment(tmpdir_name):
33 |         pass
34 |
35 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
36 |         params = json.load(f)
37 |         assert params == {}
38 |
39 |
40 | def test_log_metrics(tmpdir_name):
41 |     with Experiment(tmpdir_name) as e:
42 |         e.log_metric('x', 1)
43 |         e.log_metric('x', 2)
44 |         e.log_metrics({
45 |             'y': 3,
46 |             'z': 4,
47 |         })
48 |
49 |     with open(os.path.join(tmpdir_name, 'metrics.json'), 'r') as f:
50 |         params = json.load(f)
51 |
52 |     expected = {
53 |         'x': 2,
54 |         'y': 3,
55 |         'z': 4,
56 |     }
57 |     assert params == expected
58 |
59 |
60 | def test_log_metrics_empty(tmpdir_name):
61 |     with Experiment(tmpdir_name):
62 |         pass
63 |
64 |     with open(os.path.join(tmpdir_name, 'metrics.json'), 'r') as f:
65 |         params = json.load(f)
66 |         assert params == {}
67 |
68 |
69 | def test_log_dict(tmpdir_name):
70 |     with Experiment(tmpdir_name) as e:
71 |         e.log_dict('foo', {'a': 1, 'b': 'foo', 'c': {'d': 'e', 'f': {}, 'g': {'h': 'i'}, 'j': None}})
72 |
73 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
74 |         params = json.load(f)
75 |         assert params == {
76 |             'foo.a': 1,
77 |             'foo.b': 'foo',
78 |             'foo.c.d': 'e',
79 |             'foo.c.f': '{}',
80 |             'foo.c.g.h': 'i',
81 |             'foo.c.j': 'None'
82 |         }
83 |
84 |
85 | def test_error_while_experiment(tmpdir_name):
86 |     try:
87 |         with Experiment(tmpdir_name) as e:
88 |             e.log_metric('x', 0.5)
89 |             e.log_param('foo', 'bar')
90 |             e.log_numpy('np', np.zeros(100))
91 |             e.log_dataframe('df', pd.DataFrame({'a': [1, 2, 3]}))
92 |
93 |             raise KeyboardInterrupt()
94 |     except KeyboardInterrupt:
95 |         pass
96 |
97 |     # all logs are saved even if an error is raised inside the experiment
98 |     with open(os.path.join(tmpdir_name, 'metrics.json'), 'r') as f:
99 |         metrics = json.load(f)
100 |         assert metrics == {'x': 0.5}
101 |
102 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
103 |         params = json.load(f)
104 |         assert params == {'foo': 'bar'}
105 |
106 |     assert os.path.exists(os.path.join(tmpdir_name, 'np.npy'))
107 |     assert os.path.exists(os.path.join(tmpdir_name, 'df.f'))
108 |
109 |
110 | def test_experiment_duplicated_error(tmpdir_name):
111 |     with Experiment(tmpdir_name) as e:
112 |         e.log_metric('CV', 0.97)
113 |
114 |     with pytest.raises(ValueError):
115 |         with Experiment(tmpdir_name):
116 |             pass
117 |
118 |     with pytest.raises(ValueError):
119 |         with Experiment(tmpdir_name, if_exists='error'):
120 |             pass
121 |
122 |
123 | def test_experiment_duplicated_replace(tmpdir_name):
124 |     with Experiment(tmpdir_name) as e:
125 |         e.log_metric('CV', 0.97)
126 |
127 |     with Experiment(tmpdir_name, if_exists='replace') as e:
128 |         e.log_metric('LB', 0.95)
129 |
130 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
131 |         metrics = json.load(f)
132 |
133 |     # replaced by the new result
134 |     assert 'LB' in metrics
135 |     assert 'CV' not in metrics
136 |
137 |
138 | def test_experiment_duplicated_append(tmpdir_name):
139 |     with Experiment(tmpdir_name) as e:
140 |         e.log_metric('CV', 0.97)
141 |
142 |     with Experiment(tmpdir_name, if_exists='append') as e:
143 |         e.log_metric('LB', 0.95)
144 |
145 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
146 |         metrics = json.load(f)
147 |
148 |     # appended to the existing result
149 |     assert 'LB' in metrics
150 |     assert 'CV' in metrics
151 |
152 |
153 | def test_experiment_duplicated_rename(tmpdir_name):
154 |     with Experiment(tmpdir_name) as e:
155 |         e.log_metric('CV', 0.97)
156 |
157 |     with Experiment(tmpdir_name, if_exists='rename') as e:
158 |         e.log_metric('LB', 0.95)
159 |
160 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
161 |         metrics = json.load(f)
162 |         assert 'LB' not in metrics
163 |         assert 'CV' in metrics
164 |
165 |     with open(os.path.join(tmpdir_name + '_1', 'metrics.json')) as f:
166 |         metrics = json.load(f)
167 |         assert 'LB' in metrics
168 |         assert 'CV' not in metrics
169 |
170 |
171 | def test_experiment_duplicated_replace_mlflow(tmpdir_name):
172 |     import mlflow
173 |
174 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
175 |         e.log_metric('CV', 0.97)
176 |         run_id_old = e.mlflow_run_id
177 |
178 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='replace') as e:
179 |         e.log_metric('LB', 0.95)
180 |         run_id_new = e.mlflow_run_id
181 |
182 |     assert run_id_old != run_id_new
183 |
184 |     client = mlflow.tracking.MlflowClient()
185 |     old_run = client.get_run(run_id_old)
186 |     new_run = client.get_run(run_id_new)
187 |     assert old_run.info.lifecycle_stage == 'deleted'
188 |     assert new_run.info.lifecycle_stage == 'active'
189 |
190 |
191 | def test_experiment_duplicated_append_mlflow(tmpdir_name):
192 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
193 |         e.log_metric('CV', 0.97)
194 |         run_id_old = e.mlflow_run_id
195 |
196 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='append') as e:
197 |         e.log_metric('LB', 0.95)
198 |         run_id_new = e.mlflow_run_id
199 |
200 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
201 |         metrics = json.load(f)
202 |
203 |     # appended to the existing result
204 |     assert 'LB' in metrics
205 |     assert 'CV' in metrics
206 |
207 |     assert run_id_old == run_id_new
208 |
209 |     import mlflow
210 |     client = mlflow.tracking.MlflowClient()
211 |     old_run = client.get_run(run_id_old)
212 |     assert old_run.info.lifecycle_stage == 'active'
213 |
214 |
215 | def test_experiment_duplicated_rename_mlflow(tmpdir_name):
216 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
217 |         e.log_metric('CV', 0.97)
218 |         run_id_old = e.mlflow_run_id
219 |
220 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='rename') as e:
221 |         e.log_metric('LB', 0.95)
222 |         run_id_new = e.mlflow_run_id
223 |
224 |     assert run_id_old != run_id_new
225 |
226 |
227 | def test_experiment_continue(tmpdir_name):
228 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
229 |         e.log_metric('CV', 0.97)
230 |
231 |     # appending to existing local & mlflow result
232 |     with Experiment.continue_from(tmpdir_name, with_mlflow=True) as e:
233 |         e.log_metric('LB', 0.95)
234 |
235 |         metric_file = os.path.join(tmpdir_name, 'metrics.json')
236 |
237 |         import mlflow
238 |
239 |         client = mlflow.tracking.MlflowClient()
240 |         data = client.get_run(mlflow.active_run().info.run_id).data
241 |         assert data.metrics['CV'] == 0.97
242 |         assert data.metrics['LB'] == 0.95
243 |
244 |     with open(metric_file, 'r') as f:
245 |         obj = json.load(f)
246 |         assert obj['CV'] == 0.97
247 |         assert obj['LB'] == 0.95
248 |
249 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='append') as e:
250 |         e.log_metric('X', 1.1)
251 |
252 |         import mlflow
253 |
254 |         client = mlflow.tracking.MlflowClient()
255 |         data = client.get_run(mlflow.active_run().info.run_id).data
256 |         assert data.metrics['CV'] == 0.97
257 |         assert data.metrics['LB'] == 0.95
258 |         assert data.metrics['X'] == 1.1
259 |
260 |     # stop logging to mlflow, still continue logging on local dir
261 |     with Experiment.continue_from(tmpdir_name, with_mlflow=False) as e:
262 |         e.log_metric('Y', 1.1)
263 |         import mlflow
264 |         assert mlflow.active_run() is None
265 |
266 |     with open(metric_file, 'r') as f:
267 |         obj = json.load(f)
268 |         assert 'Y' in obj
269 |
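Taken together, the tests above document the four if_exists modes of Experiment. A usage sketch built only from the behavior the tests assert (the directory name is illustrative):

from nyaggle.experiment import Experiment

# default behavior ('error'): reusing a directory that already holds logs raises ValueError
with Experiment('logs/run1') as e:
    e.log_metric('CV', 0.97)

# 'append' merges into the same metrics.json (and, with mlflow, the same run);
# 'replace' deletes the old run first; 'rename' logs to 'logs/run1_1' instead
with Experiment('logs/run1', if_exists='append') as e:
    e.log_metric('LB', 0.95)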
--------------------------------------------------------------------------------
/tests/experiment/test_hyperparameter_tuner.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn import datasets
3 |
4 | from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter
5 |
6 |
7 | def _check_parameter_tunes(params, x, y):
8 |     best_params = find_best_lgbm_parameter(params, x, y)
9 |     # parameters tuned by optuna's stepwise LightGBM tuner
10 |     tuned_params = {
11 |         'num_leaves', 'feature_fraction', 'bagging_fraction', 'bagging_freq',
12 |         'lambda_l1', 'lambda_l2', 'min_child_samples'
13 |     }
14 |     intersection = set(best_params.keys()) & tuned_params
15 |     assert intersection == tuned_params
16 |
17 |
18 | def test_regression_problem_parameter_tunes():
19 |     x, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
20 |     params = {
21 |         'objective': 'regression',
22 |         'metric': 'rmse',
23 |         'verbosity': -1,
24 |     }
25 |     _check_parameter_tunes(params, x, y)
26 |
27 |
28 | def test_binary_classification_parameter_tunes():
29 |     dataset = datasets.load_breast_cancer()
30 |     x = pd.DataFrame(dataset.data, columns=dataset.feature_names)
31 |     y = pd.Series(dataset.target)
32 |     params = {
33 |         'objective': 'binary',
34 |         'metric': 'binary_logloss',
35 |         'verbosity': -1,
36 |     }
37 |     _check_parameter_tunes(params, x, y)
38 |
39 |
40 | def test_multi_classification_parameter_tunes():
41 |     dataset = datasets.load_wine()
42 |     x = pd.DataFrame(dataset.data, columns=dataset.feature_names)
43 |     y = pd.Series(dataset.target)
44 |     params = {
45 |         'objective': 'multiclass',
46 |         'num_class': 3,
47 |         'verbosity': -1,
48 |     }
49 |     _check_parameter_tunes(params, x, y)
50 |
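The seven keys asserted in _check_parameter_tunes are exactly the parameters that optuna's stepwise LightGBM tuner searches, which suggests find_best_lgbm_parameter wraps it. A hedged sketch of using that tuner directly, assuming optuna's integration API rather than nyaggle's wrapper:

import lightgbm as lgb
from optuna.integration.lightgbm import LightGBMTunerCV

def tune(params, x, y):
    dtrain = lgb.Dataset(x, label=y)
    # stepwise-tunes num_leaves, feature/bagging fraction, lambdas, etc.
    tuner = LightGBMTunerCV(params, dtrain, nfold=5)
    tuner.run()
    return tuner.best_params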
--------------------------------------------------------------------------------
/tests/feature/category_encoder/test_target_encoder.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import category_encoders as ce
4 | import numpy.testing as npt
5 | import pandas as pd
6 | from pandas.testing import assert_frame_equal
7 | from sklearn.model_selection import KFold
8 |
9 | from nyaggle.feature.category_encoder import TargetEncoder
10 |
11 |
12 | def _test_target_encoder(X_train, y_train, X_test, **kw):
13 |     cv = KFold(n_splits=2, random_state=42, shuffle=True)
14 |
15 |     te = TargetEncoder(cv.split(X_train), **kw)
16 |
17 |     ret_train = te.fit_transform(X_train, y_train)
18 |     ret_test = te.transform(X_test)
19 |
20 |     ret_train2 = copy.deepcopy(X_train)
21 |     ret_test2 = copy.deepcopy(X_test)
22 |
23 |     for train_idx, test_idx in cv.split(X_train):
24 |         te2 = ce.TargetEncoder(**kw)
25 |
26 |         if isinstance(X_train, pd.DataFrame):
27 |             te2.fit(X_train.loc[train_idx, :], y_train.loc[train_idx])
28 |             ret_train2.loc[test_idx] = te2.transform(ret_train2.loc[test_idx])
29 |         else:
30 |             te2.fit(X_train[train_idx, :], y_train[train_idx])
31 |             ret_train2[test_idx] = te2.transform(ret_train2[test_idx])
32 |
33 |     ret_train2 = ret_train2.astype(float)
34 |
35 |     if isinstance(ret_train, pd.DataFrame):
36 |         assert_frame_equal(ret_train, ret_train2)
37 |     else:
38 |         npt.assert_array_equal(ret_train, ret_train2)
39 |
40 |     te2 = ce.TargetEncoder(**kw)
41 |     te2.fit(X_train, y_train)
42 |
43 |     ret_test2 = te2.transform(ret_test2)
44 |
45 |     if isinstance(ret_train, pd.DataFrame):
46 |         assert_frame_equal(ret_test, ret_test2)
47 |     else:
48 |         npt.assert_array_equal(ret_test, ret_test2)
49 |
50 |
51 | def test_target_encoder_fit_transform():
52 |     X_train = pd.DataFrame({
53 |         'x': ['A', 'A', 'A', 'B', 'B', 'C'],
54 |         'a': [1, 2, 3, 1, 2, 3]
55 |
56 |     })
57 |     y_train = pd.Series([0, 0, 1, 0, 1, 1])
58 |     X_test = pd.DataFrame({
59 |         'x': ['A', 'B', 'C', 'D'],
60 |         'a': [1, 2, 3, 4]
61 |     })
62 |
63 |     X = pd.concat([X_train, X_test])
64 |     y = pd.concat([y_train, pd.Series([None] * 4)]).astype(float)
65 |
66 |     ce1 = TargetEncoder(cols=['x'])
67 |     ce1.fit(X_train, y_train)
68 |     ret1 = ce1.transform(X_test)
69 |
70 |     ce2 = TargetEncoder(cols=['x'])
71 |     ret2 = ce2.fit_transform(X, y).iloc[6:, :]
72 |
73 |     assert_frame_equal(ret1, ret2)
74 |
75 |
76 | def test_target_encoder():
77 |     X_train = pd.DataFrame({
78 |         'x': ['A', 'A', 'A', 'B', 'B', 'C'],
79 |
80 |     })
81 |     y_train = pd.Series([0, 0, 1, 0, 1, 1])
82 |     X_test = pd.DataFrame({
83 |         'x': ['A', 'B', 'C', 'D']
84 |     })
85 |
86 |     _test_target_encoder(X_train, y_train, X_test)
87 |
88 |
89 | def test_target_encoder_ndarray():
90 |     X_train = pd.DataFrame({
91 |         'x': ['A', 'A', 'A', 'B', 'B', 'C'],
92 |
93 |     })
94 |     y_train = pd.Series([0, 0, 1, 0, 1, 1])
95 |     X_test = pd.DataFrame({
96 |         'x': ['A', 'B', 'C', 'D']
97 |     })
98 |
99 |     _test_target_encoder(X_train.values, y_train.values, X_test.values)
100 |
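The fold loop in _test_target_encoder replicates what nyaggle's TargetEncoder does internally: each row is encoded with target statistics computed on the other folds, so no row ever sees its own label. A pandas-only sketch of that K-fold scheme (hypothetical helper, not nyaggle's code):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def kfold_target_encode(x: pd.Series, y: pd.Series, n_splits: int = 5) -> pd.Series:
    encoded = pd.Series(np.nan, index=x.index)
    for train_idx, test_idx in KFold(n_splits, shuffle=True, random_state=0).split(x):
        # category -> mean target, computed on the training folds only
        means = y.iloc[train_idx].groupby(x.iloc[train_idx]).mean()
        encoded.iloc[test_idx] = x.iloc[test_idx].map(means).values
    return encoded.fillna(y.mean())  # unseen categories fall back to the global prior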
--------------------------------------------------------------------------------
/tests/feature/nlp/test_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import numpy.testing as npt
4 | import pandas as pd
5 | import pytest
6 | from pandas.testing import assert_frame_equal
7 |
8 | from nyaggle.feature.nlp import BertSentenceVectorizer
9 |
10 | _TEST_SENTENCE_EN = [
11 |     'This is a pen.',
12 |     'A quick brown fox',
13 |     'Redistribution and use in source and binary forms, with or without modification.',
14 |     'BERT is the state of the art NLP model.',
15 |     'This is a pen.',
16 |     'THIS IS A PEN.',
17 | ]
18 |
19 | _TEST_SENTENCE_JP = [
20 |     '金メダルが5枚欲しい。',
21 |     '私は昨日から風邪をひいています。',
22 |     'これはペンです。',
23 |     'BERTは最新の自然言語処理モデルです。',
24 |     '金メダルが5枚欲しい。',
25 |     '金メダルが 5枚 欲しい。',
26 | ]
27 |
28 |
29 | def _under_py35():
30 |     return not (sys.version_info.major == 3 and sys.version_info.minor >= 6)
31 |
32 |
33 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
34 | def test_bert_fit():
35 |     bert = BertSentenceVectorizer(use_cuda=False)
36 |
37 |     X = pd.DataFrame({
38 |         'id': [0, 1, 2, 3, 4, 5],
39 |         'sentence': _TEST_SENTENCE_EN
40 |     })
41 |
42 |     bert.fit(X)
43 |     ret = bert.transform(X)
44 |
45 |     assert ret.shape[0] == 6
46 |     assert ret.shape[1] == 768 + 1  # id + embed
47 |
48 |     ret.drop('id', axis=1, inplace=True)
49 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values)
50 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values)
51 |
52 |
53 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
54 | def test_bert_fit_transform():
55 |     X = pd.DataFrame({
56 |         'id': [0, 1, 2, 3, 4, 5],
57 |         'sentence': _TEST_SENTENCE_EN
58 |     })
59 |
60 |     bert = BertSentenceVectorizer(use_cuda=False)
61 |     ret = bert.fit_transform(X)
62 |
63 |     bert = BertSentenceVectorizer(use_cuda=False)
64 |     bert.fit(X)
65 |     ret2 = bert.fit_transform(X)
66 |
67 |     assert_frame_equal(ret, ret2)
68 |
69 |
70 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
71 | def test_bert_en_svd():
72 |     n_components = 3
73 |     bert = BertSentenceVectorizer(n_components=n_components, use_cuda=False)
74 |
75 |     X = pd.DataFrame({
76 |         'id': [0, 1, 2, 3, 4, 5],
77 |         'sentence': _TEST_SENTENCE_EN
78 |     })
79 |
80 |     ret = bert.fit_transform(X)
81 |
82 |     assert ret.shape[0] == 6
83 |     assert ret.shape[1] == n_components + 1
84 |
85 |     ret.drop('id', axis=1, inplace=True)
86 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values, decimal=3)
87 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values, decimal=3)
88 |
89 |
90 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
91 | def test_bert_en_svd_multicol():
92 |     bert = BertSentenceVectorizer(use_cuda=False)
93 |
94 |     X = pd.DataFrame({
95 |         'id': [0, 1, 2, 3, 4, 5],
96 |         'sentence': _TEST_SENTENCE_EN,
97 |         'sentence2': _TEST_SENTENCE_EN
98 |     })
99 |
100 |     ret = bert.fit_transform(X)
101 |
102 |     assert ret.shape[0] == 6
103 |     assert ret.shape[1] == 2 * 768 + 1
104 |
105 |     ret.drop('id', axis=1, inplace=True)
106 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values, decimal=3)
107 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values, decimal=3)
108 |
109 |
110 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
111 | def test_bert_jp():
112 |     bert = BertSentenceVectorizer(use_cuda=False, lang='jp')
113 |
114 |     X = pd.DataFrame({
115 |         'id': [0, 1, 2, 3, 4, 5],
116 |         'sentence': _TEST_SENTENCE_JP
117 |     })
118 |
119 |     ret = bert.fit_transform(X)
120 |
121 |     assert ret.shape[0] == 6
122 |     assert ret.shape[1] == 768 + 1
123 |
124 |     ret.drop('id', axis=1, inplace=True)
125 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values)
126 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values)
127 |
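What these tests pin down: one 768-dimensional vector per sentence (the BERT-base hidden size), one block of columns per text column, optional SVD compression to n_components, and lang='jp' switching to a Japanese BERT model. A hedged sketch of the underlying computation with huggingface transformers; the checkpoint and pooling nyaggle actually uses may differ:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def sentence_vector(text: str) -> torch.Tensor:
    tokens = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state  # (1, seq_len, 768)
    return hidden.mean(dim=1).squeeze(0)            # mean-pool to a 768-dim vector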
--------------------------------------------------------------------------------
/tests/feature/test_groupby.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from sklearn import datasets
5 |
6 | from nyaggle.feature.groupby import aggregation
7 |
8 |
9 | @pytest.fixture
10 | def iris_dataframe():
11 |     iris = datasets.load_iris()
12 |     df = pd.DataFrame(np.concatenate([iris.data,
13 |                                       iris.target.reshape((iris.target.shape[0], 1))], axis=1))
14 |     df.columns = ['sl', 'sw', 'pl', 'pw', 'species']
15 |     group_key = 'species'
16 |     group_values = ['sl', 'sw', 'pl', 'pw']
17 |     return df, group_key, group_values
18 |
19 |
20 | def custom_function(x):
21 |     return np.sum(x)
22 |
23 |
24 | def test_return_type_by_aggregation(iris_dataframe):
25 |     df, group_key, group_values = iris_dataframe
26 |     agg_methods = ["max", np.sum, custom_function]
27 |     new_df, new_cols = aggregation(df, group_key, group_values,
28 |                                    agg_methods)
29 |     assert isinstance(new_df, pd.DataFrame)
30 |     assert isinstance(new_cols, list)
31 |
32 |
33 | @pytest.mark.parametrize('agg_method', [[int], [lambda x: np.max(x)]])
34 | def test_assert_by_aggregation(iris_dataframe, agg_method):
35 |     df, group_key, group_values = iris_dataframe
36 |     with pytest.raises(ValueError):
37 |         aggregation(df, group_key, group_values, agg_method)
38 |
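aggregation() as covered above accepts string, numpy, or plain-callable aggregators, rejects aggregators it cannot derive a stable column name from (the parametrized test shows types and lambdas raising ValueError), and returns both the widened frame and the list of new column names. A pandas-only sketch of that shape (hypothetical re-implementation; the column naming is illustrative):

import pandas as pd

def aggregate(df: pd.DataFrame, key: str, values: list, methods: list):
    new_cols = []
    for m in methods:
        name = m if isinstance(m, str) else m.__name__
        # one aggregated column per value column, broadcast back by group key
        agg = df.groupby(key)[values].agg(m)
        agg.columns = [f'agg_{name}_{v}_grpby_{key}' for v in values]
        new_cols += list(agg.columns)
        df = df.merge(agg, how='left', left_on=key, right_index=True)
    return df, new_cols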
--------------------------------------------------------------------------------
/tests/feature_store/test_feature_store.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 | from pandas.testing import assert_frame_equal
7 |
8 | import nyaggle.feature_store as fs
9 | from nyaggle.testing import get_temp_directory
10 |
11 |
12 | def test_save_feature():
13 |     df = pd.DataFrame()
14 |
15 |     df['a'] = np.arange(100)
16 |
17 |     with get_temp_directory() as tmp:
18 |         fs.save_feature(df, 0, tmp)
19 |
20 |         assert os.path.exists(os.path.join(tmp, '0.f'))
21 |
22 |
23 | def test_load_feature():
24 |     df = pd.DataFrame()
25 |
26 |     df['a'] = np.arange(100)
27 |
28 |     with get_temp_directory() as tmp:
29 |         fs.save_feature(df, 0, tmp)
30 |
31 |         df_loaded = fs.load_feature(0, tmp)
32 |         assert_frame_equal(df, df_loaded)
33 |
34 |
35 | def test_multi_columns():
36 |     df = pd.DataFrame()
37 |
38 |     df['a'] = np.arange(100)
39 |     df['b'] = None
40 |
41 |     with get_temp_directory() as tmp:
42 |         fs.save_feature(df, 0, tmp)
43 |
44 |         df_loaded = fs.load_feature(0, tmp)
45 |         assert_frame_equal(df, df_loaded)
46 |
47 |
48 | def test_various_dtypes():
49 |     df = pd.DataFrame()
50 |
51 |     df['a'] = np.arange(100).astype(float)
52 |     df['b'] = np.arange(100).astype(int)
53 |     df['c'] = np.arange(100).astype(np.uint8)
54 |     df['d'] = np.arange(100).astype(np.uint16)
55 |     df['e'] = np.arange(100).astype(np.uint32)
56 |     df['f'] = np.arange(100).astype(np.int8)
57 |     df['g'] = np.arange(100).astype(np.int16)
58 |     df['h'] = np.arange(100).astype(np.int32)
59 |     df['i'] = np.arange(100).astype(np.int64)
60 |
61 |     with get_temp_directory() as tmp:
62 |         fs.save_feature(df, 0, tmp)
63 |
64 |         df_loaded = fs.load_feature(0, tmp)
65 |         assert_frame_equal(df, df_loaded)
66 |
67 |
68 | def test_load_features():
69 |     df = pd.DataFrame()
70 |
71 |     df['a'] = np.arange(100).astype(float)
72 |     df['b'] = np.arange(100).astype(int)
73 |     df['c'] = np.arange(100).astype(int)
74 |
75 |     with get_temp_directory() as tmp:
76 |         fs.save_feature(df[['b']], 0, tmp)
77 |         fs.save_feature(df[['c']], 1, tmp)
78 |
79 |         df_loaded = fs.load_features(df[['a']], [0, 1], tmp)
80 |         assert_frame_equal(df, df_loaded)
81 |
82 |
83 | def test_load_features_no_base():
84 |     df = pd.DataFrame()
85 |
86 |     df['a'] = np.arange(100).astype(float)
87 |     df['b'] = np.arange(100).astype(int)
88 |     df['c'] = np.arange(100).astype(int)
89 |
90 |     with get_temp_directory() as tmp:
91 |         fs.save_feature(df[['b']], 0, tmp)
92 |         fs.save_feature(df[['c']], 1, tmp)
93 |         fs.save_feature(df[['a']], '2', tmp)
94 |
95 |         df_loaded = fs.load_features(None, [0, 1, '2'], tmp)
96 |         assert list(df_loaded.columns) == ['b', 'c', 'a']
97 |
98 |
99 | def test_load_feature_ignore_columns():
100 |     df = pd.DataFrame()
101 |
102 |     df['a'] = np.arange(100).astype(float)
103 |     df['b'] = np.arange(100).astype(int)
104 |     df['c'] = np.arange(100).astype(int)
105 |
106 |     with get_temp_directory() as tmp:
107 |         fs.save_feature(df, 0, tmp)
108 |
109 |         # just skip irrelevant column names
110 |         df_loaded = fs.load_feature(0, tmp, ignore_columns=['b', 'X'])
111 |
112 |         assert_frame_equal(df_loaded, df.drop('b', axis=1))
113 |
114 |
115 | def test_load_feature_ignore_all_columns():
116 |     df = pd.DataFrame()
117 |
118 |     df['a'] = np.arange(100).astype(float)
119 |     df['b'] = np.arange(100).astype(int)
120 |     df['c'] = np.arange(100).astype(int)
121 |
122 |     with get_temp_directory() as tmp:
123 |         fs.save_feature(df, 0, tmp)
124 |
125 |         df_loaded = fs.load_feature(0, tmp, ignore_columns=['a', 'b', 'c', 'X'])
126 |
127 |         assert_frame_equal(df_loaded, df.drop(['a', 'b', 'c'], axis=1))
128 |
129 |
130 | def test_load_features_duplicate_col_name():
131 |     df = pd.DataFrame()
132 |
133 |     df['a'] = np.arange(100).astype(float)
134 |     df['b'] = np.arange(100).astype(int)
135 |     df['c'] = np.arange(100).astype(int)
136 |
137 |     with get_temp_directory() as tmp:
138 |         fs.save_feature(df[['a', 'b']], 0, tmp)
139 |         fs.save_feature(df[['b', 'c']], 1, tmp)
140 |         fs.save_feature(df[['b', 'a']], 'X', tmp)
141 |
142 |         df_loaded = fs.load_features(None, [0, 1, 'X'], tmp, rename_duplicate=True)
143 |         assert list(df_loaded.columns) == ['a', 'b', 'b_1', 'c', 'b_X', 'a_X']
144 |
145 |         df_loaded = fs.load_features(None, [0, 1, 'X'], tmp, rename_duplicate=False)
146 |         assert list(df_loaded.columns) == ['a', 'b', 'b', 'c', 'b', 'a']
147 |
148 |
149 | def test_invalid_feature():
150 |     df = pd.DataFrame({
151 |         'a': [1, 2, 3, 4, 5] + [None] * 5,
152 |         'b': np.random.randint(0, 10, size=10)
153 |     })
154 |     y = pd.Series([1, 0, 1, 0, 1])
155 |
156 |     with get_temp_directory() as tmp:
157 |         with pytest.raises(RuntimeError):
158 |             fs.save_feature(df[['a']], 0, reference_target_variable=y, directory=tmp)
159 |         with pytest.raises(RuntimeError):
160 |             fs.save_feature(df, 0, reference_target_variable=y, directory=tmp)
161 |
162 |         # ok
163 |         fs.save_feature(df[['b']], 0, reference_target_variable=y, directory=tmp)
164 |
165 |
166 | def test_feature_exists():
167 |     df = pd.DataFrame({
168 |         'a': [1, 2, 3, 4, 5] + [None] * 5
169 |     })
170 |
171 |     with get_temp_directory() as tmp:
172 |         fs.save_feature(df[['a']], 0, directory=tmp)
173 |         with pytest.raises(RuntimeError):
174 |             fs.save_feature(df, 0, overwrite=False, directory=tmp)
175 |
176 |
177 | def test_decorator():
178 |     with get_temp_directory() as tmp:
179 |         @fs.cached_feature('x', tmp)
180 |         def make_feature_x():
181 |             return pd.DataFrame({'a': [1, 2, 3, 4, 5]})
182 |
183 |         @fs.cached_feature('y', tmp)
184 |         def make_feature_y(n: int):
185 |             return pd.DataFrame({'b': np.arange(n)})
186 |
187 |         x = make_feature_x()
188 |         assert make_feature_x.__name__ == "make_feature_x"
189 |         assert os.path.exists(os.path.join(tmp, "x.f"))
190 |         x2 = make_feature_x()
191 |         assert_frame_equal(x, x2)
192 |
193 |         y = make_feature_y(100)
194 |         assert len(y) == 100
195 |         assert os.path.exists(os.path.join(tmp, "y.f"))
196 |         y2 = make_feature_y(100)
197 |         assert_frame_equal(y, y2)
198 |
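A typical flow for the feature store, assembled from the calls the tests above make (the paths and feature ids are illustrative):

import pandas as pd
import nyaggle.feature_store as fs

# save each feature block once, keyed by an int or str id
fs.save_feature(pd.DataFrame({'f1': [1, 2, 3]}), 0, './features/')
fs.save_feature(pd.DataFrame({'f2': [4, 5, 6]}), 'f2', './features/')

# later: join the stored blocks onto a base frame in one call
base = pd.DataFrame({'target': [0, 1, 0]})
train = fs.load_features(base, [0, 'f2'], './features/')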
--------------------------------------------------------------------------------
/tests/validation/test_adversarial_validate.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split
2 |
3 | from nyaggle.testing import make_classification_df
4 | from nyaggle.validation import adversarial_validate
5 |
6 |
7 | def test_adv():
8 |     X, y = make_classification_df(1024)
9 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
10 |
11 |     X_train['target'] = 0
12 |     X_test['target'] = 1
13 |
14 |     auc, importance = adversarial_validate(X_train, X_test)
15 |
16 |     assert importance['feature'][0] == 'target'
17 |     assert auc >= 0.9
18 |
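The test above relies on the core trick of adversarial validation: label train rows 0 and test rows 1, fit a classifier, and read the result. An AUC near 0.5 means the two sets are indistinguishable; here the injected 'target' column perfectly separates them, so it drives the AUC above 0.9 and tops the importance list. A hedged sklearn-only sketch of the procedure:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def adversarial_auc(X_train: pd.DataFrame, X_test: pd.DataFrame) -> float:
    X = pd.concat([X_train, X_test], ignore_index=True)
    y = np.r_[np.zeros(len(X_train)), np.ones(len(X_test))]
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    # mean CV AUC of the train-vs-test classifier
    return cross_val_score(clf, X, y, cv=5, scoring='roc_auc').mean()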
--------------------------------------------------------------------------------
/tests/validation/test_cross_validate.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import numpy as np
4 |
5 | from catboost import CatBoostClassifier
6 | from lightgbm import LGBMClassifier
7 | from sklearn.datasets import make_classification, make_regression
8 | from sklearn.linear_model import RidgeClassifier, Ridge
9 | from sklearn.metrics import roc_auc_score, r2_score
10 | from sklearn.model_selection import train_test_split, KFold
11 |
12 | from nyaggle.experiment import autoprep_gbdt
13 | from nyaggle.testing import make_classification_df
14 | from nyaggle.validation import cross_validate, Take
15 |
16 |
17 | def test_cv_sklearn_binary():
18 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
19 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
20 |
21 |     model = RidgeClassifier(alpha=1.0)
22 |
23 |     pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)
24 |
25 |     assert len(scores) == 5 + 1
26 |     assert scores[-1] >= 0.85  # overall auc
27 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
28 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test score
29 |
30 |
31 | def test_cv_sklearn_regression():
32 |     X, y = make_regression(n_samples=1024, n_features=20, random_state=0)
33 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
34 |
35 |     model = Ridge(alpha=1.0)
36 |
37 |     pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=r2_score)
38 |
39 |     print(scores)
40 |     assert len(scores) == 5 + 1
41 |     assert scores[-1] >= 0.95  # overall r2
42 |     assert r2_score(y_train, pred_oof) == scores[-1]
43 |     assert r2_score(y_test, pred_test) >= 0.95  # test r2
44 |
45 |
46 | def test_cv_lgbm():
47 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
48 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
49 |
50 |     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
51 |
52 |     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
53 |                                                              eval_func=roc_auc_score,
54 |                                                              fit_params={'early_stopping_rounds': 200})
55 |
56 |     print(scores)
57 |     assert len(scores) == 5 + 1
58 |     assert scores[-1] >= 0.85  # overall roc_auc
59 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
60 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
61 |     assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85  # make sure models are trained
62 |     assert len(importance) == 5
63 |     assert list(importance[0].columns) == ['feature', 'importance']
64 |     assert len(importance[0]) == 20
65 |
66 |
67 | def test_cv_lgbm_df():
68 |     X, y = make_classification_df(n_samples=1024, n_num_features=20, n_cat_features=1, class_sep=0.98, random_state=0)
69 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
70 |
71 |     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
72 |
73 |     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
74 |                                                              eval_func=roc_auc_score)
75 |
76 |     print(scores)
77 |     assert len(scores) == 5 + 1
78 |     assert scores[-1] >= 0.85  # overall roc_auc
79 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
80 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
81 |     assert roc_auc_score(y_test, models[0].predict_proba(X_test)[:, 1]) >= 0.85  # make sure models are trained
82 |     assert len(importance) == 5
83 |     assert list(importance[0].columns) == ['feature', 'importance']
84 |     assert len(importance[0]) == 20 + 1
85 |     assert models[0].booster_.num_trees() < 300  # making sure early stopping worked
86 |
87 |
88 | def test_cv_cat_df():
89 |     X, y = make_classification_df(n_samples=1024, n_num_features=20, n_cat_features=1, class_sep=0.98, random_state=0)
90 |     X, _ = autoprep_gbdt('cat', X, None)
91 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
92 |
93 |     models = [CatBoostClassifier(n_estimators=300) for _ in range(5)]
94 |
95 |     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
96 |                                                              eval_func=roc_auc_score,
97 |                                                              fit_params={'cat_features': ['cat_0']})
98 |
99 |     print(scores)
100 |     assert len(scores) == 5 + 1
101 |     assert scores[-1] >= 0.85  # overall roc_auc
102 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
103 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
104 |     assert roc_auc_score(y_test, models[0].predict_proba(X_test)[:, 1]) >= 0.85  # make sure models are trained
105 |     assert len(importance) == 5
106 |     assert list(importance[0].columns) == ['feature', 'importance']
107 |     assert len(importance[0]) == 20 + 1
108 |     assert models[0].tree_count_ < 300  # making sure early stopping worked
109 |
110 |
111 | def test_cv_partial_evaluate():
112 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
113 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
114 |
115 |     model = RidgeClassifier(alpha=1.0)
116 |
117 |     n = 0
118 |
119 |     def _fold_count(*args):
120 |         nonlocal n
121 |         n += 1
122 |
123 |     cv = Take(2, KFold(5))
124 |
125 |     pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=cv, eval_func=roc_auc_score,
126 |                                                     on_each_fold=_fold_count)
127 |
128 |     assert len(scores) == 2 + 1
129 |     assert scores[-1] >= 0.8  # overall auc
130 |     assert n == 2
131 |
132 |
133 | def test_fit_params_callback():
134 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
135 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
136 |
137 |     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
138 |
139 |     sample_weights = np.random.randint(1, 10, size=len(X_train))
140 |     sample_weights = sample_weights / sample_weights.sum()
141 |
142 |     def fit_params(n: int, train_index: List[int], valid_index: List[int]):
143 |         return {
144 |             'early_stopping_rounds': 100,
145 |             'sample_weight': list(sample_weights[train_index]),
146 |             'eval_sample_weight': [list(sample_weights[valid_index])]
147 |         }
148 |
149 |     result_w_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
150 |                                      eval_func=roc_auc_score, fit_params=fit_params)
151 |
152 |     result_wo_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
153 |                                       eval_func=roc_auc_score, fit_params={'early_stopping_rounds': 50})
154 |
155 |     assert result_w_weight.scores[-1] != result_wo_weight.scores[-1]
156 |
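Underneath all of these tests is the standard out-of-fold loop: train one model per fold, predict the held-out fold into pred_oof, and average the per-fold test predictions into pred_test. scores carries one entry per fold plus the overall score last, which is why every test checks len(scores) == cv + 1 and reads scores[-1]. A hedged sketch of that loop for binary classification, assuming plain numpy arrays:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

def oof_predict(model, X, y, X_test, n_splits=5):
    oof = np.zeros(len(X))
    test = np.zeros(len(X_test))
    for train_idx, valid_idx in KFold(n_splits).split(X):
        m = clone(model)
        m.fit(X[train_idx], y[train_idx])
        oof[valid_idx] = m.predict_proba(X[valid_idx])[:, 1]
        test += m.predict_proba(X_test)[:, 1] / n_splits  # fold-averaged test preds
    return oof, test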
--------------------------------------------------------------------------------
/tests/validation/test_split.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | import pytest
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn.model_selection import KFold
7 |
8 | import nyaggle.validation.split as split
9 |
10 |
11 | def _random_uniform_dates(start_date: str, n_days: int, size: int):
12 |     return pd.to_datetime(start_date) + pd.to_timedelta(np.random.randint(0, n_days, size=size), 'd')
13 |
14 |
15 | def test_take():
16 |     df = pd.DataFrame()
17 |     df['id'] = np.arange(10)
18 |
19 |     folds = split.Take(2, KFold(5)).split(df)
20 |
21 |     train_index, test_index = next(folds)
22 |     assert np.array_equal(test_index, np.array([0, 1]))
23 |
24 |     train_index, test_index = next(folds)
25 |     assert np.array_equal(test_index, np.array([2, 3]))
26 |
27 |     with pytest.raises(StopIteration):
28 |         next(folds)
29 |
30 |
31 | def test_take_over():
32 |     df = pd.DataFrame()
33 |     df['id'] = np.arange(10)
34 |
35 |     # k > base_validator.n_splits
36 |     folds = split.Take(3, KFold(2)).split(df)
37 |
38 |     train_index, test_index = next(folds)
39 |     assert np.array_equal(test_index, np.array([0, 1, 2, 3, 4]))
40 |
41 |     train_index, test_index = next(folds)
42 |     assert np.array_equal(test_index, np.array([5, 6, 7, 8, 9]))
43 |
44 |     with pytest.raises(StopIteration):
45 |         next(folds)
46 |
47 |
48 | def test_skip():
49 |     df = pd.DataFrame()
50 |     df['id'] = np.arange(10)
51 |
52 |     kf = split.Skip(2, KFold(5))
53 |     folds = kf.split(df)
54 |
55 |     assert kf.get_n_splits() == 3
56 |
57 |     train_index, test_index = next(folds)
58 |     assert np.array_equal(test_index, np.array([4, 5]))
59 |
60 |     train_index, test_index = next(folds)
61 |     assert np.array_equal(test_index, np.array([6, 7]))
62 |
63 |     train_index, test_index = next(folds)
64 |     assert np.array_equal(test_index, np.array([8, 9]))
65 |
66 |     with pytest.raises(StopIteration):
67 |         next(folds)
68 |
69 |
70 | def test_nth():
71 |     df = pd.DataFrame()
72 |     df['id'] = np.arange(10)
73 |
74 |     kf = split.Nth(3, KFold(5))
75 |     folds = kf.split(df)
76 |
77 |     assert kf.get_n_splits() == 1
78 |
79 |     train_index, test_index = next(folds)
80 |     assert np.array_equal(test_index, np.array([4, 5]))
81 |
82 |     with pytest.raises(StopIteration):
83 |         next(folds)
84 |
85 |     kf = split.Nth(1, KFold(5))
86 |     folds = kf.split(df)
87 |
88 |     assert kf.get_n_splits() == 1
89 |
90 |     train_index, test_index = next(folds)
91 |     assert np.array_equal(test_index, np.array([0, 1]))
92 |
93 |     with pytest.raises(StopIteration):
94 |         next(folds)
95 |
96 |
97 | def test_time_series_split():
98 |     df = pd.DataFrame()
99 |     df['time'] = pd.date_range(start='2018/1/1', periods=5)
100 |
101 |     folds = split.TimeSeriesSplit('time',
102 |                                   [(('2018-01-01', '2018-01-02'), ('2018-01-02', '2018-01-04')),
103 |                                    (('2018-01-02', '2018-01-03'), ('2018-01-04', '2018-01-06'))])
104 |
105 |     assert folds.get_n_splits() == 2
106 |
107 |     splits = folds.split(df)
108 |
109 |     train_index, test_index = next(splits)
110 |     assert np.array_equal(train_index, np.array([0]))
111 |     assert np.array_equal(test_index, np.array([1, 2]))
112 |
113 |     train_index, test_index = next(splits)
114 |     assert np.array_equal(train_index, np.array([1]))
115 |     assert np.array_equal(test_index, np.array([3, 4]))
116 |
117 |     with pytest.raises(StopIteration):
118 |         next(splits)
119 |
120 |
121 | def test_time_series_open_range():
122 |     df = pd.DataFrame()
123 |     df['x'] = [1, 2, 3, 4, 5]
124 |     df['time'] = pd.date_range(start='2018/1/1', periods=5)
125 |
126 |     folds = split.TimeSeriesSplit(df['time'],
127 |                                   [((None, '2018-01-03'), ('2018-01-03', None))])
128 |     splits = folds.split(df)
129 |
130 |     train_index, test_index = next(splits)
131 |     assert np.array_equal(train_index, np.array([0, 1]))
132 |     assert np.array_equal(test_index, np.array([2, 3, 4]))
133 |
134 |
135 | def test_time_series_add_folds():
136 |     df = pd.DataFrame()
137 |     df['x'] = [1, 2, 3, 4, 5]
138 |     df['time'] = pd.date_range(start='2018/1/1', periods=5)
139 |
140 |     folds = split.TimeSeriesSplit(df['time'])
141 |
142 |     assert folds.get_n_splits() == 0
143 |
144 |     folds.add_fold((None, '2018-01-03'), ('2018-01-03', None))
145 |
146 |     assert folds.get_n_splits() == 1
147 |
148 |
149 | def test_sliding_window_split():
150 |     window = split.SlidingWindowSplit('time',
151 |                                       train_from='2018-01-20',
152 |                                       train_to='2018-01-23',
153 |                                       test_from='2018-01-27',
154 |                                       test_to='2018-01-31',
155 |                                       n_windows=3,
156 |                                       stride=pd.to_timedelta(2, 'd'))
157 |
158 |     #        train            test
159 |     # fold1: 01/16 - 01/19    01/23 - 01/27  (backtest 2)
160 |     # fold2: 01/18 - 01/21    01/25 - 01/29  (backtest 1)
161 |     # fold3: 01/20 - 01/23    01/27 - 01/31  (base window)
162 |
163 |     expected = [
164 |         ((datetime(2018, 1, 16), datetime(2018, 1, 19)), (datetime(2018, 1, 23), datetime(2018, 1, 27))),
165 |         ((datetime(2018, 1, 18), datetime(2018, 1, 21)), (datetime(2018, 1, 25), datetime(2018, 1, 29))),
166 |         ((datetime(2018, 1, 20), datetime(2018, 1, 23)), (datetime(2018, 1, 27), datetime(2018, 1, 31)))
167 |     ]
168 |
169 |     assert window.times == expected
170 |
171 |
172 | def test_stratified_group_kfold_one_class_per_grp():
173 |     sgf = split.StratifiedGroupKFold(2, shuffle=False)
174 |
175 |     df = pd.DataFrame()
176 |     df['group'] = [1, 1, 2, 2, 3, 3, 4, 4]
177 |     df['y'] = [0, 0, 1, 1, 0, 0, 1, 1]
178 |     df['x'] = [0, 1, 2, 3, 4, 5, 6, 7]
179 |
180 |     assert sgf.get_n_splits(df, df['y'], df['group']) == 2
181 |
182 |     splits = sgf.split(df, df['y'], df['group'])
183 |
184 |     train_index, test_index = next(splits)
185 |     assert np.array_equal(train_index, np.array([2, 3, 4, 5]))
186 |     assert np.array_equal(test_index, np.array([0, 1, 6, 7]))
187 |
188 |     train_index, test_index = next(splits)
189 |     assert np.array_equal(train_index, np.array([0, 1, 6, 7]))
190 |     assert np.array_equal(test_index, np.array([2, 3, 4, 5]))
191 |
192 |
193 | def test_stratified_group_kfold_multi_class_per_fold():
194 |     sgf = split.StratifiedGroupKFold(2, shuffle=False)
195 |
196 |     df = pd.DataFrame()
197 |     df['group'] = [1, 1, 2, 2, 3, 3, 4, 4]
198 |     df['y'] = [0, 1, 0, 1, 1, 1, 1, 1]
199 |     df['x'] = [0, 1, 2, 3, 4, 5, 6, 7]
200 |
201 |     assert sgf.get_n_splits(df, df['y'], df['group']) == 2
202 |
203 |     splits = sgf.split(df, df['y'], df['group'])
204 |
205 |     train_index, test_index = next(splits)
206 |     assert np.array_equal(train_index, np.array([0, 1, 4, 5]))
207 |     assert np.array_equal(test_index, np.array([2, 3, 6, 7]))
208 |
209 |     train_index, test_index = next(splits)
210 |     assert np.array_equal(train_index, np.array([2, 3, 6, 7]))
211 |     assert np.array_equal(test_index, np.array([0, 1, 4, 5]))
212 |
213 |
214 | def test_stratified_group_kfold_imbalanced_group():
215 |     sgf = split.StratifiedGroupKFold(2, shuffle=False)
216 |
217 |     df = pd.DataFrame()
218 |     df['group'] = [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4]
219 |     df['y'] = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
220 |     df['x'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
221 |
222 |     assert sgf.get_n_splits(df, df['y'], df['group']) == 2
223 |
224 |     splits = sgf.split(df, df['y'], df['group'])
225 |
226 |     train_index, test_index = next(splits)
227 |     assert np.array_equal(train_index, np.array([8, 9, 10, 11]))
228 |     assert np.array_equal(test_index, np.array([0, 1, 2, 3, 4, 5, 6, 7]))
229 |
230 |     train_index, test_index = next(splits)
231 |     assert np.array_equal(train_index, np.array([0, 1, 2, 3, 4, 5, 6, 7]))
232 |     assert np.array_equal(test_index, np.array([8, 9, 10, 11]))
233 |
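Take, Skip, and Nth as tested above are thin adapters that slice another validator's fold stream, which is what makes Take(2, KFold(5)) useful for cheap partial evaluation (see test_cv_partial_evaluate earlier). A hypothetical re-implementation of Take, just to show the shape:

import itertools

class Take:
    def __init__(self, n, base_validator):
        self.n = n
        self.base = base_validator

    def get_n_splits(self, X=None, y=None, groups=None):
        return min(self.n, self.base.get_n_splits(X, y, groups))

    def split(self, X, y=None, groups=None):
        # yield only the first n folds of the wrapped validator
        return itertools.islice(self.base.split(X, y, groups), self.n)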
--------------------------------------------------------------------------------