├── .github └── workflows │ ├── pythonpackage.yml │ ├── pythonpublish.yml │ └── weekly_test.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── conf.py ├── image │ └── mlflow.png ├── index.rst ├── installation.rst ├── make.bat ├── requirements.txt └── source │ ├── reference │ ├── ensemble.rst │ ├── experiment.rst │ ├── feature_store.rst │ ├── features.rst │ ├── hyper_parameters.rst │ ├── index.rst │ ├── util.rst │ └── validation.rst │ └── tutorial │ ├── experiment.rst │ ├── experiment_advanced.rst │ ├── feature_store.rst │ └── index.rst ├── examples ├── kaggle-bnp-paribas │ └── kaggle_bnp_paribas.py ├── kaggle-days-tokyo │ └── kaggle_days_tokyo.py ├── kaggle-plasticc │ └── kaggle_plasticc.py └── wine-quality │ └── wine-quality.py ├── nyaggle ├── __init__.py ├── ensemble │ ├── __init__.py │ ├── averaging.py │ ├── common.py │ └── stacking.py ├── environment.py ├── experiment │ ├── __init__.py │ ├── auto_prep.py │ ├── experiment.py │ ├── hyperparameter_tuner.py │ └── run.py ├── feature │ ├── __init__.py │ ├── base.py │ ├── category_encoder │ │ ├── __init__.py │ │ └── target_encoder.py │ ├── groupby.py │ └── nlp │ │ ├── __init__.py │ │ └── bert.py ├── feature_store │ ├── __init__.py │ └── feature_store.py ├── hyper_parameters │ ├── __init__.py │ ├── catboost.py │ ├── lightgbm.py │ ├── parameters.py │ └── xgboost.py ├── testing │ ├── __init__.py │ └── util.py ├── util │ ├── __init__.py │ ├── plot_importance.py │ ├── submission.py │ └── traits.py ├── validation │ ├── __init__.py │ ├── adversarial_validate.py │ ├── cross_validate.py │ └── split.py └── version.py ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── ensemble ├── test_averaging.py └── test_stacking.py ├── experiment ├── test_experiment.py ├── test_hyperparameter_tuner.py └── test_run.py ├── feature ├── category_encoder │ └── test_target_encoder.py ├── nlp │ └── test_bert.py └── test_groupby.py ├── feature_store └── test_feature_store.py └── validation ├── test_adversarial_validate.py ├── test_cross_validate.py └── test_split.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: ['3.8', '3.9', '3.10', '3.11'] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - uses: actions/cache@v1 21 | with: 22 | path: ~/.cache/pip 23 | key: > 24 | ${{ runner.os }}-pip- 25 | ${{ hashFiles('**/requirements.txt') }}- 26 | ${{ hashFiles('**/requirements-dev.txt') }} 27 | restore-keys: | 28 | ${{ runner.os }}-pip- 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt -r requirements-dev.txt 33 | - name: Install MeCab 34 | run: | 35 | sudo apt install mecab libmecab-dev mecab-ipadic-utf8 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | export PYTHONPATH=./ 45 | pytest --verbose --color=yes 46 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/weekly_test.yml: -------------------------------------------------------------------------------- 1 | name: weekly_test 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * 0" 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | strategy: 12 | max-parallel: 4 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11'] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - uses: actions/cache@v1 23 | with: 24 | path: ~/.cache/pip 25 | key: > 26 | ${{ runner.os }}-pip- 27 | ${{ hashFiles('**/requirements.txt') }}- 28 | ${{ hashFiles('**/requirements-dev.txt') }} 29 | restore-keys: | 30 | ${{ runner.os }}-pip- 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -r requirements.txt -r requirements-dev.txt 35 | - name: Install MeCab 36 | run: | 37 | sudo apt install mecab libmecab-dev mecab-ipadic-utf8 38 | - name: Lint with flake8 39 | run: | 40 | # stop the build if there are Python syntax errors or undefined names 41 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 42 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 43 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 44 | - name: Test with pytest 45 | run: | 46 | export PYTHONPATH=./ 47 | pytest --verbose --color=yes 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | nyaggle.egg-info/ 4 | .idea/ 5 | __pycache__/ 6 | .pytest_cache/ 7 | mlruns/ 8 | catboost_info 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 169 | #.idea/ 170 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements.txt 24 | - requirements: docs/requirements.txt 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 nyanp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
include *.txt README.md
recursive-include docs *.txt
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# nyaggle

![GitHub Actions CI Status](https://github.com/nyanp/nyaggle/workflows/Python%20package/badge.svg)
![GitHub Actions CI Status](https://github.com/nyanp/nyaggle/workflows/weekly_test/badge.svg)
![Python Versions](https://img.shields.io/pypi/pyversions/nyaggle.svg?logo=python&logoColor=white)
![Documentation Status](https://readthedocs.org/projects/nyaggle/badge/?version=latest)

[**Documentation**](https://nyaggle.readthedocs.io/en/latest/index.html)
| [**Slide (Japanese)**](https://docs.google.com/presentation/d/1jv3J7DISw8phZT4z9rqjM-azdrQ4L4wWJN5P-gKL6fA/edit?usp=sharing)

**nyaggle** is a utility library for Kaggle and offline competitions.
It is particularly focused on experiment tracking, feature engineering, and validation.

- **nyaggle.ensemble** - Averaging & stacking
- **nyaggle.experiment** - Experiment tracking
- **nyaggle.feature_store** - Lightweight feature storage using feather-format
- **nyaggle.features** - sklearn-compatible features
- **nyaggle.hyper_parameters** - Collection of GBDT hyper-parameters used in past Kaggle competitions
- **nyaggle.validation** - Adversarial validation & sklearn-compatible CV splitters

## Installation

You can install nyaggle via pip:

```bash
pip install nyaggle
```

## Examples

### Experiment Tracking

`run_experiment()` is a high-level API for experiments with cross validation.
It outputs parameters, metrics, out-of-fold predictions, test predictions,
feature importance, and submission.csv under the specified directory.

To enable mlflow tracking, include the optional `with_mlflow=True` parameter.

```python
from sklearn.model_selection import train_test_split

from nyaggle.experiment import run_experiment
from nyaggle.testing import make_classification_df

X, y = make_classification_df()
X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': 1000,
    'max_depth': 8
}

result = run_experiment(params,
                        X_train,
                        y_train,
                        X_test)

# You can get all the outputs needed in data science competitions with a single API call

print(result.test_prediction)  # Test prediction in numpy array
print(result.oof_prediction)   # Out-of-fold prediction in numpy array
print(result.models)           # Trained models for each fold
print(result.importance)       # Feature importance for each fold
print(result.metrics)          # Evaluation metrics for each fold
print(result.time)             # Elapsed time
print(result.submission_df)    # The output dataframe saved as submission.csv

# ...and all outputs have been saved under the logging directory (default: output/yyyymmdd_HHMMSS).
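
# If you prefer a different output location, run_experiment also accepts a
# logging_directory argument, where "{time}" is expanded to the current
# timestamp (a sketch based on the bundled example scripts; the path below
# is just an example):
result = run_experiment(params,
                        X_train,
                        y_train,
                        X_test,
                        logging_directory='output/my-experiment-{time}')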

# You can use it with mlflow and track your experiments through mlflow-ui
result = run_experiment(params,
                        X_train,
                        y_train,
                        X_test,
                        with_mlflow=True)
```

nyaggle also has a low-level API with an interface similar to
[mlflow tracking](https://www.mlflow.org/docs/latest/tracking.html) and [wandb](https://www.wandb.com/).

```python
from nyaggle.experiment import Experiment

with Experiment(logging_directory='./output/') as exp:
    # log key-value pair as a parameter
    exp.log_param('lr', 0.01)
    exp.log_param('optimizer', 'adam')

    # log text
    exp.log('blah blah blah')

    # log metric
    exp.log_metric('CV', 0.85)

    # log numpy ndarray, pandas dataframe and any artifacts
    exp.log_numpy('predicted', predicted)
    exp.log_dataframe('submission', sub, file_format='csv')
    exp.log_artifact('path-to-your-file')
```

### Feature Engineering

#### Target Encoding with K-Fold

```python
import pandas as pd

from sklearn.model_selection import KFold
from nyaggle.feature.category_encoder import TargetEncoder


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
all = pd.concat([train, test]).copy()

cat_cols = [c for c in train.columns if train[c].dtype == object]
target_col = 'y'

kf = KFold(5)

# Target encoding with K-fold
te = TargetEncoder(kf.split(train))

# use fit/fit_transform on training data, then apply transform to test data
train.loc[:, cat_cols] = te.fit_transform(train[cat_cols], train[target_col])
test.loc[:, cat_cols] = te.transform(test[cat_cols])

# ... or just call fit_transform on the concatenated data
all.loc[:, cat_cols] = te.fit_transform(all[cat_cols], all[target_col])
```

#### Text Vectorization using BERT

You need to install pytorch in your virtual environment to use BertSentenceVectorizer.
MeCab and mecab-python3 are also required if you use the Japanese BERT model.

```python
import pandas as pd
from nyaggle.feature.nlp import BertSentenceVectorizer


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
all = pd.concat([train, test]).copy()

text_cols = ['body']
target_col = 'y'
group_col = 'user_id'


# extract BERT-based sentence vectors
bv = BertSentenceVectorizer(text_columns=text_cols)

text_vector = bv.fit_transform(train)


# BERT + SVD, with cuda
bv = BertSentenceVectorizer(text_columns=text_cols, use_cuda=True, n_components=40)

text_vector_svd = bv.fit_transform(train)

# Japanese BERT
bv = BertSentenceVectorizer(text_columns=text_cols, lang='jp')

japanese_text_vector = bv.fit_transform(train)
```


### Adversarial Validation

```python
import pandas as pd
from nyaggle.validation import adversarial_validate

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

auc, importance = adversarial_validate(train, test, importance_type='gain')

```

### Validation Splitters

nyaggle provides a set of validation splitters that are compatible with sklearn.

```python
import pandas as pd
from sklearn.model_selection import cross_validate, KFold
from nyaggle.validation import TimeSeriesSplit, Take, Skip, Nth

train = pd.read_csv('train.csv', parse_dates=['dt'])

# time-series split
ts = TimeSeriesSplit(train['dt'])
ts.add_fold(train_interval=('2019-01-01', '2019-01-10'), test_interval=('2019-01-10', '2019-01-20'))
ts.add_fold(train_interval=('2019-01-06', '2019-01-15'), test_interval=('2019-01-15', '2019-01-25'))

cross_validate(..., cv=ts)

# take the first 3 folds out of 10
cross_validate(..., cv=Take(3, KFold(10)))

# skip the first 3 folds, and evaluate the remaining 7 folds
cross_validate(..., cv=Skip(3, KFold(10)))

# evaluate the 1st fold
cross_validate(..., cv=Nth(1, ts))

```

### Other Awesome Repositories

Here is a list of awesome repositories that provide general utility functions for data science competitions.
Please let me know if you have another one :)

- [jeongyoonlee/Kaggler](https://github.com/jeongyoonlee/Kaggler)
- [mxbi/mlcrate](https://github.com/mxbi/mlcrate)
- [analokmaus/kuma_utils](https://github.com/analokmaus/kuma_utils)
- [Far0n/kaggletils](https://github.com/Far0n/kaggletils)
- [MLWave/Kaggle-Ensemble-Guide](https://github.com/MLWave/Kaggle-Ensemble-Guide)
- [rushter/heamy](https://github.com/rushter/heamy)
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-------------------------------------------------------------------------------- /docs/conf.py: --------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
12 | # 13 | import os 14 | import sys 15 | import sphinx_rtd_theme 16 | 17 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 18 | 19 | import nyaggle 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'nyaggle' 24 | copyright = '2019, nyanp' 25 | author = 'nyanp' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = nyaggle.__version__ 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.napoleon', 39 | 'sphinx_autodoc_typehints', 40 | 'sphinx.ext.viewcode' 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # List of patterns, relative to source directory, that match files and 47 | # directories to ignore when looking for source files. 48 | # This pattern also affects html_static_path and html_extra_path. 49 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 50 | 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 57 | html_static_path = ['_static'] 58 | 59 | # The master toctree document. 60 | master_doc = 'index' 61 | 62 | # The theme to use for HTML and HTML Help pages. See the documentation for 63 | # a list of builtin themes. 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | # Theme options are theme-specific and customize the look and feel of a theme 67 | # further. For a list of options available for each theme, see the 68 | # documentation. 69 | 70 | # Add any paths that contain custom themes here, relative to this directory. 71 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 72 | -------------------------------------------------------------------------------- /docs/image/mlflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyanp/nyaggle/636532292d7ce3468cd47a3337bc50d620f0d23b/docs/image/mlflow.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. nyaggle documentation master file, created by 2 | sphinx-quickstart on Thu Dec 26 08:09:20 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to nyaggle's documentation! 7 | =================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | installation 14 | source/tutorial/index 15 | source/reference/index 16 | 17 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | =================================== 3 | 4 | You can install nyaggle via pip: 5 | 6 | 7 | .. 
code-block:: bash

    pip install nyaggle    # Install core parts of nyaggle


nyaggle does not install the following packages by default:

- catboost
- lightgbm
- xgboost
- mlflow
- pytorch


Modules that depend on these packages won't work until you also install them.
For example, ``run_experiment`` with the ``algorithm_type='xgb'``, ``'lgbm'`` and ``'cat'`` options won't work
until you also install xgboost, lightgbm and catboost respectively.

If you want to install everything required in nyaggle, this command can be used:

.. code-block:: bash

    pip install nyaggle[all]    # Install everything


If you use the :code:`lang=ja` option in :code:`BertSentenceVectorizer`,
you also need to install MeCab and the mecab-python3 package in your environment.
-------------------------------------------------------------------------------- /docs/make.bat: --------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
sphinx-autodoc-typehints
-------------------------------------------------------------------------------- /docs/source/reference/ensemble.rst: --------------------------------------------------------------------------------
nyaggle.ensemble
-----------------------

.. automodule:: nyaggle.ensemble
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/experiment.rst: --------------------------------------------------------------------------------
nyaggle.experiment
-----------------------

.. automodule:: nyaggle.experiment
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/feature_store.rst: --------------------------------------------------------------------------------
nyaggle.feature_store
---------------------------

.. automodule:: nyaggle.feature_store
    :members:
    :imported-members:

-------------------------------------------------------------------------------- /docs/source/reference/features.rst: --------------------------------------------------------------------------------
nyaggle.feature
----------------------------------------

.. automodule:: nyaggle.feature.category_encoder
    :members:
    :imported-members:
    :inherited-members:

.. automodule:: nyaggle.feature.nlp
    :members:
    :imported-members:


.. automodule:: nyaggle.feature.groupby
    :members:
-------------------------------------------------------------------------------- /docs/source/reference/hyper_parameters.rst: --------------------------------------------------------------------------------
nyaggle.hyper_parameters
--------------------------

.. automodule:: nyaggle.hyper_parameters
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/index.rst: --------------------------------------------------------------------------------
API Reference
=============

.. toctree::
    :maxdepth: 1

    ensemble
    experiment
    feature_store
    features
    hyper_parameters
    util
    validation
-------------------------------------------------------------------------------- /docs/source/reference/util.rst: --------------------------------------------------------------------------------
nyaggle.util
-----------------------

.. automodule:: nyaggle.util
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/reference/validation.rst: --------------------------------------------------------------------------------
nyaggle.validation
--------------------------

.. automodule:: nyaggle.validation
    :members:
    :imported-members:
-------------------------------------------------------------------------------- /docs/source/tutorial/experiment.rst: --------------------------------------------------------------------------------
Tracking your machine learning experiments with run_experiment
===============================================================

Concept
-------------------------------


In a typical tabular data competition, you probably evaluate your ideas repeatedly
by cross-validation, logging the parameters and results to track your experiments.

``nyaggle.experiment.run_experiment`` is an API for such situations.
If you are using LightGBM as your model, the code will be quite simple:


.. code-block:: python

    import pandas as pd
    from nyaggle.experiment import run_experiment

    INPUT_DIR = '../input'
    target_column = 'target'

    X_train = pd.read_csv(f'{INPUT_DIR}/train.csv')
    X_test = pd.read_csv(f'{INPUT_DIR}/test.csv')
    sample_df = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')  # OPTIONAL

    y = X_train[target_column]
    X_train = X_train.drop(target_column, axis=1)

    lightgbm_params = {
        'max_depth': 8
    }

    result = run_experiment(lightgbm_params,
                            X_train,
                            y,
                            X_test,
                            sample_submission=sample_df)


The ``run_experiment`` API performs cross-validation and stores artifacts to the logging directory.
You will see the output files stored as follows:

::

    output
    └── 20200130123456          # yyyymmddHHMMSS
        ├── params.json         # Parameters
        ├── metrics.json        # Metrics (single fold & overall CV score)
        ├── oof_prediction.npy  # Out-of-fold prediction
        ├── test_prediction.npy # Test prediction
        ├── 20200130123456.csv  # Submission csv file
        ├── importances.png     # Feature importance plot
        ├── log.txt             # Log file
        └── models              # The trained models for each fold
            ├── fold1
            ├── fold2
            ├── fold3
            ├── fold4
            └── fold5



.. hint::
    The default validation strategy is a 5-fold CV. You can change this behavior by passing the ``cv`` parameter
    (see the API reference for details).


If you want to use XGBoost, CatBoost or other sklearn estimators,
specify the type of algorithm:


.. code-block:: python

    # CatBoost
    catboost_params = {
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'depth': 8,
        'task_type': 'GPU'
    }
    result = run_experiment(catboost_params,
                            X_train,
                            y,
                            X_test,
                            algorithm_type='cat')

    # XGBoost
    xgboost_params = {
        'objective': 'reg:linear',
        'max_depth': 8
    }
    result = run_experiment(xgboost_params,
                            X_train,
                            y,
                            X_test,
                            algorithm_type='xgb')

    # sklearn estimator
    from sklearn.linear_model import Ridge
    ridge_params = {
        'alpha': 1.0
    }
    result = run_experiment(ridge_params,
                            X_train,
                            y,
                            X_test,
                            algorithm_type=Ridge)



.. hint::
    The parameters will be passed to the constructor of the sklearn API (e.g. ``LGBMClassifier``).


Collaborating with mlflow
------------------------------

If you want a GUI dashboard to manage your experiments, you can use ``run_experiment``
with mlflow by just setting ``with_mlflow=True`` (you need to install mlflow beforehand).


.. code-block:: python

    result = run_experiment(params,
                            X_train,
                            y,
                            X_test,
                            with_mlflow=True)



In the same directory as the executed script, run

.. code-block:: bash

    mlflow ui



and view it at http://localhost:5000 .
On this page, you can see the list of experiments with CV scores and parameters.


.. image:: ../../image/mlflow.png


If you want to customize the logging behavior, you can call ``run_experiment`` in
the context of an mlflow run. If there is an active run, ``run_experiment`` will use the
currently active run instead of creating a new one.


.. code-block:: python

    mlflow.set_tracking_uri('gs://ok-i-want-to-use-gcs')

    with mlflow.start_run(run_name='your-favorite-run-name'):
        mlflow.log_param('something-you-want-to-log', 42)

        result = run_experiment(params,
                                X_train,
                                y,
                                X_test,
                                with_mlflow=True)





What does ``run_experiment`` not do?
-------------------------------------

``run_experiment`` can be considered a cross-validation API with logging functionality.
Therefore, you have to choose model parameters and perform feature engineering yourself.
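One exception is hyperparameter search for GBDT models: as the bundled example
scripts do, you can pass ``with_auto_hpo=True`` to let ``run_experiment`` tune
the given parameters before training. A minimal sketch:

.. code-block:: python

    result = run_experiment(lightgbm_params,
                            X_train,
                            y,
                            X_test,
                            with_auto_hpo=True)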
-------------------------------------------------------------------------------- /docs/source/tutorial/experiment_advanced.rst: --------------------------------------------------------------------------------
Advanced usage
==============================

Using low-level experiment API
-------------------------------

While nyaggle provides ``run_experiment`` as a high-level API,
the ``Experiment`` class can be used as a low-level API that provides primitive functionality for logging experiments.

It is useful when you want to track something other than CV, or need to implement your own CV logic.


.. code-block:: python

    from nyaggle.experiment import Experiment


    with Experiment(logging_directory='./output/') as exp:
        # log key-value pair as a parameter
        exp.log_param('lr', 0.01)
        exp.log_param('optimizer', 'adam')

        # log text
        exp.log('blah blah blah')

        # log metric
        exp.log_metric('CV', 0.85)

        # log numpy ndarray
        exp.log_numpy('predicted', predicted)

        # log pandas dataframe
        exp.log_dataframe('submission', sub, file_format='csv')

        # log any file
        exp.log_artifact('path-to-your-file')


    # you can continue logging from an existing result
    with Experiment.continue_from('./output') as exp:
        ...


If you are familiar with mlflow tracking, you may notice that these APIs are similar to mlflow.
``Experiment`` can be treated as a thin wrapper around it if you pass ``with_mlflow=True`` to the constructor.


.. code-block:: python

    from nyaggle.experiment import Experiment

    with Experiment(logging_directory='./output/', with_mlflow=True) as exp:
        # log as you want, and you can see the result in the mlflow ui
        ...



Logging extra parameters to run_experiment
-------------------------------------------

By using the ``inherit_experiment`` parameter, you can mix any additional logging with the results ``run_experiment`` will create.
In the following example, nyaggle records the result of ``run_experiment`` under the same experiment as
the parameter and metric written outside of the function.

.. code-block:: python

    from nyaggle.experiment import Experiment, run_experiment

    with Experiment(logging_directory='./output/') as exp:

        exp.log_param('my extra param', 'bar')

        run_experiment(..., inherit_experiment=exp)

        exp.log_metric('my extra metrics', 0.999)


Tracking seed averaging experiments
---------------------------------------

If you train a bunch of models with different seeds to ensemble them, tracking the individual models with mlflow
will fill the GUI with their results and make them difficult to manage.
The nested-run functionality of mlflow is useful for displaying multiple models together as one result.

.. code-block:: python

    import mlflow
    from nyaggle.ensemble import averaging
    from nyaggle.util import make_submission_df

    mlflow.start_run()
    base_logging_dir = './seed-avg/'
    results = []

    for i in range(3):
        mlflow.start_run(nested=True)  # use nested runs to place each experiment under the parent run
        params['seed'] = i

        result = run_experiment(params,
                                X_train,
                                y_train,
                                X_test,
                                logging_directory=base_logging_dir + f'seed_{i}',
                                with_mlflow=True)
        results.append(result)

        mlflow.end_run()


    ensemble = averaging([result.test_prediction for result in results])
    sub = make_submission_df(ensemble.test_prediction, pd.read_csv('sample_submission.csv'))
    sub.to_csv('ensemble_sub.csv', index=False)
-------------------------------------------------------------------------------- /docs/source/tutorial/feature_store.rst: --------------------------------------------------------------------------------
Feature management using feature_store
=======================================

Concept
-------------------------------

Feature engineering is one of the most important parts of Kaggle.
If you do a lot of feature engineering, it is time-consuming to calculate
features each time you build a model.

Many skilled Kagglers save their features to local disk as binary (npy, pickle or feather)
to manage their features [1]_ [2]_ [3]_ [4]_.

``feature_store`` provides simple helper APIs for feature management.


.. code-block:: python

    import pandas as pd
    import nyaggle.feature_store as fs

    def make_feature_1(df: pd.DataFrame) -> pd.DataFrame:
        return ...

    def make_feature_2(df: pd.DataFrame) -> pd.DataFrame:
        return ...

    # feature 1
    feature_1 = make_feature_1(df)

    # feature 2
    feature_2 = make_feature_2(df)

    # the name can be str or int
    fs.save_feature(feature_1, "my_feature_1")
    fs.save_feature(feature_2, 42, '../my_favorite_feature_store')  # change the directory where the feature is saved


``save_feature`` stores the dataframe in feather format under the feature directory (``./features`` by default).
If you want to load a feature, just call ``load_feature`` with its name.

.. code-block:: python

    feature_1_restored = fs.load_feature("my_feature_1")
    feature_2_restored = fs.load_feature(42)


To merge all features into the main dataframe, call ``load_features`` with the main dataframe you want to merge with.


.. code-block:: python

    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    base_df = pd.concat([train, test])

    df_with_features = fs.load_features(base_df, ["my_feature_1", "magic_1", "leaky_1"])


.. note::
    ``load_features`` assumes that the stored feature values are concatenated in the
    order [train, test].


If you don't like separating your feature engineering code into an independent module,
the ``cached_feature`` decorator provides cache functionality.
A function with this decorator automatically saves its return value using ``save_feature`` on the first call,
and returns the result of ``load_feature`` on subsequent calls instead of executing the function body.

.. code-block:: python

    import pandas as pd
    import nyaggle.feature_store as fs

    @fs.cached_feature("my_feature_1")
    def make_feature_1(df: pd.DataFrame) -> pd.DataFrame:
        ...
        return result

    # saves automatically to features/my_feature_1.f
    feature_1 = make_feature_1(df)

    # loads from the saved binary instead of calling make_feature_1
    feature_1 = make_feature_1(df)


.. note::
    The function decorated by ``cached_feature`` must return a pandas DataFrame.


Use with ``run_experiment``
-------------------------------

If you pass the ``feature_list`` and ``feature_directory`` parameters to the ``run_experiment`` API,
nyaggle will combine the specified features with the given dataframe before performing cross-validation.

The list of features is logged as a parameter (and can of course be seen in the mlflow ui),
which makes your experiment cycle much simpler.

.. code-block:: python

    import pandas as pd
    import nyaggle.feature_store as fs
    from nyaggle.experiment import run_experiment

    run_experiment(params,
                   X_train,
                   y,
                   X_test,
                   feature_list=["my_feature_1", "magic_1", "leaky_1"],
                   feature_directory="../my_features")




Reference
-------------------------------


.. [1] https://www.kaggle.com/c/avito-demand-prediction/discussion/59881
.. [2] https://github.com/flowlight0/talkingdata-adtracking-fraud-detection
.. [3] https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/55581
.. [4] https://amalog.hateblo.jp/entry/kaggle-feature-management
-------------------------------------------------------------------------------- /docs/source/tutorial/index.rst: --------------------------------------------------------------------------------
Tutorial
========

..
toctree:: 5 | :maxdepth: 1 6 | 7 | experiment 8 | feature_store 9 | experiment_advanced -------------------------------------------------------------------------------- /examples/kaggle-bnp-paribas/kaggle_bnp_paribas.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | from sklearn.metrics import log_loss 5 | from nyaggle.experiment import run_experiment 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-g', '--gpu', action='store_true', default=False) 12 | args = parser.parse_args() 13 | 14 | X_train = pd.read_csv('train.csv', index_col='ID') 15 | X_test = pd.read_csv('test.csv', index_col='ID') 16 | y_train = X_train['target'] 17 | X_train = X_train.drop('target', axis=1) 18 | 19 | cat_params = { 20 | 'eval_metric': 'Logloss', 21 | 'loss_function': 'Logloss', 22 | 'metric_period': 10, 23 | 'depth': 8, 24 | 'task_type': 'GPU' if args.gpu else 'CPU' 25 | } 26 | 27 | result = run_experiment(cat_params, X_train, y_train, X_test, logging_directory='bnp-paribas-{time}', 28 | eval_func=log_loss, 29 | algorithm_type='cat', 30 | sample_submission=pd.read_csv('sample_submission.csv'), 31 | with_mlflow=True) 32 | -------------------------------------------------------------------------------- /examples/kaggle-days-tokyo/kaggle_days_tokyo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import mean_squared_error 3 | from sklearn.model_selection import GroupKFold 4 | 5 | from nyaggle.experiment import run_experiment 6 | from nyaggle.feature.category_encoder import TargetEncoder 7 | 8 | lgb_params = { 9 | "objective": "rmse", 10 | "n_estimators": 2000, 11 | "max_depth": 10, 12 | "colsample_bytree": 0.8 13 | } 14 | 15 | X_train = pd.read_csv('train.csv') 16 | X_test = pd.read_csv('test.csv') 17 | y_train = X_train['age'] 18 | X_train = X_train.drop('age', axis=1) 19 | 20 | te_cols = [c for c in X_train.columns if X_train[c].dtype.name == 'object' and c not in ['user_id', 'ts']] 21 | te = TargetEncoder(cv=GroupKFold(5), cols=te_cols, groups=X_train['user_id']).fit(X_train, y_train) 22 | 23 | 24 | def transform(te: TargetEncoder, df: pd.DataFrame, y: pd.Series): 25 | df.drop('ts', axis=1, inplace=True) 26 | 27 | if y is not None: 28 | df = te.fit_transform(df, y) 29 | y = y.groupby(df['user_id']).min() 30 | else: 31 | df = te.transform(df) 32 | 33 | df = df.groupby('user_id').agg(['mean', 'min', 'max']) 34 | df.columns = [e[0] + '_' + e[1] for e in df.columns] 35 | return df, y 36 | 37 | 38 | X_train, y_train = transform(te, X_train, y_train) 39 | X_test, _ = transform(te, X_test, None) 40 | 41 | # generated submission.csv scores 11.61445 in private LB (35th) 42 | run_experiment(logging_directory='baseline_kaggledays_tokyo', 43 | model_params=lgb_params, 44 | X_train=X_train, 45 | y=y_train, 46 | X_test=X_test, 47 | eval_func=mean_squared_error, 48 | type_of_target='continuous', 49 | if_exists='replace', 50 | with_auto_hpo=True, 51 | sample_submission=pd.read_csv('sample_submission.csv')) 52 | -------------------------------------------------------------------------------- /examples/kaggle-plasticc/kaggle_plasticc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.model_selection import StratifiedKFold 4 | from nyaggle.experiment import run_experiment 5 | 6 | 7 | meta = 
pd.read_csv('training_set_metadata.csv') 8 | 9 | is_extra = meta.hostgal_photoz > 0.0 10 | meta_extra = meta[is_extra] 11 | meta_inner = meta[~is_extra] 12 | 13 | lgb_param_extra = { 14 | 'objective': 'multiclass', 15 | 'metric': 'multi_logloss', 16 | 'num_class': 9 17 | } 18 | 19 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) 20 | 21 | result_extra = run_experiment(lgb_param_extra, 22 | meta_extra.drop('target', axis=1), 23 | meta_extra['target'], 24 | logging_directory='plasticc-{time}', 25 | cv=skf, 26 | type_of_target='multiclass') 27 | -------------------------------------------------------------------------------- /examples/wine-quality/wine-quality.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | 4 | from nyaggle.experiment import run_experiment 5 | 6 | 7 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 8 | 9 | data = pd.read_csv(csv_url, sep=';') 10 | 11 | X = data.drop('quality', axis=1) 12 | y = data['quality'] 13 | 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 15 | 16 | 17 | params = { 18 | 'max_depth': 4, 19 | 'n_estimators': 1000, 20 | 'reg_alpha': 0.1 21 | } 22 | 23 | result = run_experiment(params, 24 | X_train, 25 | y_train, 26 | X_test, 27 | './wine-quality-{time}', 28 | type_of_target='continuous', 29 | with_mlflow=True, 30 | with_auto_hpo=True) 31 | -------------------------------------------------------------------------------- /nyaggle/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.version import __version__ 2 | -------------------------------------------------------------------------------- /nyaggle/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.ensemble.averaging import averaging, averaging_opt 2 | from nyaggle.ensemble.stacking import stacking -------------------------------------------------------------------------------- /nyaggle/ensemble/averaging.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Tuple 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import scipy.stats as stats 6 | from scipy.optimize import minimize 7 | 8 | from nyaggle.ensemble.common import EnsembleResult 9 | 10 | 11 | def averaging(test_predictions: List[np.ndarray], 12 | oof_predictions: Optional[List[np.ndarray]] = None, 13 | y: Optional[pd.Series] = None, 14 | weights: Optional[List[float]] = None, 15 | eval_func: Optional[Callable] = None, 16 | rank_averaging: bool = False) -> EnsembleResult: 17 | """ 18 | Perform averaging on model predictions. 19 | 20 | Args: 21 | test_predictions: 22 | List of predicted values on test data. 23 | oof_predictions: 24 | List of predicted values on out-of-fold training data. 25 | y: 26 | Target value 27 | weights: 28 | Weights for each predictions 29 | eval_func: 30 | Evaluation metric used for calculating result score. Used only if ``oof_predictions`` and ``y`` are given. 31 | rank_averaging: 32 | If ``True``, predictions will be converted to rank before averaging. 33 | Returns: 34 | Namedtuple with following members 35 | 36 | * test_prediction: 37 | numpy array, Average prediction on test data. 38 | * oof_prediction: 39 | numpy array, Average prediction on Out-of-Fold validation data. 
``None`` if ``oof_predictions`` = ``None``. 40 | * score: 41 | float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``. 42 | """ 43 | if weights is None: 44 | weights = np.ones((len(test_predictions))) / len(test_predictions) 45 | 46 | if rank_averaging: 47 | test_predictions, oof_predictions = _to_rank(test_predictions, oof_predictions) 48 | 49 | def _weighted_average(predictions: List[np.ndarray], weights: List[float]): 50 | if len(predictions) != len(weights): 51 | raise ValueError('len(predictions) != len(weights)') 52 | average = np.zeros_like(predictions[0]) 53 | 54 | for i, weight in enumerate(weights): 55 | if predictions[i].shape != average.shape: 56 | raise ValueError('predictions[{}].shape != predictions[0].shape'.format(i)) 57 | average += predictions[i] * weight 58 | 59 | return average 60 | 61 | average_test = _weighted_average(test_predictions, weights) 62 | if oof_predictions is not None: 63 | average_oof = _weighted_average(oof_predictions, weights) 64 | score = eval_func(y, average_oof) if eval_func is not None else None 65 | else: 66 | average_oof = None 67 | score = None 68 | 69 | return EnsembleResult(average_test, average_oof, score) 70 | 71 | 72 | def averaging_opt(test_predictions: List[np.ndarray], 73 | oof_predictions: Optional[List[np.ndarray]], 74 | y: Optional[pd.Series], 75 | eval_func: Optional[Callable[[np.ndarray, np.ndarray], float]], 76 | higher_is_better: bool, 77 | weight_bounds: Tuple[float, float] = (0.0, 1.0), 78 | rank_averaging: bool = False, 79 | method: Optional[str] = None) -> EnsembleResult: 80 | """ 81 | Perform averaging with optimal weights using scipy.optimize. 82 | 83 | Args: 84 | test_predictions: 85 | List of predicted values on test data. 86 | oof_predictions: 87 | List of predicted values on out-of-fold training data. 88 | y: 89 | Target value 90 | eval_func: 91 | Evaluation metric f(y_true, y_pred) used for calculating result score. 92 | Used only if ``oof_predictions`` and ``y`` are given. 93 | higher_is_better: 94 | Determine the direction of optimize ``eval_func``. 95 | weight_bounds: 96 | Specify lower/upper bounds of each weight. 97 | rank_averaging: 98 | If ``True``, predictions will be converted to rank before averaging. 99 | method: 100 | Type of solver. If ``None``, SLSQP will be used. 101 | Returns: 102 | Namedtuple with following members 103 | 104 | * test_prediction: 105 | numpy array, Average prediction on test data. 106 | * oof_prediction: 107 | numpy array, Average prediction on Out-of-Fold validation data. ``None`` if ``oof_predictions`` = ``None``. 108 | * score: 109 | float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``. 
110 | """ 111 | 112 | def _minimize(weights): 113 | prediction = np.zeros_like(oof_predictions[0]) 114 | for weight, oof in zip(weights, oof_predictions): 115 | prediction += weight * oof 116 | oof_score = eval_func(y, prediction) 117 | 118 | return -oof_score if higher_is_better else oof_score 119 | 120 | weights = np.ones((len(test_predictions))) / len(test_predictions) 121 | 122 | if rank_averaging: 123 | test_predictions, oof_predictions = _to_rank(test_predictions, oof_predictions) 124 | 125 | method = method or 'SLSQP' 126 | 127 | if method in ['COBYLA', 'SLSQP', 'trust-constr']: 128 | cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)}) 129 | else: 130 | cons = None 131 | 132 | bounds = [weight_bounds] * len(test_predictions) 133 | 134 | result = minimize(_minimize, weights, method=method, constraints=cons, bounds=bounds) 135 | 136 | return averaging(test_predictions, oof_predictions, y, result['x'], eval_func) 137 | 138 | 139 | def _to_rank(test_predictions: List[np.ndarray], oof_predictions: Optional[List[np.ndarray]]): 140 | if oof_predictions is not None: 141 | oof_predictions = [stats.rankdata(oof) / len(oof) for oof in oof_predictions] 142 | test_predictions = [stats.rankdata(test) / len(test) for test in test_predictions] 143 | 144 | return test_predictions, oof_predictions 145 | -------------------------------------------------------------------------------- /nyaggle/ensemble/common.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | EnsembleResult = namedtuple('EnsembleResult', ['test_prediction', 'oof_prediction', 'score']) 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /nyaggle/ensemble/stacking.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, List, Union, Optional 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sklearn.utils.multiclass as multiclass 6 | from category_encoders.utils import convert_input, convert_input_vector 7 | from sklearn.base import BaseEstimator 8 | from sklearn.linear_model import LogisticRegression, Ridge 9 | from sklearn.model_selection import BaseCrossValidator, GridSearchCV 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.preprocessing import StandardScaler 12 | 13 | from nyaggle.ensemble.common import EnsembleResult 14 | from nyaggle.validation import cross_validate 15 | 16 | 17 | def stacking(test_predictions: List[np.ndarray], 18 | oof_predictions: List[np.ndarray], 19 | y: pd.Series, 20 | estimator: Optional[BaseEstimator] = None, 21 | cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, 22 | groups: Optional[pd.Series] = None, 23 | type_of_target: str = 'auto', 24 | eval_func: Optional[Callable] = None) -> EnsembleResult: 25 | """ 26 | Perform stacking on predictions. 27 | 28 | Args: 29 | test_predictions: 30 | List of predicted values on test data. 31 | oof_predictions: 32 | List of predicted values on out-of-fold training data. 33 | y: 34 | Target value 35 | estimator: 36 | Estimator used for the 2nd-level model. 37 | If ``None``, the default estimator (auto-tuned linear model) will be used. 38 | cv: 39 | int, cross-validation generator or an iterable which determines the cross-validation splitting strategy. 
40 | 
41 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
42 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
43 |             - CV splitter (the instance of ``BaseCrossValidator``),
44 |             - An iterable yielding (train, test) splits as arrays of indices.
45 |         groups:
46 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
47 |         type_of_target:
48 |             The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
49 |             Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
50 |         eval_func:
51 |             Evaluation metric used for calculating result score. Used only if ``oof_predictions`` and ``y`` are given.
52 |     Returns:
53 |         Namedtuple with the following members
54 | 
55 |         * test_prediction:
56 |             numpy array, Average prediction on test data.
57 |         * oof_prediction:
58 |             numpy array, Average prediction on Out-of-Fold validation data. ``None`` if ``oof_predictions`` = ``None``.
59 |         * score:
60 |             float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``.
61 |     """
62 |     assert len(oof_predictions) == len(test_predictions), "Number of oof and test predictions should be the same"
63 | 
64 |     def _stack(predictions):
65 |         if predictions[0].ndim == 1:
66 |             predictions = [p.reshape(len(p), -1) for p in predictions]
67 |         return np.hstack(predictions)
68 | 
69 |     X_train = convert_input(_stack(oof_predictions))
70 |     y = convert_input_vector(y, X_train.index)
71 |     X_test = convert_input(_stack(test_predictions))
72 | 
73 |     assert len(X_train) == len(y)
74 | 
75 |     if type_of_target == 'auto':
76 |         type_of_target = multiclass.type_of_target(y)
77 | 
78 |     if estimator is None:
79 |         # if estimator is None, tuned linear estimator is used
80 |         if type_of_target == 'continuous':
81 |             estimator = make_pipeline(StandardScaler(), Ridge(random_state=0))
82 |             param_grid = {
83 |                 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10],
84 |             }
85 |         else:
86 |             estimator = LogisticRegression(random_state=0, solver='liblinear')
87 |             param_grid = {
88 |                 'penalty': ['l1', 'l2'],
89 |                 'C': [0.001, 0.01, 0.1, 1, 10],
90 |             }
91 |         grid_search = GridSearchCV(estimator, param_grid, cv=cv)
92 |         grid_search.fit(X_train, y, groups=groups)
93 |         estimator = grid_search.best_estimator_
94 | 
95 |     result = cross_validate(estimator, X_train, y, X_test, cv=cv, groups=groups, eval_func=eval_func, type_of_target=type_of_target)
96 |     score = result.scores[-1] if result.scores else None
97 | 
98 |     return EnsembleResult(result.test_prediction, result.oof_prediction, score)
99 | 
--------------------------------------------------------------------------------
/nyaggle/environment.py:
--------------------------------------------------------------------------------
1 | # pytorch
2 | 
3 | try:
4 |     import torch
5 |     _has_torch = True
6 | except ImportError:
7 |     _has_torch = False
8 | 
9 | 
10 | def requires_torch():
11 |     if not _has_torch:
12 |         raise ImportError('You need to install pytorch before using this API.')
13 | 
14 | 
15 | # mlflow
16 | 
17 | try:
18 |     import mlflow
19 |     _has_mlflow = True
20 | except ImportError:
21 |     _has_mlflow = False
22 | 
23 | 
24 | def requires_mlflow():
25 |     if not _has_mlflow:
26 |         raise ImportError('You need to install mlflow before using this API.')
27 | 
28 | 
29 | # lightgbm
30 | 
31 | 
32 | try:
33 |     import lightgbm
34 |     _has_lightgbm = True
35 | except ImportError:
36 |     _has_lightgbm = False
37 | 
38 | 
39 | def requires_lightgbm():
40 |     if not _has_lightgbm:
41 |         raise ImportError('You need to install lightgbm before using this API.')
42 | 
43 | 
44 | # catboost
45 | 
46 | 
47 | try:
48 |     import catboost
49 |     _has_catboost = True
50 |     # TODO check catboost version >= 0.17
51 | except ImportError:
52 |     _has_catboost = False
53 | 
54 | 
55 | def requires_catboost():
56 |     if not _has_catboost:
57 |         raise ImportError('You need to install catboost before using this API.')
58 | 
59 | 
60 | # xgboost
61 | 
62 | 
63 | try:
64 |     import xgboost
65 |     _has_xgboost = True
66 | except ImportError:
67 |     _has_xgboost = False
68 | 
69 | 
70 | def requires_xgboost():
71 |     if not _has_xgboost:
72 |         raise ImportError('You need to install xgboost before using this API.')
73 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/__init__.py:
--------------------------------------------------------------------------------
1 | from nyaggle.experiment.experiment import Experiment, add_leaderboard_score
2 | from nyaggle.experiment.run import autoprep_gbdt, run_experiment, find_best_lgbm_parameter
3 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/auto_prep.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | from pandas.api.types import is_integer_dtype, is_categorical_dtype
6 | from sklearn.preprocessing import LabelEncoder
7 | 
8 | 
9 | def autoprep_gbdt(algorithm_type: str, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
10 |                   categorical_feature_to_treat: Optional[List[str]] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
11 |     if categorical_feature_to_treat is None:
12 |         categorical_feature_to_treat = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
13 | 
14 |     # LightGBM:
15 |     # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
16 |     # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
17 |     #
18 |     # CatBoost:
19 |     # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
20 |     # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
21 |     #
22 |     # XGBoost:
23 |     # All categorical columns should be encoded beforehand.
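    #
    # A minimal usage sketch (hedged; 'city' is a hypothetical object column,
    # and 'xgb' exercises the label-encoding branch below):
    #
    #   X_tr = pd.DataFrame({'city': ['tokyo', 'osaka'], 'v': [1, 2]})
    #   X_te = pd.DataFrame({'city': ['tokyo', None], 'v': [3, 4]})
    #   X_tr, X_te = autoprep_gbdt('xgb', X_tr, X_te)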
24 | 
25 |     if algorithm_type == 'lgbm':
26 |         # LightGBM can handle categorical dtype natively
27 |         categorical_feature_to_treat = [c for c in categorical_feature_to_treat if not is_categorical_dtype(X_train[c])]
28 | 
29 |     if algorithm_type == 'cat' and len(categorical_feature_to_treat) > 0:
30 |         X_train = X_train.copy()
31 |         X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
32 |         for c in categorical_feature_to_treat:
33 |             X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
34 | 
35 |     if algorithm_type in ('lgbm', 'xgb') and len(categorical_feature_to_treat) > 0:
36 |         assert X_test is not None, "X_test is required when categorical variables are label-encoded"
37 |         X_train = X_train.copy()
38 |         X_test = X_test.copy()
39 | 
40 |         for c in categorical_feature_to_treat:
41 |             X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
42 |             le = LabelEncoder()
43 |             concat = np.concatenate([X_train[c].values, X_test[c].values])
44 |             concat = le.fit_transform(concat)
45 |             X_train[c] = concat[:len(X_train)]
46 |             X_test[c] = concat[len(X_train):]
47 | 
48 |     return X_train, X_test
49 | 
50 | 
51 | def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]:
52 |     if is_categorical_dtype(strain):
53 |         return strain.cat.codes, stest.cat.codes
54 |     elif is_integer_dtype(strain.dtype):
55 |         fillval = min(strain.min(), stest.min()) - 1
56 |         return strain.fillna(fillval), stest.fillna(fillval)
57 |     else:
58 |         return strain.astype(str), stest.astype(str)
59 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/experiment.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numbers
3 | import os
4 | import shutil
5 | import uuid
6 | import warnings
7 | from logging import getLogger, FileHandler, DEBUG, Logger
8 | from typing import Dict, Optional
9 | 
10 | import numpy as np
11 | import pandas as pd
12 | 
13 | from nyaggle.environment import requires_mlflow
14 | 
15 | MLFLOW_KEY_LENGTH_LIMIT = 250
16 | MLFLOW_VALUE_LENGTH_LIMIT = 250
17 | 
18 | 
19 | def _sanitize_mlflow_param(param, limit):
20 |     if len(str(param)) > limit:
21 |         warnings.warn('Length of param exceeds limit {}. It will be truncated. value: {}'.format(limit, param))
22 |         param = str(param)[:limit]
23 |     return param
24 | 
25 | 
26 | def _check_directory(directory: str, if_exists: str) -> str:
27 |     if os.path.exists(directory):
28 |         if if_exists == 'error':
29 |             raise ValueError('directory {} already exists.'.format(directory))
30 |         elif if_exists == 'replace':
31 |             warnings.warn(
32 |                 'directory {} already exists. It will be replaced by the new result'.format(directory))
33 | 
34 |             existing_run_id = _try_to_get_existing_mlflow_run_id(directory)
35 |             if existing_run_id is not None:
36 |                 requires_mlflow()
37 |                 import mlflow
38 |                 mlflow.delete_run(existing_run_id)
39 | 
40 |             shutil.rmtree(directory, ignore_errors=True)
41 |         elif if_exists == 'rename':
42 |             postfix_index = 1
43 | 
44 |             while os.path.exists(directory + '_' + str(postfix_index)):
45 |                 postfix_index += 1
46 | 
47 |             directory += '_' + str(postfix_index)
48 |             warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
49 |     return directory
50 | 
51 | 
52 | def _sanitize(v):
53 |     return v if isinstance(v, numbers.Number) else str(v)
54 | 
55 | 
56 | def _try_to_get_existing_mlflow_run_id(logging_directory: str) -> Optional[str]:
57 |     mlflow_path = os.path.join(logging_directory, 'mlflow.json')
58 |     if os.path.exists(mlflow_path):
59 |         with open(mlflow_path, 'r') as f:
60 |             mlflow_metadata = json.load(f)
61 |             return mlflow_metadata['run_id']
62 |     return None
63 | 
64 | 
65 | class Experiment(object):
66 |     """Minimal experiment logger for Kaggle
67 | 
68 |     This module provides minimal functionality for tracking experiments.
69 |     The output files are laid out as follows:
70 | 
71 |     .. code-block:: none
72 | 
73 |         <logging_directory>/
74 |             log.txt       <== Output of log
75 |             metrics.json  <== Output of log_metric(s), format: name,score
76 |             params.json   <== Output of log_param(s), format: key,value
77 |             mlflow.json   <== mlflow's run_id, experiment_id and artifact_uri (logged if with_mlflow=True)
78 | 
79 | 
80 |     You can add numpy array and pandas dataframe under the directory through ``log_numpy`` and ``log_dataframe``.
81 | 
82 |     Args:
83 |         logging_directory:
84 |             Path to directory where output is stored.
85 |         custom_logger:
86 |             A custom logger to be used instead of default logger.
87 |         with_mlflow:
88 |             If True, `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`_ is used.
89 |             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
90 |             Note that all output files are located in both ``logging_directory`` and
91 |             mlflow's directory (``mlruns`` by default).
92 |         if_exists:
93 |             How to behave if the logging directory already exists.
94 | 
95 |             - error: Raise a ValueError.
96 |             - replace: Delete logging directory before logging.
97 |             - append: Append to existing experiment.
98 |             - rename: Rename current directory by adding "_1", "_2"... suffix
99 |     Example:
100 |         >>> import numpy as np
101 |         >>> import pandas as pd
102 |         >>> from nyaggle.experiment import Experiment
103 |         >>>
104 |         >>> with Experiment(logging_directory='./output/') as exp:
105 |         >>>     # log key-value pair as a parameter
106 |         >>>     exp.log_param('lr', 0.01)
107 |         >>>     exp.log_param('optimizer', 'adam')
108 |         >>>
109 |         >>>     # log text
110 |         >>>     exp.log('blah blah blah')
111 |         >>>
112 |         >>>     # log metric
113 |         >>>     exp.log_metric('CV', 0.85)
114 |         >>>
115 |         >>>     # log dictionary with flattening keys
116 |         >>>     exp.log_dict('params', {'X': 3, 'Y': {'Z': 'foobar'}})
117 |         >>>
118 |         >>>     # log numpy ndarray, pandas dataframe and any artifacts
119 |         >>>     exp.log_numpy('predicted', np.zeros(1))
120 |         >>>     exp.log_dataframe('submission', pd.DataFrame(), file_format='csv')
121 |         >>>     exp.log_artifact('path-to-your-file')
122 |     """
123 | 
124 |     def __init__(self,
125 |                  logging_directory: str,
126 |                  custom_logger: Optional[Logger] = None,
127 |                  with_mlflow: bool = False,
128 |                  if_exists: str = 'error'
129 |                  ):
130 |         logging_directory = _check_directory(logging_directory, if_exists)
131 |         os.makedirs(logging_directory, exist_ok=True)
132 | 
133 |         self.logging_directory = logging_directory
134 |         self.with_mlflow = with_mlflow
135 | 
136 |         if custom_logger is not None:
137 |             self.logger = custom_logger
138 |             self.is_custom = True
139 |         else:
140 |             self.logger = getLogger(str(uuid.uuid4()))
141 |             self.log_path = os.path.join(logging_directory, 'log.txt')
142 |             self.logger.addHandler(FileHandler(self.log_path))
143 |             self.logger.setLevel(DEBUG)
144 |             self.is_custom = False
145 |         self.metrics = self._load_dict('metrics.json')
146 |         self.params = self._load_dict('params.json')
147 |         self.inherit_existing_run = False
148 | 
149 |         if self.with_mlflow:
150 |             requires_mlflow()
151 |             self.mlflow_run_id = _try_to_get_existing_mlflow_run_id(logging_directory)
152 |             if self.mlflow_run_id is not None:
153 |                 self.mlflow_run_name = None
154 |             else:
155 |                 self.mlflow_run_name = logging_directory
156 | 
157 |     def __enter__(self):
158 |         self.start()
159 |         return self
160 | 
161 |     def __exit__(self, ex_type, ex_value, trace):
162 |         self.stop()
163 | 
164 |     @classmethod
165 |     def continue_from(cls, logging_directory: str, with_mlflow: bool = False):
166 |         return cls(logging_directory=logging_directory, if_exists='append', with_mlflow=with_mlflow)
167 | 
168 |     def start(self):
169 |         """
170 |         Start a new experiment.
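
        A hedged sketch of the manual lifecycle (equivalent to using the ``with`` block shown above):

        >>> exp = Experiment('./manual_output/')
        >>> exp.start()
        >>> exp.log_metric('CV', 0.9)
        >>> exp.stop()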
171 |         """
172 |         if self.with_mlflow:
173 |             import mlflow
174 | 
175 |             if mlflow.active_run() is not None:
176 |                 active_run = mlflow.active_run()
177 |                 self.inherit_existing_run = True
178 |             else:
179 |                 active_run = mlflow.start_run(run_name=self.mlflow_run_name, run_id=self.mlflow_run_id)
180 |             mlflow_metadata = {
181 |                 'artifact_uri': active_run.info.artifact_uri,
182 |                 'experiment_id': active_run.info.experiment_id,
183 |                 'run_id': active_run.info.run_id
184 |             }
185 |             self.mlflow_run_id = active_run.info.run_id
186 |             with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
187 |                 json.dump(mlflow_metadata, f, indent=4)
188 | 
189 |     def _load_dict(self, filename: str) -> Dict:
190 |         try:
191 |             path = os.path.join(self.logging_directory, filename)
192 |             with open(path, 'r') as f:
193 |                 return json.load(f)
194 |         except IOError:
195 |             self.logger.warning('failed to load file: {}'.format(filename))
196 |             return {}
197 | 
198 |     def _save_dict(self, obj: Dict, filename: str):
199 |         try:
200 |             path = os.path.join(self.logging_directory, filename)
201 |             with open(path, 'w') as f:
202 |                 json.dump(obj, f, indent=2)
203 |         except IOError:
204 |             self.logger.warning('failed to save file: {}'.format(filename))
205 | 
206 |     def stop(self):
207 |         """
208 |         Stop current experiment.
209 |         """
210 |         self._save_dict(self.metrics, 'metrics.json')
211 |         self._save_dict(self.params, 'params.json')
212 | 
213 |         if not self.is_custom:
214 |             for h in self.logger.handlers:
215 |                 h.close()
216 | 
217 |         if self.with_mlflow:
218 |             import mlflow
219 |             from mlflow.exceptions import MlflowException
220 | 
221 |             try:
222 |                 mlflow.log_artifact(self.log_path)
223 |                 mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
224 |                 mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))
225 |             except MlflowException as e:
226 |                 warnings.warn('Error in saving artifacts to mlflow. The result may not be saved: {}'.format(e))
227 |             if not self.inherit_existing_run:
228 |                 mlflow.end_run()
229 | 
230 |     def get_logger(self) -> Logger:
231 |         """
232 |         Get logger used in this experiment.
233 | 
234 |         Returns:
235 |             logger object
236 |         """
237 |         return self.logger
238 | 
239 |     def get_run(self):
240 |         """
241 |         Get mlflow's currently active run, or None if ``with_mlflow = False``.
242 | 
243 |         Returns:
244 |             active Run
245 |         """
246 |         if not self.with_mlflow:
247 |             return None
248 | 
249 |         import mlflow
250 |         return mlflow.active_run()
251 | 
252 |     def log(self, text: str):
253 |         """
254 |         Logs a message on the logger for the experiment.
255 | 
256 |         Args:
257 |             text:
258 |                 The message to be written.
259 |         """
260 |         self.logger.info(text)
261 | 
262 |     def log_param(self, key, value):
263 |         """
264 |         Logs a key-value pair for the experiment.
265 | 
266 |         Args:
267 |             key: parameter name
268 |             value: parameter value
269 |         """
270 |         key = _sanitize(key)
271 |         value = _sanitize(value)
272 |         self.params[key] = value
273 | 
274 |         if self.with_mlflow:
275 |             import mlflow
276 |             from mlflow.exceptions import MlflowException
277 | 
278 |             key_mlflow = _sanitize_mlflow_param(key, MLFLOW_KEY_LENGTH_LIMIT)
279 |             value_mlflow = _sanitize_mlflow_param(value, MLFLOW_VALUE_LENGTH_LIMIT)
280 | 
281 |             try:
282 |                 mlflow.log_param(key_mlflow, value_mlflow)
283 |             except MlflowException as e:
284 |                 warnings.warn('Error in logging parameter {} to mlflow. Skipped. {}'.format(key, e))
285 | 
286 |     def log_params(self, params: Dict):
287 |         """
288 |         Logs a batch of params for the experiment.
289 | 
290 |         Args:
291 |             params: dictionary of parameters
292 |         """
293 |         for k, v in params.items():
294 |             self.log_param(k, v)
295 | 
296 |     def log_dict(self, name: str, value: Dict, separator: str = '.'):
297 |         """
298 |         Logs a dictionary as parameters in flattened format.
299 | 
300 |         Args:
301 |             name: Parameter name
302 |             value: Parameter value
303 |             separator: Separating character used to concatenate keys
304 |         Examples:
305 |             >>> with Experiment('./') as e:
306 |             >>>     e.log_dict('a', {'b': 1, 'c': 'd'})
307 |             >>> print(e.params)
308 |             { 'a.b': 1, 'a.c': 'd' }
309 |         """
310 | 
311 |         if value is None:
312 |             self.log_param(name, value)
313 |             return
314 | 
315 |         def _flatten(d: Dict, prefix: str, separator: str) -> Dict:
316 |             items = []
317 |             for k, v in d.items():
318 |                 child_key = prefix + separator + str(k) if prefix else str(k)
319 |                 if isinstance(v, Dict) and v:
320 |                     items.extend(_flatten(v, child_key, separator).items())
321 |                 else:
322 |                     items.append((child_key, v))
323 |             return dict(items)
324 | 
325 |         value = _flatten(value, name, separator)
326 |         self.log_params(value)
327 | 
328 |     def log_metric(self, name: str, score: float):
329 |         """
330 |         Log a metric under the logging directory.
331 | 
332 |         Args:
333 |             name:
334 |                 Metric name.
335 |             score:
336 |                 Metric value.
337 |         """
338 |         name = _sanitize(name)
339 |         score = _sanitize(score)
340 |         self.metrics[name] = score
341 | 
342 |         if self.with_mlflow:
343 |             import mlflow
344 |             from mlflow.exceptions import MlflowException
345 | 
346 |             try:
347 |                 mlflow.log_metric(name, score)
348 |             except MlflowException as e:
349 |                 warnings.warn('Error in logging metric {} to mlflow. Skipped. {}'.format(name, e))
350 | 
351 |     def log_metrics(self, metrics: Dict):
352 |         """
353 |         Log a batch of metrics under the logging directory.
354 | 
355 |         Args:
356 |             metrics: dictionary of metrics.
357 |         """
358 |         for k, v in metrics.items():
359 |             self.log_metric(k, v)
360 | 
361 |     def log_numpy(self, name: str, array: np.ndarray):
362 |         """
363 |         Log a numpy ndarray under the logging directory.
364 | 
365 |         Args:
366 |             name:
367 |                 Name of the file. A .npy extension will be appended to the file name if it does not already have one.
368 |             array:
369 |                 Array data to be saved.
370 |         """
371 |         path = os.path.join(self.logging_directory, name)
372 |         np.save(path, array)
373 | 
374 |         if self.with_mlflow:
375 |             import mlflow
376 |             mlflow.log_artifact(path + '.npy')
377 | 
378 |     def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'):
379 |         """
380 |         Log a pandas dataframe under the logging directory.
381 | 
382 |         Args:
383 |             name:
384 |                 Name of the file. A ``.f`` or ``.csv`` extension will be appended to the file name
385 |                 if it does not already have one.
386 |             df:
387 |                 A dataframe to be saved.
388 |             file_format:
389 |                 The format of the output file. ``csv`` and ``feather`` are supported.
390 |         """
391 |         path = os.path.join(self.logging_directory, name)
392 |         if file_format == 'feather':
393 |             if not path.endswith('.f'):
394 |                 path += '.f'
395 |             df.to_feather(path)
396 |         elif file_format == 'csv':
397 |             if not path.endswith('.csv'):
398 |                 path += '.csv'
399 |             df.to_csv(path, index=False)
400 |         else:
401 |             raise RuntimeError('format not supported')
402 | 
403 |         if self.with_mlflow:
404 |             import mlflow
405 |             mlflow.log_artifact(path)
406 | 
407 |     def log_artifact(self, src_file_path: str):
408 |         """
409 |         Make a copy of the file under the logging directory.
410 | 
411 |         Args:
412 |             src_file_path:
413 |                 Path of the file. If path is not a child of the logging directory, the file will be copied.
414 |                 If ``with_mlflow`` is True, ``mlflow.log_artifact`` will be called (then another copy will be made).
415 |         """
416 |         logging_path = os.path.abspath(self.logging_directory)
417 |         src_file_path = os.path.abspath(src_file_path)
418 | 
419 |         if os.path.commonpath([logging_path]) != os.path.commonpath([logging_path, src_file_path]):
420 |             # the file is outside the logging directory; copy it there (shutil keeps the base name)
421 |             shutil.copy(src_file_path, self.logging_directory)
422 | 
423 |         if self.with_mlflow:
424 |             import mlflow
425 |             mlflow.log_artifact(src_file_path)
426 | 
427 | 
428 | def add_leaderboard_score(logging_directory: str, score: float):
429 |     """
430 |     Record leaderboard score to the existing experiment directory.
431 | 
432 |     Args:
433 |         logging_directory:
434 |             The experiment directory to which the score is added
435 |         score:
436 |             Leaderboard score
437 |     """
438 |     with Experiment.continue_from(logging_directory) as e:
439 |         e.log_metric('LB', score)
440 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/hyperparameter_tuner.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import warnings
3 | from typing import Dict, Iterable, Optional, Union
4 | 
5 | import optuna.integration.lightgbm as optuna_lgb
6 | import pandas as pd
7 | import sklearn.utils.multiclass as multiclass
8 | from sklearn.model_selection import BaseCrossValidator
9 | 
10 | from nyaggle.validation.split import check_cv
11 | 
12 | 
13 | def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
14 |                              cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
15 |                              groups: Optional[pd.Series] = None,
16 |                              time_budget: Optional[int] = None,
17 |                              type_of_target: str = 'auto') -> Dict:
18 |     """
19 |     Search hyperparameters for LightGBM using optuna.
20 | 
21 |     Args:
22 |         base_param:
23 |             Base parameters passed to lgb.train.
24 |         X:
25 |             Training data.
26 |         y:
27 |             Target
28 |         cv:
29 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
30 |         groups:
31 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
32 |         time_budget:
33 |             Time budget for tuning (in seconds).
34 |         type_of_target:
35 |             The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
36 |             Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
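
    Example:
        A hedged sketch on a tiny synthetic regression task (``num_leaves`` is just
        an illustrative seed parameter; actual tuning may take a while):

        >>> import pandas as pd
        >>> X = pd.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> y = pd.Series(range(100), dtype=float)
        >>> best_params = find_best_lgbm_parameter({'num_leaves': 31}, X, y, time_budget=60)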
37 | 
38 |     Returns:
39 |         The best parameters found
40 |     """
41 |     cv = check_cv(cv, y)
42 | 
43 |     if type_of_target == 'auto':
44 |         type_of_target = multiclass.type_of_target(y)
45 | 
46 |     train_index, test_index = next(cv.split(X, y, groups))
47 | 
48 |     dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
49 |     dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])
50 | 
51 |     params = copy.deepcopy(base_param)
52 |     if 'early_stopping_rounds' not in params:
53 |         params['early_stopping_rounds'] = 100
54 | 
55 |     if params.get('feature_pre_filter'):
56 |         warnings.warn("feature_pre_filter will be set to False to tune min_data_in_leaf.")
57 |         params['feature_pre_filter'] = False
58 | 
59 |     if not any([p in params for p in ('num_iterations', 'num_iteration',
60 |                                       'num_trees', 'num_tree',
61 |                                       'num_rounds', 'num_round')]):
62 |         params['num_iterations'] = params.get('n_estimators', 10000)
63 | 
64 |     if 'objective' not in params:
65 |         tot_to_objective = {
66 |             'binary': 'binary',
67 |             'continuous': 'regression',
68 |             'multiclass': 'multiclass'
69 |         }
70 |         params['objective'] = tot_to_objective[type_of_target]
71 | 
72 |     if 'metric' not in params and 'objective' in params:
73 |         if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse', 'l2_root',
74 |                                    'root_mean_squared_error', 'rmse']:
75 |             params['metric'] = 'l2'
76 |         if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
77 |             params['metric'] = 'l1'
78 |         if params['objective'] in ['binary']:
79 |             params['metric'] = 'binary_logloss'
80 |         if params['objective'] in ['multiclass']:
81 |             params['metric'] = 'multi_logloss'
82 | 
83 |     if not any([p in params for p in ('verbose', 'verbosity')]):
84 |         params['verbosity'] = -1
85 | 
86 |     model = optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
87 |                              time_budget=time_budget)
88 | 
89 |     return model.params
90 | 
--------------------------------------------------------------------------------
/nyaggle/experiment/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import time
4 | from collections import namedtuple
5 | from datetime import datetime
6 | from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
7 | 
8 | import pandas as pd
9 | import sklearn.utils.multiclass as multiclass
10 | from sklearn.base import BaseEstimator
11 | from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
12 | from sklearn.model_selection import BaseCrossValidator
13 | 
14 | from nyaggle.environment import requires_catboost, requires_lightgbm, requires_xgboost
15 | from nyaggle.experiment.auto_prep import autoprep_gbdt
16 | from nyaggle.experiment.experiment import Experiment
17 | from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter
18 | from nyaggle.feature_store import load_features
19 | from nyaggle.util import plot_importance, is_gbdt_instance, make_submission_df
20 | from nyaggle.validation.cross_validate import cross_validate
21 | from nyaggle.validation.split import check_cv
22 | 
23 | ExperimentResult = namedtuple('ExperimentResult',
24 |                               [
25 |                                   'oof_prediction',
26 |                                   'test_prediction',
27 |                                   'metrics',
28 |                                   'models',
29 |                                   'importance',
30 |                                   'time',
31 |                                   'submission_df'
32 |                               ])
33 | 
34 | 
35 | class ExperimentProxy(object):
36 |     __slots__ = ["_obj", "__weakref__"]
37 | 
38 |     def __init__(self, obj):
39 |         object.__setattr__(self, "_obj", obj)
40 | 
41 |     def __getattribute__(self, name):
42 |         return getattr(object.__getattribute__(self, "_obj"), name)
43 | 
44 |     def __setattr__(self, name, value):
45 |         setattr(object.__getattribute__(self, "_obj"), name, value)
46 | 
47 |     def __enter__(self):
48 |         return self
49 | 
50 |     def __exit__(self, ex_type, ex_value, trace):
51 |         pass
52 | 
53 | 
54 | def run_experiment(model_params: Dict[str, Any],
55 |                    X_train: pd.DataFrame, y: pd.Series,
56 |                    X_test: Optional[pd.DataFrame] = None,
57 |                    logging_directory: str = 'output/{time}',
58 |                    if_exists: str = 'error',
59 |                    eval_func: Optional[Callable] = None,
60 |                    algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
61 |                    fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
62 |                    cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
63 |                    groups: Optional[pd.Series] = None,
64 |                    categorical_feature: Optional[List[str]] = None,
65 |                    sample_submission: Optional[pd.DataFrame] = None,
66 |                    submission_filename: Optional[str] = None,
67 |                    type_of_target: str = 'auto',
68 |                    feature_list: Optional[List[Union[int, str]]] = None,
69 |                    feature_directory: Optional[str] = None,
70 |                    inherit_experiment: Optional[Experiment] = None,
71 |                    with_auto_hpo: bool = False,
72 |                    with_auto_prep: bool = False,
73 |                    with_mlflow: bool = False
74 |                    ):
75 |     """
76 |     Evaluate metrics by cross-validation and store the results
77 |     (log, oof prediction, test prediction, feature importance plot and submission file)
78 |     under the directory specified.
79 | 
80 |     One of the following estimators is used (automatically dispatched by ``type_of_target(y)`` and ``algorithm_type``).
81 | 
82 |     * LGBMClassifier / LGBMRegressor
83 |     * CatBoostClassifier / CatBoostRegressor
84 |     * XGBClassifier / XGBRegressor
85 | 
86 | 
87 |     The output files are laid out as follows:
88 | 
89 |     .. code-block:: none
90 | 
91 |         <logging_directory>/
92 |             log.txt             <== Logging file
93 |             importance.png      <== Feature importance plot generated by nyaggle.util.plot_importance
94 |             oof_prediction.npy  <== Out of fold prediction in numpy array format
95 |             test_prediction.npy <== Test prediction in numpy array format
96 |             submission.csv      <== Submission csv file
97 |             metrics.json        <== Metrics
98 |             params.json         <== Parameters
99 |             models/
100 |                 fold1           <== The trained model in fold 1
101 |                 ...
102 | 
103 |     Args:
104 |         model_params:
105 |             Parameters passed to the constructor of the classifier/regressor object (i.e. LGBMRegressor).
106 |         X_train:
107 |             Training data. Categorical feature should be casted to pandas categorical type or encoded to integer.
108 |         y:
109 |             Target
110 |         X_test:
111 |             Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
112 |         logging_directory:
113 |             Path to directory where output of experiment is stored.
114 |         if_exists:
115 |             How to behave if the logging directory already exists.
116 | 
117 |             - error: Raise a ValueError.
118 |             - replace: Delete logging directory before logging.
119 |             - append: Append to existing experiment.
120 |             - rename: Rename current directory by adding "_1", "_2"... suffix
121 |         fit_params:
122 |             Parameters passed to the fit method of the estimator. If a dict is passed, the same parameters
123 |             (except ``eval_set``) are passed for each fold. If a callable is passed,
124 |             the return value of ``fit_params(fold_id, train_index, test_index)`` will be used for each fold.
125 |         eval_func:
126 |             Function used for logging and calculation of returning scores.
127 |             This parameter isn't passed to GBDT, so you should set objective and eval_metric separately if needed.
128 |             If ``eval_func`` is None, ``roc_auc_score`` or ``mean_squared_error`` is used by default.
129 |         algorithm_type:
130 |             Type of gradient boosting library used. "lgbm" (lightgbm), "cat" (catboost) or "xgb" (xgboost).
131 |         cv:
132 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
133 | 
134 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
135 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
136 |             - CV splitter (the instance of ``BaseCrossValidator``),
137 |             - An iterable yielding (train, test) splits as arrays of indices.
138 |         groups:
139 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
140 |         sample_submission:
141 |             A sample dataframe aligned with test data (Usually in Kaggle, it is available as sample_submission.csv).
142 |             The submission file will be created with the same schema as this dataframe.
143 |         submission_filename:
144 |             The name of the submission file created under the logging directory. If ``None``, the basename of the
145 |             logging directory will be used as a filename.
146 |         categorical_feature:
147 |             List of categorical column names. If ``None``, categorical columns are automatically determined by dtype.
148 |         type_of_target:
149 |             The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
150 |             Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
151 |         feature_list:
152 |             The list of feature ids saved through nyaggle.feature_store module.
153 |         feature_directory:
154 |             The location of features stored. Only used if feature_list is not empty.
155 |         inherit_experiment:
156 |             An experiment object which is used to log results. If not ``None``, all logs in this function are treated
157 |             as a part of this experiment.
158 |         with_auto_prep:
159 |             If True, the input datasets will be copied and automatic preprocessing will be performed on them.
160 |             For example, if ``algorithm_type = 'cat'``, all missing values in categorical features will be filled.
161 |         with_auto_hpo:
162 |             If True, model parameters will be automatically updated using optuna (only available in lightgbm).
163 |         with_mlflow:
164 |             If True, `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`_ is used.
165 |             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
166 |             Note that all output files are located in both ``logging_directory`` and
167 |             mlflow's directory (``mlruns`` by default).
168 |     Returns:
169 |         Namedtuple with the following members
170 | 
171 |         * oof_prediction:
172 |             numpy array, shape (len(X_train),) Predicted value on Out-of-Fold validation data.
173 |         * test_prediction:
174 |             numpy array, shape (len(X_test),) Predicted value on test data. ``None`` if X_test is ``None``
175 |         * metrics:
176 |             list of float, shape(nfolds+1) ``metrics[i]`` denotes validation score in i-th fold.
177 |             ``metrics[-1]`` is overall score.
178 |         * models:
179 |             list of objects, shape(nfolds) Trained models for each fold.
180 |         * importance:
181 |             list of pd.DataFrame, feature importance for each fold (type="gain").
182 |         * time:
183 |             Training time in seconds.
184 |         * submission_df:
185 |             The dataframe saved as submission.csv
186 |     """
187 |     start_time = time.time()
188 |     cv = check_cv(cv, y)
189 | 
190 |     if feature_list:
191 |         X = pd.concat([X_train, X_test]) if X_test is not None else X_train
192 |         X.reset_index(drop=True, inplace=True)
193 |         X = load_features(X, feature_list, directory=feature_directory)
194 |         ntrain = len(X_train)
195 |         X_train, X_test = X.iloc[:ntrain, :], X.iloc[ntrain:, :].reset_index(drop=True)
196 | 
197 |     _check_input(X_train, y, X_test)
198 | 
199 |     if categorical_feature is None:
200 |         categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
201 | 
202 |     if type_of_target == 'auto':
203 |         type_of_target = multiclass.type_of_target(y)
204 |     model_type, eval_func, cat_param_name = _dispatch_models(algorithm_type, type_of_target, eval_func)
205 | 
206 |     if with_auto_prep:
207 |         assert algorithm_type in ('cat', 'xgb', 'lgbm'), "with_auto_prep is only supported for gbdt"
208 |         X_train, X_test = autoprep_gbdt(algorithm_type, X_train, X_test, categorical_feature)
209 | 
210 |     logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))
211 | 
212 |     if inherit_experiment is not None:
213 |         experiment = ExperimentProxy(inherit_experiment)
214 |     else:
215 |         experiment = Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow)
216 | 
217 |     with experiment as exp:
218 |         exp.log('Algorithm: {}'.format(algorithm_type))
219 |         exp.log('Experiment: {}'.format(exp.logging_directory))
220 |         exp.log('Params: {}'.format(model_params))
221 |         exp.log('Features: {}'.format(list(X_train.columns)))
222 |         exp.log_param('algorithm_type', algorithm_type)
223 |         exp.log_param('num_features', X_train.shape[1])
224 |         if callable(fit_params):
225 |             exp.log_param('fit_params', str(fit_params))
226 |         else:
227 |             exp.log_dict('fit_params', fit_params)
228 |         exp.log_dict('model_params', model_params)
229 |         if feature_list is not None:
230 |             exp.log_param('features', feature_list)
231 | 
232 |         if with_auto_hpo:
233 |             assert algorithm_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
234 |             model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
235 |                                                     type_of_target=type_of_target)
236 |             exp.log_param('model_params_tuned', model_params)
237 | 
238 |         exp.log('Categorical: {}'.format(categorical_feature))
239 | 
240 |         models = [model_type(**model_params) for _ in range(cv.get_n_splits())]
241 | 
242 |         if fit_params is None:
243 |             fit_params = {}
244 |         if cat_param_name is not None and not callable(fit_params) and cat_param_name not in fit_params:
245 |             fit_params[cat_param_name] = categorical_feature
246 | 
247 |         if isinstance(fit_params, Dict):
248 |             exp.log_params(fit_params)
249 | 
250 |         result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
251 |                                 logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params,
252 |                                 type_of_target=type_of_target)
253 | 
254 |         # save oof
255 |         exp.log_numpy('oof_prediction', result.oof_prediction)
256 |         exp.log_numpy('test_prediction', result.test_prediction)
257 | 
258 |         for i in range(cv.get_n_splits()):
259 |             exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
260 |         exp.log_metric('Overall', result.scores[-1])
261 | 
262 |         # save importance plot
263 |         if result.importance:
264 |             importance = pd.concat(result.importance)
265 |             plot_file_path = os.path.join(exp.logging_directory, 'importance.png')
266 |             plot_importance(importance, plot_file_path)
267 |             exp.log_artifact(plot_file_path)
268 | 
269 |         # save trained model
270 |         for i, model in enumerate(models):
271 |             _save_model(model, exp.logging_directory, i + 1, exp)
272 | 
273 |         # save submission.csv
274 |         submit_df = None
275 |         if X_test is not None:
276 |             submit_df = make_submission_df(result.test_prediction, sample_submission, y)
277 |             exp.log_dataframe(submission_filename or os.path.basename(exp.logging_directory), submit_df, 'csv')
278 | 
279 |         elapsed_time = time.time() - start_time
280 | 
281 |         return ExperimentResult(result.oof_prediction, result.test_prediction,
282 |                                 result.scores, models, result.importance, elapsed_time, submit_df)
283 | 
284 | 
285 | def _dispatch_eval_func(target_type: str, custom_eval: Optional[Callable] = None):
286 |     default_eval_func = {
287 |         'binary': roc_auc_score,
288 |         'multiclass': log_loss,
289 |         'continuous': mean_squared_error
290 |     }
291 |     return custom_eval if custom_eval is not None else default_eval_func[target_type]
292 | 
293 | 
294 | def _dispatch_gbdt_class(algorithm_type: str, type_of_target: str):
295 |     is_regression = type_of_target == 'continuous'
296 | 
297 |     if algorithm_type == 'lgbm':
298 |         requires_lightgbm()
299 |         from lightgbm import LGBMClassifier, LGBMRegressor
300 |         return LGBMRegressor if is_regression else LGBMClassifier
301 |     elif algorithm_type == 'cat':
302 |         requires_catboost()
303 |         from catboost import CatBoostClassifier, CatBoostRegressor
304 |         return CatBoostRegressor if is_regression else CatBoostClassifier
305 |     else:
306 |         requires_xgboost()
307 |         assert algorithm_type == 'xgb'
308 |         from xgboost import XGBClassifier, XGBRegressor
309 |         return XGBRegressor if is_regression else XGBClassifier
310 | 
311 | 
312 | def _dispatch_models(algorithm_type: Union[str, Type[BaseEstimator]],
313 |                      target_type: str, custom_eval: Optional[Callable] = None):
314 |     if not isinstance(algorithm_type, str):
315 |         assert issubclass(algorithm_type, BaseEstimator), "algorithm_type should be str or subclass of BaseEstimator"
316 |         return algorithm_type, _dispatch_eval_func(target_type, custom_eval), None
317 | 
318 |     cat_features = {
319 |         'lgbm': 'categorical_feature',
320 |         'cat': 'cat_features',
321 |         'xgb': None
322 |     }
323 | 
324 |     gbdt_class = _dispatch_gbdt_class(algorithm_type, target_type)
325 |     eval_func = _dispatch_eval_func(target_type, custom_eval)
326 | 
327 |     return gbdt_class, eval_func, cat_features[algorithm_type]
328 | 
329 | 
330 | def _save_model(model: BaseEstimator, logging_directory: str, fold: int, exp: Experiment):
331 |     model_dir = os.path.join(logging_directory, 'models')
332 |     os.makedirs(model_dir, exist_ok=True)
333 |     path = os.path.join(model_dir, 'fold{}'.format(fold))
334 | 
335 |     if is_gbdt_instance(model, 'lgbm'):
336 |         model.booster_.save_model(path)
337 |     elif is_gbdt_instance(model, ('xgb', 'cat')):
338 |         model.save_model(path)
339 |     else:
340 |         with open(path, "wb") as f:
341 |             pickle.dump(model, f)
342 | 
343 |     exp.log_artifact(path)
344 | 
345 | 
346 | def _check_input(X_train: pd.DataFrame, y: pd.Series,
347 |                  X_test: Optional[pd.DataFrame] = None):
348 |     assert len(X_train) == len(y), "lengths of X_train and y are different. len(X_train) = {}, len(y) = {}".format(
349 |         len(X_train), len(y)
350 |     )
351 | 
352 |     if X_test is not None:
353 |         assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"
354 | 
--------------------------------------------------------------------------------
/nyaggle/feature/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyanp/nyaggle/636532292d7ce3468cd47a3337bc50d620f0d23b/nyaggle/feature/__init__.py
--------------------------------------------------------------------------------
/nyaggle/feature/base.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin
2 | 
3 | 
4 | class BaseFeaturizer(BaseEstimator, TransformerMixin):
5 |     pass
6 | 
--------------------------------------------------------------------------------
/nyaggle/feature/category_encoder/__init__.py:
--------------------------------------------------------------------------------
1 | from nyaggle.feature.category_encoder.target_encoder import KFoldEncoderWrapper, TargetEncoder
2 | 
--------------------------------------------------------------------------------
/nyaggle/feature/category_encoder/target_encoder.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Iterable, Union
2 | 
3 | import category_encoders as ce
4 | import numpy as np
5 | import pandas as pd
6 | from category_encoders.utils import convert_input, convert_input_vector
7 | from sklearn.base import BaseEstimator, clone
8 | from sklearn.model_selection import BaseCrossValidator
9 | 
10 | from nyaggle.feature.base import BaseFeaturizer
11 | from nyaggle.validation.split import check_cv
12 | 
13 | 
14 | class KFoldEncoderWrapper(BaseFeaturizer):
15 |     """KFold wrapper for sklearn-like interface
16 | 
17 |     This class wraps sklearn's TransformerMixin (an object that has fit/transform/fit_transform methods)
18 |     and calls it in a K-fold manner.
19 | 
20 |     Args:
21 |         base_transformer:
22 |             Transformer object to be wrapped.
23 |         cv:
24 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
25 | 
26 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
27 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
28 |             - CV splitter (the instance of ``BaseCrossValidator``),
29 |             - An iterable yielding (train, test) splits as arrays of indices.
30 |         groups:
31 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
32 |         return_same_type:
33 |             If True, `transform` and `fit_transform` return the same type as X.
34 |             If False, these APIs always return a numpy array, similar to sklearn's API.
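
    Example:
        A minimal sketch wrapping ``category_encoders.TargetEncoder`` on a toy frame
        (the column name and the 2-fold cv are illustrative choices, not defaults):

        >>> import category_encoders as ce
        >>> import pandas as pd
        >>> X = pd.DataFrame({'city': ['a', 'b', 'a', 'b']})
        >>> y = pd.Series([1, 0, 1, 0])
        >>> enc = KFoldEncoderWrapper(ce.TargetEncoder(cols=['city']), cv=2)
        >>> X_encoded = enc.fit_transform(X, y)  # out-of-fold encoded values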
35 |     """
36 | 
37 |     def __init__(self, base_transformer: BaseEstimator,
38 |                  cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, return_same_type: bool = True,
39 |                  groups: Optional[pd.Series] = None):
40 |         self.cv = cv
41 |         self.base_transformer = base_transformer
42 | 
43 |         self.n_splits = None
44 |         self.transformers = None
45 |         self.return_same_type = return_same_type
46 |         self.groups = groups
47 | 
48 |     def _pre_train(self, y):
49 |         self.cv = check_cv(self.cv, y)
50 |         self.n_splits = self.cv.get_n_splits()
51 |         self.transformers = [clone(self.base_transformer) for _ in range(self.n_splits + 1)]
52 | 
53 |     def _fit_train(self, X: pd.DataFrame, y: Optional[pd.Series], **fit_params) -> pd.DataFrame:
54 |         if y is None:
55 |             X_ = self.transformers[-1].transform(X)
56 |             return self._post_transform(X_)
57 | 
58 |         X_ = X.copy()
59 | 
60 |         for i, (train_index, test_index) in enumerate(self.cv.split(X_, y, self.groups)):
61 |             self.transformers[i].fit(X.iloc[train_index], y.iloc[train_index], **fit_params)
62 |             X_.iloc[test_index, :] = self.transformers[i].transform(X.iloc[test_index])
63 |         self.transformers[-1].fit(X, y, **fit_params)
64 | 
65 |         return X_
66 | 
67 |     def _post_fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
68 |         return X
69 | 
70 |     def _post_transform(self, X: pd.DataFrame) -> pd.DataFrame:
71 |         return X
72 | 
73 |     def fit(self, X: pd.DataFrame, y: pd.Series):
74 |         """
75 |         Fit models for each fold.
76 | 
77 |         Args:
78 |             X:
79 |                 Data
80 |             y:
81 |                 Target
82 |         Returns:
83 |             returns the transformer object.
84 |         """
85 |         self._post_fit(self.fit_transform(X, y), y)
86 |         return self
87 | 
88 |     def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]:
89 |         """
90 |         Transform X
91 | 
92 |         Args:
93 |             X: Data
94 | 
95 |         Returns:
96 |             Transformed version of X. It will be pd.DataFrame if X is `pd.DataFrame` and return_same_type is True.
97 |         """
98 |         is_pandas = isinstance(X, pd.DataFrame)
99 |         X_ = self._fit_train(X, None)
100 |         X_ = self._post_transform(X_)
101 |         return X_ if self.return_same_type and is_pandas else X_.values
102 | 
103 |     def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) \
104 |             -> Union[pd.DataFrame, np.ndarray]:
105 |         """
106 |         Fit models for each fold, then transform X
107 | 
108 |         Args:
109 |             X:
110 |                 Data
111 |             y:
112 |                 Target
113 |             fit_params:
114 |                 Additional parameters passed to models
115 | 
116 |         Returns:
117 |             Transformed version of X. It will be pd.DataFrame if X is `pd.DataFrame` and return_same_type is True.
118 |         """
119 |         assert len(X) == len(y)
120 |         self._pre_train(y)
121 | 
122 |         is_pandas = isinstance(X, pd.DataFrame)
123 |         X = convert_input(X)
124 |         y = convert_input_vector(y, X.index)
125 | 
126 |         if y.isnull().sum() > 0:
127 |             # y == null is regarded as test data
128 |             X_ = X.copy()
129 |             X_.loc[~y.isnull(), :] = self._fit_train(X[~y.isnull()], y[~y.isnull()], **fit_params)
130 |             X_.loc[y.isnull(), :] = self._fit_train(X[y.isnull()], None, **fit_params)
131 |         else:
132 |             X_ = self._fit_train(X, y, **fit_params)
133 | 
134 |         X_ = self._post_transform(self._post_fit(X_, y))
135 | 
136 |         return X_ if self.return_same_type and is_pandas else X_.values
137 | 
138 | 
139 | class TargetEncoder(KFoldEncoderWrapper):
140 |     """Target Encoder
141 | 
142 |     KFold version of category_encoders.TargetEncoder in
143 |     https://contrib.scikit-learn.org/categorical-encoding/targetencoder.html.
144 | 
145 |     Args:
146 |         cv:
147 |             int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
148 | 
149 |             - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
150 |             - integer, to specify the number of folds in a ``(Stratified)KFold``,
151 |             - CV splitter (the instance of ``BaseCrossValidator``),
152 |             - An iterable yielding (train, test) splits as arrays of indices.
153 |         groups:
154 |             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
155 |         cols:
156 |             A list of columns to encode, if None, all string columns will be encoded.
157 |         drop_invariant:
158 |             Boolean for whether or not to drop columns with 0 variance.
159 |         handle_missing:
160 |             Options are ‘error’, ‘return_nan’ and ‘value’, defaults to ‘value’, which returns the target mean.
161 |         handle_unknown:
162 |             Options are ‘error’, ‘return_nan’ and ‘value’, defaults to ‘value’, which returns the target mean.
163 |         min_samples_leaf:
164 |             Minimum samples to take category average into account.
165 |         smoothing:
166 |             Smoothing effect to balance categorical average vs prior. Higher value means stronger regularization.
167 |             The value must be strictly bigger than 0.
168 |         return_same_type:
169 |             If True, ``transform`` and ``fit_transform`` return the same type as X.
170 |             If False, these APIs always return a numpy array, similar to sklearn's API.
171 |     """
172 | 
173 |     def __init__(self, cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
174 |                  groups: Optional[pd.Series] = None,
175 |                  cols: List[str] = None,
176 |                  drop_invariant: bool = False, handle_missing: str = 'value', handle_unknown: str = 'value',
177 |                  min_samples_leaf: int = 20, smoothing: float = 10.0, return_same_type: bool = True):
178 |         e = ce.TargetEncoder(cols=cols, drop_invariant=drop_invariant, return_df=True,
179 |                              handle_missing=handle_missing,
180 |                              handle_unknown=handle_unknown,
181 |                              min_samples_leaf=min_samples_leaf, smoothing=smoothing)
182 | 
183 |         super().__init__(e, cv, return_same_type, groups)
184 | 
185 |     def _post_transform(self, X: pd.DataFrame) -> pd.DataFrame:
186 |         cols = self.transformers[0].cols
187 |         for c in cols:
188 |             X[c] = X[c].astype(float)
189 |         return X
190 | 
--------------------------------------------------------------------------------
/nyaggle/feature/groupby.py:
--------------------------------------------------------------------------------
1 | # Modified work:
2 | # -----------------------------------------------------------------------------
3 | # Copyright (c) 2020 Kota Yuhara (@wakamezake)
4 | # -----------------------------------------------------------------------------
5 | 
6 | # Original work of aggregation:
7 | # https://github.com/pfnet-research/xfeat/blob/master/xfeat/helper.py
8 | # -----------------------------------------------------------------------------
9 | # MIT License
10 | #
11 | # Copyright (c) 2020 Preferred Networks, Inc.
12 | #
13 | # Permission is hereby granted, free of charge, to any person obtaining a copy
14 | # of this software and associated documentation files (the "Software"), to deal
15 | # in the Software without restriction, including without limitation the rights
16 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 | # copies of the Software, and to permit persons to whom the Software is
18 | # furnished to do so, subject to the following conditions:
19 | #
20 | # The above copyright notice and this permission notice shall be included in all
21 | # copies or substantial portions of the Software.
22 | #
23 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 | # SOFTWARE.
30 | # -----------------------------------------------------------------------------
31 | 
32 | from inspect import isroutine
33 | from types import FunctionType, LambdaType
34 | from typing import Callable, List, Tuple, Union
35 | 
36 | import pandas as pd
37 | from pandas.core.common import get_callable_name
38 | 
39 | 
40 | def _is_lambda_function(obj):
41 |     """
42 |     Example:
43 |         >>> import numpy as np
44 |         >>> def custom_function(x): return np.sum(x)
45 |         >>> _is_lambda_function(lambda x: np.sum(x))
46 |         True
47 |         >>> _is_lambda_function(np.sum)
48 |         False
49 |         >>> _is_lambda_function(custom_function)
50 |         False
51 |     """
52 |     # It's worth noting that types.LambdaType is an alias for types.FunctionType
53 |     return isinstance(obj, LambdaType) and obj.__name__ == "<lambda>"
54 | 
55 | 
56 | def aggregation(
57 |         input_df: pd.DataFrame,
58 |         group_key: str,
59 |         group_values: List[str],
60 |         agg_methods: List[Union[str, FunctionType]],
61 | ) -> Tuple[pd.DataFrame, List[str]]:
62 |     """
63 |     Aggregate values after grouping table rows by a given key.
64 | 
65 |     Args:
66 |         input_df:
67 |             Input data frame.
68 |         group_key:
69 |             Used to determine the groups for the groupby.
70 |         group_values:
71 |             Used to aggregate values for the groupby.
72 |         agg_methods:
73 |             List of functions or function names, e.g. ['mean', 'max', 'min', numpy.mean].
74 |             Do not use a lambda function, because its name attribute (always ``"<lambda>"``)
75 |             cannot generate a unique string for the new column names.
76 |     Returns:
77 |         Tuple of output dataframe and new column names.
78 |     """
79 |     new_df = input_df.copy()
80 | 
81 |     new_cols = []
82 |     for agg_method in agg_methods:
83 |         if _is_lambda_function(agg_method):
84 |             raise ValueError('Not supported lambda function.')
85 |         elif isinstance(agg_method, str):
86 |             pass
87 |         elif isinstance(agg_method, FunctionType):
88 |             pass
89 |         elif isroutine(agg_method):
90 |             pass
91 |         else:
92 |             raise ValueError('Supported types are: {} or {}.'
93 |                              ' Got {} instead.'.format(str, Callable, type(agg_method)))
94 | 
95 |     for agg_method in agg_methods:
96 |         for col in group_values:
97 |             # only str or FunctionType
98 |             if isinstance(agg_method, str):
99 |                 agg_method_name = agg_method
100 |             else:
101 |                 agg_method_name = get_callable_name(agg_method)
102 |             new_col = "agg_{}_{}_by_{}".format(agg_method_name, col, group_key)
103 | 
104 |             df_agg = (
105 |                 input_df[[col] + [group_key]].groupby(group_key)[[col]].agg(
106 |                     agg_method)
107 |             )
108 |             df_agg.columns = [new_col]
109 |             new_cols.append(new_col)
110 |             new_df = new_df.merge(
111 |                 df_agg, how="left", right_index=True, left_on=group_key
112 |             )
113 | 
114 |     return new_df, new_cols
115 | 
--------------------------------------------------------------------------------
/nyaggle/feature/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from nyaggle.feature.nlp.bert import BertSentenceVectorizer
2 | 
--------------------------------------------------------------------------------
/nyaggle/feature/nlp/bert.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, List, Optional, Union
2 | import transformers
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | from category_encoders.utils import convert_input
7 | from sklearn.decomposition import TruncatedSVD
8 | from tqdm import tqdm
9 | 
10 | from nyaggle.environment import requires_torch
11 | from nyaggle.feature.base import BaseFeaturizer
12 | 
13 | 
14 | class BertSentenceVectorizer(BaseFeaturizer):
15 |     """Sentence Vectorizer using BERT pretrained model.
16 | 
17 |     Extracts a fixed-length feature vector from each English/Japanese variable-length sentence using BERT.
18 | 
19 |     Args:
20 |         lang:
21 |             Type of language. If set to "jp", Japanese BERT model is used (you need to install MeCab).
22 |         n_components:
23 |             Number of components in SVD. If `None`, SVD is not applied.
24 |         text_columns:
25 |             List of processing columns. If `None`, all object columns are regarded as text columns.
26 |         pooling_strategy:
27 |             The pooling algorithm for generating a fixed-length encoding vector. 'reduce_mean' and 'reduce_max' use
28 |             average pooling and max pooling respectively to reduce the vector from (num-words, emb-dim) to (emb_dim).
29 |             'reduce_mean_max' performs 'reduce_mean' and 'reduce_max' separately and concatenates them.
30 |             'cls_token' takes the first element (i.e. [CLS]).
31 |         use_cuda:
32 |             If `True`, inference is performed on GPU.
33 |         tokenizer:
34 |             The custom tokenizer used instead of default tokenizer
35 |         model:
36 |             The custom pretrained model used instead of default BERT model
37 |         return_same_type:
38 |             If True, `transform` and `fit_transform` return the same type as X.
39 |             If False, these APIs always return a numpy array, similar to sklearn's API.
40 |         column_format:
41 |             Name of transformed columns (used if returning type is pd.DataFrame)
42 |     """
43 | 
44 |     def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
45 |                  text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
46 |                  use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
47 |                  model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
48 |         if tokenizer is not None:
49 |             assert model is not None
50 |             self.tokenizer = tokenizer
51 |             self.model = model
52 |         elif lang == 'en':  # skipped when a custom tokenizer/model is given
53 |             pretrained_model_name = 'bert-base-uncased'
54 |             self.tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_model_name)
55 |             self.model = transformers.BertModel.from_pretrained(pretrained_model_name)
56 |         elif lang == 'jp':
57 |             pretrained_model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
58 |             self.tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(pretrained_model_name)
59 |             self.model = transformers.BertModel.from_pretrained(pretrained_model_name)
60 |         else:
61 |             raise ValueError('Specified language type ({}) is invalid.'.format(lang))
62 | 
63 |         self.lang = lang
64 |         self.n_components = n_components
65 |         self.text_columns = text_columns
66 |         self.pooling_strategy = pooling_strategy
67 |         self.use_cuda = use_cuda
68 |         self.return_same_type = return_same_type
69 |         self.svd = {}
70 |         self.column_format = column_format
71 | 
72 |     def _process_text(self, text: str) -> np.ndarray:
73 |         requires_torch()
74 |         import torch
75 | 
76 |         tokens_tensor = torch.tensor([self.tokenizer.encode(text, add_special_tokens=True)])
77 |         if self.use_cuda:
78 |             tokens_tensor = tokens_tensor.to('cuda')
79 |             self.model.to('cuda')
80 | 
81 |         self.model.eval()
82 |         with torch.no_grad():
83 |             outputs = self.model(tokens_tensor)
84 | 
85 |         embedding = outputs.last_hidden_state.cpu().numpy()[0]
86 |         if self.pooling_strategy == 'reduce_mean':
87 |             return np.mean(embedding, axis=0)
88 |         elif self.pooling_strategy == 'reduce_max':
89 |             return np.max(embedding, axis=0)
90 |         elif self.pooling_strategy == 'reduce_mean_max':
91 |             return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
92 |         elif self.pooling_strategy == 'cls_token':
93 |             return embedding[0]
94 |         else:
95 |             raise ValueError("specify valid pooling_strategy: {reduce_mean, reduce_max, reduce_mean_max, cls_token}")
96 | 
97 |     def _fit_one(self, col: str, emb: np.ndarray):
98 |         if not self.n_components or self.n_components >= emb.shape[1]:
99 |             return emb
100 |         self.svd[col] = TruncatedSVD(n_components=self.n_components, algorithm='arpack', random_state=0)
101 |         return self.svd[col].fit(emb)
102 | 
103 |     def _transform_one(self, col: str, emb: np.ndarray):
104 |         if not self.n_components or self.n_components >= emb.shape[1]:
105 |             return emb
106 |         return self.svd[col].transform(emb)
107 | 
108 |     def _fit_transform_one(self, col: str, emb: np.ndarray):
109 |         if not self.n_components or self.n_components >= emb.shape[1]:
110 |             return emb
111 |         self.svd[col] = TruncatedSVD(n_components=self.n_components, algorithm='arpack', random_state=0)
112 |         return self.svd[col].fit_transform(emb)
113 | 
114 |     def _process(self, X: pd.DataFrame, func: Callable[[str, np.ndarray], Any]):
115 |         is_pandas = isinstance(X, pd.DataFrame)
116 |         X = convert_input(X)
117 | 
118 |         tqdm.pandas()
119 |         columns = self.text_columns or [c for c in X.columns if X[c].dtype == object]
120 |         non_text_columns = [c for c in X.columns if c not in columns]
121 | 
122 |         column_names = []
123 |         processed = []
| for c in columns: 125 | emb = np.vstack(X[c].progress_apply(lambda x: self._process_text(x))) 126 | emb = func(c, emb) 127 | processed.append(emb) 128 | column_names += [self.column_format.format(col=c, idx=i) for i in range(emb.shape[1])] 129 | 130 | processed_df = pd.DataFrame(np.hstack(processed), columns=column_names) 131 | 132 | if non_text_columns: 133 | X_ = X[non_text_columns].copy() 134 | X_ = pd.concat([X_, processed_df], axis=1) 135 | else: 136 | X_ = processed_df 137 | 138 | return X_ if self.return_same_type and is_pandas else X_.values 139 | 140 | def fit(self, X: Union[pd.DataFrame, np.ndarray], y=None): 141 | """ 142 | Fit SVD model on training data X. 143 | 144 | Args: 145 | X: 146 | Data 147 | y: 148 | Ignored 149 | """ 150 | self._process(X, self._fit_one) 151 | return self 152 | 153 | def transform(self, X: Union[pd.DataFrame, np.ndarray], y=None): 154 | """ 155 | Perform feature extraction and dimensionality reduction using 156 | BERT pre-trained model and trained SVD model. 157 | 158 | Args: 159 | X: 160 | Data 161 | y: 162 | Ignored 163 | """ 164 | return self._process(X, self._transform_one) 165 | 166 | def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y=None, **fit_params): 167 | """ 168 | Fit SVD model on training data X and perform feature extraction and dimensionality reduction using 169 | BERT pre-trained model and trained SVD model. 170 | 171 | Args: 172 | X: 173 | Data 174 | y: 175 | Ignored 176 | """ 177 | return self._process(X, self._fit_transform_one) 178 | -------------------------------------------------------------------------------- /nyaggle/feature_store/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.feature_store.feature_store import cached_feature, save_feature, load_feature, load_features 2 | -------------------------------------------------------------------------------- /nyaggle/feature_store/feature_store.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import warnings 4 | from typing import List, Optional, Union 5 | 6 | import pandas as pd 7 | import pyarrow 8 | from tqdm import tqdm 9 | 10 | 11 | def validate_train_test_difference(train: pd.Series, test: pd.Series): 12 | # % of nulls 13 | if test.isnull().mean() == 1.0: 14 | raise RuntimeError('Error in feature {}: all values in test data are null'.format(train.name)) 15 | 16 | 17 | def validate_feature(df: pd.DataFrame, y: pd.Series): 18 | if len(y) < len(df): 19 | # assuming that the first part of the dataframe is the train part 20 | train = df.iloc[:len(y), :] 21 | test = df.iloc[len(y):, :] 22 | else: 23 | train = df[~y.isnull()] 24 | test = df[y.isnull()] 25 | 26 | for c in df.columns: 27 | validate_train_test_difference(train[c], test[c]) 28 | 29 | 30 | def save_feature(df: pd.DataFrame, feature_name: Union[int, str], directory: str = './features/', 31 | with_csv_dump: bool = False, create_directory: bool = True, 32 | reference_target_variable: Optional[pd.Series] = None, overwrite: bool = False): 33 | """ 34 | Save a pandas DataFrame in feather format 35 | 36 | Args: 37 | df: 38 | The dataframe to be saved. 39 | feature_name: 40 | The name of the feature. The output file will be ``{feature_name}.f``. 41 | directory: 42 | The directory where the feature will be stored. 43 | with_csv_dump: 44 | If True, the first 1000 rows are also dumped to a csv file for debugging. 45 | create_directory: 46 | If True, create the directory if it does not exist. 
47 | reference_target_variable: 48 | If not None, the feature is validated against this target variable before it is saved. 49 | overwrite: 50 | If False and the file already exists, a RuntimeError will be raised. 51 | """ 52 | if create_directory: 53 | os.makedirs(directory, exist_ok=True) 54 | 55 | if reference_target_variable is not None: 56 | validate_feature(df, reference_target_variable) 57 | 58 | path = os.path.join(directory, str(feature_name) + '.f') 59 | 60 | if not overwrite and os.path.exists(path): 61 | raise RuntimeError('File already exists') 62 | 63 | df.to_feather(path) 64 | 65 | if with_csv_dump: 66 | df.head(1000).to_csv(os.path.join(directory, str(feature_name) + '.csv'), index=False) 67 | 68 | 69 | def load_feature(feature_name: Union[int, str], directory: str = './features/', 70 | ignore_columns: List[str] = None) -> pd.DataFrame: 71 | """ 72 | Load feature as pandas DataFrame. 73 | 74 | Args: 75 | feature_name: 76 | The name of the feature (used in ``save_feature``). 77 | directory: 78 | The directory where the feature is stored. 79 | ignore_columns: 80 | The list of columns that will be dropped from the loaded dataframe. 81 | Returns: 82 | The feature dataframe 83 | """ 84 | path = os.path.join(directory, str(feature_name) + '.f') 85 | 86 | df = pd.read_feather(path) 87 | if ignore_columns: 88 | return df.drop([c for c in ignore_columns if c in df.columns], axis=1) 89 | else: 90 | return df 91 | 92 | 93 | def load_features(base_df: Optional[pd.DataFrame], 94 | feature_names: List[Union[int, str]], directory: str = './features/', 95 | ignore_columns: List[str] = None, create_directory: bool = True, 96 | rename_duplicate: bool = True) -> pd.DataFrame: 97 | """ 98 | Load features and return the concatenated dataframe 99 | 100 | Args: 101 | base_df: 102 | The base dataframe. If not None, the resulting dataframe will consist of the base and loaded feature columns. 103 | feature_names: 104 | The list of feature names to be loaded. 105 | directory: 106 | The directory where the features are stored. 107 | ignore_columns: 108 | The list of columns that will be dropped from the loaded dataframes. 109 | create_directory: 110 | If True, create the directory if it does not exist. 111 | rename_duplicate: 112 | If True, a duplicated column name will be renamed automatically (the feature name is used as a suffix). 113 | If False, duplicated columns are kept as-is. 114 | Returns: 115 | The merged dataframe 116 | """ 117 | if create_directory: 118 | os.makedirs(directory, exist_ok=True) 119 | 120 | dfs = [load_feature(f, directory=directory, ignore_columns=ignore_columns) for f in tqdm(feature_names)] 121 | 122 | if base_df is None: 123 | base_df = dfs[0] 124 | dfs = dfs[1:] 125 | feature_names = feature_names[1:] 126 | 127 | columns = list(base_df.columns) 128 | 129 | for df, feature_name in zip(dfs, feature_names): 130 | if len(df) != len(base_df): 131 | raise RuntimeError('DataFrame lengths are different. 
feature={}'.format(feature_name)) 132 | 133 | for c in df.columns: 134 | if c in columns: 135 | warnings.warn('A feature name {} is duplicated.'.format(c)) 136 | 137 | if rename_duplicate: 138 | while c in columns: 139 | c += '_' + str(feature_name) 140 | warnings.warn('The duplicated name in feature={} will be renamed to {}'.format(feature_name, c)) 141 | columns.append(c) 142 | 143 | concatenated = pd.concat([base_df] + dfs, axis=1) 144 | concatenated.columns = columns 145 | return concatenated 146 | 147 | 148 | def cached_feature(feature_name: Union[int, str], directory: str = './features/', ignore_columns: List[str] = None): 149 | """ 150 | Decorator to wrap a function which returns pd.DataFrame with a memorizing callable that saves dataframe using 151 | ``feature_store.save_feature``. 152 | 153 | Args: 154 | feature_name: 155 | The name of the feature (used in ``save_feature``). 156 | directory: 157 | The directory where the feature is stored. 158 | ignore_columns: 159 | The list of columns that will be dropped from the loaded dataframe. 160 | 161 | Example: 162 | >>> from nyaggle.feature_store import cached_feature 163 | >>> 164 | >>> @cached_feature('x') 165 | >>> def make_feature_x(param) -> pd.DataFrame: 166 | >>> print('called') 167 | >>> ... 168 | >>> return df 169 | >>> 170 | >>> x = make_feature_x(...) # if x.f does not exist, call the function and save result to x.f 171 | "called" 172 | >>> x = make_feature_x(...) # load from file in the second time 173 | """ 174 | 175 | def _decorator(fun): 176 | @functools.wraps(fun) 177 | def _decorated_fun(*args, **kwargs): 178 | try: 179 | return load_feature(feature_name, directory, ignore_columns) 180 | except (pyarrow.ArrowIOError, IOError): 181 | df = fun(*args, **kwargs) 182 | assert isinstance(df, pd.DataFrame), "returning value of @cached_feature should be pd.DataFrame" 183 | save_feature(df, feature_name, directory) 184 | return df 185 | 186 | return _decorated_fun 187 | 188 | return _decorator 189 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.hyper_parameters.parameters import get_hyperparam_byname, list_hyperparams 2 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/catboost.py: -------------------------------------------------------------------------------- 1 | parameters = [ 2 | { 3 | "name": "ieee-2019-17th", 4 | "url": "https://nbviewer.jupyter.org/github/tmheo/IEEE-Fraud-Detection-17th-Place-Solution/blob/master/notebook/IEEE-17th-Place-Solution-CatBoost-Ensemble.ipynb", 5 | "competition": "ieee-fraud-detection", 6 | "rank": 17, 7 | "metric": "auc", 8 | "parameters": { 9 | 'learning_rate': 0.07, 10 | 'eval_metric': 'AUC', 11 | 'loss_function': 'Logloss', 12 | 'metric_period': 500, 13 | 'od_wait': 500, 14 | 'depth': 8, 15 | } 16 | }, 17 | { 18 | "name": "elo-2018-11th", 19 | "url": "https://github.com/kangzhang0709/2019-kaggle-elo-top-11-solution", 20 | "competition": "elo-merchant-category-recommendation", 21 | "rank": 11, 22 | "metric": "rmse", 23 | "parameters": { 24 | 'learning_rate': 0.01, 25 | 'max_depth': 8, 26 | 'bagging_temperature': 0.8, 27 | 'l2_leaf_reg': 45, 28 | 'od_type': 'Iter' 29 | } 30 | }, 31 | { 32 | "name": "plasticc-2018-3rd", 33 | "url": "https://github.com/takashioya/plasticc/blob/master/scripts/train.py", 34 | "competition": "PLAsTiCC-2018", 35 | "rank": 3, 36 | 
"metric": "multi-class log-loss", 37 | "parameters": { 38 | 'learning_rate': 0.1, 39 | 'depth': 3, 40 | 'loss_function': 'MultiClass', 41 | 'colsample_bylevel': 0.7, 42 | } 43 | }, 44 | ] 45 | 46 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/parameters.py: -------------------------------------------------------------------------------- 1 | from more_itertools import first_true 2 | from typing import Dict, List, Union 3 | 4 | from nyaggle.hyper_parameters.catboost import parameters as params_cat 5 | from nyaggle.hyper_parameters.lightgbm import parameters as params_lgb 6 | from nyaggle.hyper_parameters.xgboost import parameters as params_xgb 7 | 8 | 9 | def _get_hyperparam_byname(param_table: List[Dict], name: str, with_metadata: bool): 10 | found = first_true(param_table, pred=lambda x: x['name'] == name) 11 | if found is None: 12 | raise RuntimeError('Hyperparameter {} not found.'.format(name)) 13 | 14 | if with_metadata: 15 | return found 16 | else: 17 | return found['parameters'] 18 | 19 | 20 | def _return(parameter: Union[List[Dict], Dict], with_metadata: bool) -> Union[List[Dict], Dict]: 21 | if with_metadata: 22 | return parameter 23 | 24 | if isinstance(parameter, list): 25 | return [p['parameters'] for p in parameter] 26 | else: 27 | return parameter['parameters'] 28 | 29 | 30 | def _get_table(gbdt_type: str = 'lgbm'): 31 | if gbdt_type == 'lgbm': 32 | return params_lgb 33 | elif gbdt_type == 'cat': 34 | return params_cat 35 | elif gbdt_type == 'xgb': 36 | return params_xgb 37 | raise ValueError('gbdt type should be one of (lgbm, cat, xgb)') 38 | 39 | 40 | def list_hyperparams(gbdt_type: str = 'lgbm', with_metadata: bool = False) -> List[Dict]: 41 | """ 42 | List all hyperparameters 43 | 44 | Args: 45 | gbdt_type: 46 | The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used. 47 | with_metadata: 48 | When set to True, parameters are wrapped by metadata dictionary which contains information about 49 | source URL, competition name etc. 50 | Returns: 51 | A list of hyper-parameters used in Kaggle gold medal solutions 52 | """ 53 | return _return(_get_table(gbdt_type), with_metadata) 54 | 55 | 56 | def get_hyperparam_byname(name: str, gbdt_type: str = 'lgbm', with_metadata: bool = False) -> Dict: 57 | """ 58 | Get a hyperparameter by parameter name 59 | 60 | Args: 61 | name: 62 | The name of parameter (e.g. "ieee-2019-10th"). 63 | gbdt_type: 64 | The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used. 65 | with_metadata: 66 | When set to True, parameters are wrapped by metadata dictionary which contains information about 67 | source URL, competition name etc. 68 | Returns: 69 | A hyperparameter dictionary. 
70 | """ 71 | param_table = _get_table(gbdt_type) 72 | found = first_true(param_table, pred=lambda x: x['name'] == name) 73 | if found is None: 74 | raise RuntimeError('Hyperparameter {} not found.'.format(name)) 75 | 76 | return _return(found, with_metadata) 77 | -------------------------------------------------------------------------------- /nyaggle/hyper_parameters/xgboost.py: -------------------------------------------------------------------------------- 1 | parameters = [ 2 | { 3 | "name": "ieee-2019-1st", 4 | "url": "https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600", 5 | "competition": "ieee-fraud-detection", 6 | "rank": 1, 7 | "metric": "auc", 8 | "parameters": { 9 | "max_depth": 12, 10 | "learning_rate": 0.02, 11 | "subsample": 0.8, 12 | "colsample_bytree": 0.4, 13 | "missing": -1, 14 | "eval_metric": "auc", 15 | "tree_method": "hist" 16 | } 17 | }, 18 | { 19 | "name": "womens-ml-competition-2019-1st", 20 | "url": "https://github.com/salmatfq/KaggleMarchMadnessFirstPlace/blob/master/win_ncaa_men.R", 21 | "competition": "womens-machine-learning-competition-2019", 22 | "rank": 1, 23 | "metric": "log-loss", 24 | "parameters": { 25 | "eval_metric": "mae", 26 | "booster": "gbtree", 27 | "eta": 0.02, 28 | "subsample": 0.35, 29 | "colsample_bytree": 0.7, 30 | "num_parallel_tree": 10, 31 | "min_child_weight": 40, 32 | "gamma": 10, 33 | "max_depth": 3 34 | } 35 | }, 36 | 37 | # 2018, Santander Value Prediction Challenge 38 | { 39 | "name": "santander-2018-5th", 40 | "url": "https://github.com/vlarine/kaggle/blob/master/santander-value-prediction-challenge/santander.py", 41 | "competition": "santander-value-prediction-challenge", 42 | "rank": 5, 43 | "metric": "rmsle", 44 | "parameters": { 45 | 'colsample_bytree': 0.055, 46 | 'colsample_bylevel': 0.4, 47 | 'gamma': 1.5, 48 | 'learning_rate': 0.01, 49 | 'max_depth': 5, 50 | 'objective': 'reg:linear', 51 | 'booster': 'gbtree', 52 | 'min_child_weight': 10, 53 | 'reg_alpha': 0, 54 | 'reg_lambda': 0, 55 | 'eval_metric': 'rmse', 56 | 'subsample': 0.7, 57 | } 58 | }, 59 | 60 | # 2018, Elo Merchant Category Recommendation 61 | { 62 | "name": "elo-2018-11th", 63 | "url": "https://github.com/kangzhang0709/2019-kaggle-elo-top-11-solution/blob/master/Models/model_xgb.ipynb", 64 | "competition": "elo-merchant-category-recommendation", 65 | "rank": 11, 66 | "metric": "rmse", 67 | "parameters": { 68 | 'objective': 'reg:linear', 69 | 'booster': 'gbtree', 70 | 'learning_rate': 0.01, 71 | 'max_depth': 10, 72 | 'gamma': 1.45, 73 | 'alpha': 0.1, 74 | 'lambda': 0.3, 75 | 'subsample': 0.9, 76 | 'colsample_bytree': 0.054, 77 | 'colsample_bylevel': 0.50 78 | } 79 | }, 80 | 81 | # 2018, DonorsChoose.org Application Screening 82 | { 83 | "name": "donorschoose-2018-1st", 84 | "url": "https://www.kaggle.com/shadowwarrior/1st-place-solution/notebook", 85 | "competition": "donorschoose-application-screening", 86 | "rank": 1, 87 | "metric": "auc", 88 | "parameters": { 89 | 'objective': 'binary:logistic', 90 | 'eval_metric': 'auc', 91 | 'eta': 0.01, 92 | 'max_depth': 7, 93 | 'subsample': 0.8, 94 | 'colsample_bytree': 0.4, 95 | 'min_child_weight': 10, 96 | 'gamma': 2 97 | } 98 | }, 99 | 100 | # 2018, Recruit Restaurant Visitor Forecasting 101 | 102 | # 2017, Instacart Market Basket Analysis 103 | { 104 | "name": "instacart-2017-2nd", 105 | "url": "https://github.com/KazukiOnodera/Instacart/blob/master/py_model/002_xgb_holdout_item_812_1.py", 106 | "competition": "instacart-market-basket-analysis", 107 | "rank": 2, 108 | "metric": "", 109 | "parameters": { 110 | 
'max_depth': 10, 111 | 'eta': 0.02, 112 | 'colsample_bytree': 0.4, 113 | 'subsample': 0.75, 114 | 'eval_metric': 'logloss', 115 | 'objective': 'binary:logistic', 116 | 'tree_method': 'hist' 117 | } 118 | }, 119 | 120 | # 2017, Two Sigma Connect; Rental Listing Inquiries 121 | { 122 | "name": "two-sigma-2017-1st", 123 | "url": "https://github.com/plantsgo/Rental-Listing-Inquiries/blob/master/xgb.py", 124 | "competition": "two-sigma-connect-rental-listing-inquiries", 125 | "rank": 1, 126 | "metric": "multi-class log-loss", 127 | "parameters": { 128 | 'booster': 'gbtree', 129 | 'objective': 'multi:softprob', 130 | 'eval_metric': 'mlogloss', 131 | 'gamma': 1, 132 | 'min_child_weight': 1.5, 133 | 'max_depth': 5, 134 | 'lambda': 10, 135 | 'subsample': 0.7, 136 | 'colsample_bytree': 0.7, 137 | 'colsample_bylevel': 0.7, 138 | 'eta': 0.03, 139 | 'tree_method': 'exact' 140 | } 141 | }, 142 | 143 | # 2016, Santander Product Recommendation 144 | { 145 | "name": "santander-2016-2nd", 146 | "url": "https://github.com/ttvand/Santander-Product-Recommendation/blob/master/First%20level%20learners/xgboost.R", 147 | "competition": "santander-product-recommendation", 148 | "rank": 2, 149 | "metric": "map7", 150 | "parameters": { 151 | "etaC": 10, 152 | "subsample": 1, 153 | "colsample_bytree": 0.5, 154 | "max_depth": 8, 155 | "min_child_weight": 0, 156 | "gamma": 0.1 157 | } 158 | }, 159 | 160 | # 2016, TalkingData Mobile User Demographics 161 | { 162 | "name": "talkingdata-2016-3rd-1", 163 | "url": "https://github.com/chechir/talking_data/blob/master/danijel/xgb/xgb_cv5_train_events.R", 164 | "competition": "talkingdata-mobile-user-demographics", 165 | "rank": 3, 166 | "metric": "multi-class log-loss", 167 | "parameters": { 168 | "booster": 'gbtree', 169 | "objective": 'reg:logistic', 170 | "eval_metric": 'logloss', 171 | "learning_rate": 0.025, 172 | "max_depth": 6, 173 | "subsample": 0.8, 174 | "colsample_bytree": 0.5, 175 | "colsample_bylevel": 0.5 176 | } 177 | }, 178 | { 179 | "name": "talkingdata-2016-3rd-2", 180 | "url": "https://github.com/chechir/talking_data/blob/master/danijel/xgb/xgb_cv5_train_noevents.R", 181 | "competition": "talkingdata-mobile-user-demographics", 182 | "rank": 3, 183 | "metric": "multi-class log-loss", 184 | "parameters": { 185 | "booster": 'gbtree', 186 | "objective": 'reg:logistic', 187 | "eval_metric": 'logloss', 188 | "learning_rate": 0.05, 189 | "max_depth": 2, 190 | "colsample_bytree": 0.8, 191 | "colsample_bylevel": 0.8 192 | } 193 | }, 194 | 195 | # 2016, Allstate Claims Severity 196 | { 197 | "name": "allstate-2016-3rd", 198 | "url": "https://www.kaggle.com/c/allstate-claims-severity/discussion/26447#150319", 199 | "competition": "allstate-claims-severity", 200 | "rank": 3, 201 | "metric": "mae", 202 | "parameters": { 203 | 'colsample_bytree': 0.4, 204 | 'subsample': 0.975, 205 | 'learning_rate': 0.015, 206 | 'gamma': 1.5, 207 | 'lambda': 2, 208 | 'alpha': 2, 209 | 'max_depth': 25, 210 | 'num_parallel_tree': 1, 211 | 'min_child_weight': 50, 212 | 'eval_metric': 'mae', 213 | 'max_delta_step': 0, 214 | } 215 | }, 216 | 217 | # 2016, Bosch Production Line Performance 218 | { 219 | "name": "bosch-2016-1st", 220 | "url": "https://www.kaggle.com/c/bosch-production-line-performance/discussion/25434#144628", 221 | "competition": "bosch-production-line-performance", 222 | "rank": 1, 223 | "metric": "mcc", 224 | "parameters": { 225 | "eval_metric": "auc", 226 | "alpha": 0, 227 | "booster": "gbtree", 228 | "colsample_bytree": 0.6, 229 | "minchildweight": 5, 230 | "subsample": 
0.9, 231 | "eta": 0.03, 232 | "objective": "binary:logistic", 233 | "max_depth": 14, 234 | "lambda": 4 235 | } 236 | }, 237 | ] 238 | -------------------------------------------------------------------------------- /nyaggle/testing/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.testing.util import * 2 | -------------------------------------------------------------------------------- /nyaggle/testing/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import uuid 5 | from contextlib import contextmanager 6 | from typing import Tuple 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.datasets import make_classification, make_regression 11 | 12 | 13 | def make_classification_df(n_samples: int = 1024, 14 | n_num_features: int = 20, 15 | n_cat_features: int = 0, 16 | class_sep: float = 1.0, 17 | n_classes: int = 2, 18 | feature_name: str = 'col_{}', 19 | target_name: str = 'target', 20 | random_state: int = 0, 21 | id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]: 22 | np.random.seed(random_state) 23 | X, y = make_classification(n_samples=n_samples, n_features=n_num_features, class_sep=class_sep, 24 | random_state=random_state, n_classes=n_classes, n_informative=max(n_classes, 2)) 25 | 26 | X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)]) 27 | y = pd.Series(y, name=target_name) 28 | 29 | if id_column is not None: 30 | X[id_column] = range(n_samples) 31 | 32 | for i in range(n_cat_features): 33 | X['cat_{}'.format(i)] = \ 34 | pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category') 35 | 36 | return X, y 37 | 38 | 39 | def make_regression_df(n_samples: int = 1024, 40 | n_num_features: int = 20, 41 | n_cat_features: int = 0, 42 | feature_name: str = 'col_{}', 43 | target_name: str = 'target', 44 | random_state: int = 0, 45 | id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]: 46 | np.random.seed(random_state) 47 | X, y = make_regression(n_samples=n_samples, n_features=n_num_features, 48 | random_state=random_state) 49 | 50 | X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)]) 51 | y = pd.Series(y, name=target_name) 52 | 53 | if id_column is not None: 54 | X[id_column] = range(n_samples) 55 | 56 | for i in range(n_cat_features): 57 | X['cat_{}'.format(i)] = \ 58 | pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype(str).astype('category') 59 | 60 | return X, y 61 | 62 | 63 | 64 | 65 | @contextmanager 66 | def get_temp_directory() -> str: 67 | path = None 68 | try: 69 | path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex) 70 | yield path 71 | finally: 72 | if path: 73 | shutil.rmtree(path, ignore_errors=True) 74 | 75 | -------------------------------------------------------------------------------- /nyaggle/util/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.util.plot_importance import plot_importance 2 | from nyaggle.util.traits import is_instance, is_gbdt_instance 3 | from nyaggle.util.submission import make_submission_df 4 | -------------------------------------------------------------------------------- /nyaggle/util/plot_importance.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | import 
seaborn as sns 6 | 7 | 8 | def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n: int = 100, 9 | figsize: Optional[Tuple[int, int]] = None, 10 | title: Optional[str] = None): 11 | """ 12 | Plot feature importance and optionally write it to an image file 13 | 14 | Args: 15 | importance: 16 | The dataframe which has "feature" and "importance" columns 17 | path: 18 | The file path where the image will be saved (the plot is not saved if None) 19 | top_n: 20 | The number of features to be visualized 21 | figsize: 22 | The size of the figure 23 | title: 24 | The title of the plot 25 | Example: 26 | >>> import pandas as pd 27 | >>> import lightgbm as lgb 28 | >>> from nyaggle.util import plot_importance 29 | >>> from sklearn.datasets import make_classification 30 | 31 | >>> X, y = make_classification() 32 | >>> X = pd.DataFrame(X, columns=['col{}'.format(i) for i in range(X.shape[1])]) 33 | >>> booster = lgb.train({'objective': 'binary'}, lgb.Dataset(X, y)) 34 | >>> importance = pd.DataFrame({ 35 | >>> 'feature': X.columns, 36 | >>> 'importance': booster.feature_importance('gain') 37 | >>> }) 38 | >>> plot_importance(importance, 'importance.png') 39 | """ 40 | importance = importance.groupby('feature')['importance'] \ 41 | .mean() \ 42 | .reset_index() \ 43 | .sort_values(by='importance', ascending=False) 44 | 45 | if len(importance) > top_n: 46 | importance = importance.iloc[:top_n, :] 47 | 48 | if figsize is None: 49 | figsize = (10, 16) 50 | 51 | if title is None: 52 | title = 'Feature Importance' 53 | 54 | plt.figure(figsize=figsize) 55 | sns.barplot(x="importance", y="feature", data=importance) 56 | plt.title(title) 57 | plt.tight_layout() 58 | if path is not None: 59 | plt.savefig(path) 60 | -------------------------------------------------------------------------------- /nyaggle/util/submission.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def make_submission_df(test_prediction: np.ndarray, sample_submission: Optional[pd.DataFrame] = None, 8 | y: Optional[pd.Series] = None) -> pd.DataFrame: 9 | """ 10 | Make a dataframe formatted in Kaggle submission style. 11 | 12 | Args: 13 | test_prediction: 14 | A test prediction to be formatted. 15 | sample_submission: 16 | A sample dataframe aligned with the test data (in Kaggle it is usually provided as sample_submission.csv). 17 | The submission file will be created with the same schema as this dataframe. 18 | y: 19 | The target variable, used for inferring the column names. Ignored if ``sample_submission`` is passed. 
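Example: A minimal sketch of the no-``sample_submission`` path; with a 1-D prediction the id column falls back to ``'id'`` and the target column name is taken from ``y.name``: >>> import numpy as np >>> import pandas as pd >>> make_submission_df(np.array([0.2, 0.8]), y=pd.Series([0, 1, 0], name='target')) id target 0 0 0.2 1 1 0.8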
20 | Returns: 21 | The formatted dataframe 22 | """ 23 | if sample_submission is not None: 24 | submit_df = sample_submission.copy() 25 | 26 | if test_prediction.ndim > 1 and test_prediction.shape[1] > 1: 27 | n_id_cols = submit_df.shape[1] - test_prediction.shape[1] 28 | for i in range(test_prediction.shape[1]): 29 | submit_df.iloc[:, n_id_cols + i] = test_prediction[:, i] 30 | else: 31 | submit_df.iloc[:, -1] = test_prediction 32 | else: 33 | submit_df = pd.DataFrame() 34 | id_col_name = y.index.name if y is not None and y.index.name else 'id' 35 | 36 | submit_df[id_col_name] = np.arange(len(test_prediction)) 37 | 38 | if test_prediction.ndim > 1 and test_prediction.shape[1] > 1: 39 | tgt_col_names = sorted(y.unique()) if y is not None else [str(i) for i in range(test_prediction.shape[1])] 40 | for i, y in enumerate(tgt_col_names): 41 | submit_df[y] = test_prediction[:, i] 42 | else: 43 | tgt_col_name = y.name if y is not None and y.name else 'target' 44 | submit_df[tgt_col_name] = test_prediction 45 | 46 | return submit_df 47 | -------------------------------------------------------------------------------- /nyaggle/util/traits.py: -------------------------------------------------------------------------------- 1 | # Original work of safe_instance: 2 | # https://github.com/slundberg/shap/blob/master/shap/common.py 3 | # ----------------------------------------------------------------------------- 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2018 Scott Lundberg 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | # ----------------------------------------------------------------------------- 26 | 27 | import importlib 28 | from typing import List, Tuple, Union 29 | 30 | 31 | def is_instance(obj, class_path_str: Union[str, List, Tuple]) -> bool: 32 | """ 33 | Acts as a safe version of isinstance without having to explicitly 34 | import packages which may not exist in the users environment. 35 | Checks if obj is an instance of type specified by class_path_str. 
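This is what lets nyaggle check objects against optional backends without importing them eagerly: when the module in ``class_path_str`` cannot be found, the check returns False instead of raising, so e.g. ``is_instance(obj, 'lightgbm.sklearn.LGBMModel')`` is safe even if lightgbm is not installed (``is_gbdt_instance`` below relies on exactly this behavior).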
36 | Parameters 37 | ---------- 38 | obj: Any 39 | Some object you want to test against 40 | class_path_str: str or list 41 | A string or list of strings specifying full class paths 42 | Example: `sklearn.ensemble.RandomForestRegressor` 43 | Returns 44 | ------- 45 | bool: True if isinstance is true and the package exists, False otherwise 46 | """ 47 | if isinstance(class_path_str, str): 48 | class_path_strs = [class_path_str] 49 | elif isinstance(class_path_str, list) or isinstance(class_path_str, tuple): 50 | class_path_strs = class_path_str 51 | else: 52 | class_path_strs = [''] 53 | 54 | # try each module path in order 55 | for class_path_str in class_path_strs: 56 | if "." not in class_path_str: 57 | raise ValueError("class_path_str must be a string or list of strings specifying a full \ 58 | module path to a class. Eg, 'sklearn.ensemble.RandomForestRegressor'") 59 | 60 | # Splits on last occurrence of "." 61 | module_name, class_name = class_path_str.rsplit(".", 1) 62 | 63 | # Check module exists 64 | try: 65 | spec = importlib.util.find_spec(module_name) 66 | except Exception:  # find_spec itself may raise for malformed or partially-missing module paths 67 | spec = None 68 | if spec is None: 69 | continue 70 | 71 | module = importlib.import_module(module_name) 72 | 73 | # Get class 74 | _class = getattr(module, class_name, None) 75 | if _class is None: 76 | continue 77 | 78 | if isinstance(obj, _class): 79 | return True 80 | 81 | return False 82 | 83 | 84 | def is_gbdt_instance(obj, algorithm_type: Union[str, Tuple]) -> bool: 85 | if isinstance(algorithm_type, str): 86 | algorithm_type = (algorithm_type,) 87 | 88 | gbdt_instance_name = { 89 | 'lgbm': 'lightgbm.sklearn.LGBMModel', 90 | 'xgb': 'xgboost.sklearn.XGBModel', 91 | 'cat': 'catboost.core.CatBoost' 92 | } 93 | 94 | return is_instance(obj, tuple(gbdt_instance_name[t] for t in algorithm_type)) 95 | -------------------------------------------------------------------------------- /nyaggle/validation/__init__.py: -------------------------------------------------------------------------------- 1 | from nyaggle.validation.cross_validate import cross_validate 2 | from nyaggle.validation.adversarial_validate import adversarial_validate 3 | from nyaggle.validation.split import \ 4 | check_cv, TimeSeriesSplit, SlidingWindowSplit, Take, Nth, Skip, StratifiedGroupKFold 5 | -------------------------------------------------------------------------------- /nyaggle/validation/adversarial_validate.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from typing import Iterable, List, Optional, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import BaseEstimator 7 | from sklearn.metrics import roc_auc_score 8 | from sklearn.model_selection import KFold, BaseCrossValidator 9 | 10 | from nyaggle.environment import requires_lightgbm 11 | from nyaggle.util import is_instance 12 | from nyaggle.validation.cross_validate import cross_validate 13 | from nyaggle.validation.split import Take 14 | 15 | ADVResult = namedtuple('ADVResult', ['auc', 'importance']) 16 | 17 | 18 | def adversarial_validate(X_train: pd.DataFrame, 19 | X_test: pd.DataFrame, 20 | importance_type: str = 'gain', 21 | estimator: Optional[BaseEstimator] = None, 22 | categorical_feature: List[str] = None, 23 | cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None) -> ADVResult: 24 | """ 25 | Perform adversarial validation between X_train and X_test. 
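The two frames are concatenated and a classifier is trained to separate them (train rows are labeled 1 and test rows 0 in the implementation below). An AUC close to 0.5 means train and test are nearly indistinguishable, while an AUC close to 1.0 indicates a distribution shift; the returned importance then ranks the features driving that shift.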
26 | 27 | Args: 28 | X_train: 29 | Training data 30 | X_test: 31 | Test data 32 | importance_type: 33 | The type of feature importance calculated. 34 | estimator: 35 | The custom estimator. If None, LGBMClassifier is automatically used. 36 | Only LGBMModel or CatBoost instances are supported. 37 | categorical_feature: 38 | List of categorical column names. If ``None``, categorical columns are automatically determined by dtype. 39 | cv: 40 | Cross validation split. If ``None``, the first fold out of 5 folds is used as validation. 41 | Returns: 42 | Namedtuple with following members 43 | 44 | * auc: 45 | float, ROC AUC score of adversarial validation. 46 | * importance: 47 | pandas DataFrame, feature importance of adversarial model (ordered by importance) 48 | 49 | Example: 50 | >>> from sklearn.model_selection import train_test_split 51 | >>> from nyaggle.testing import make_regression_df 52 | >>> from nyaggle.validation import adversarial_validate 53 | 54 | >>> X, y = make_regression_df(n_samples=8) 55 | >>> X_train, X_test, y_train, y_test = train_test_split(X, y) 56 | >>> auc, importance = adversarial_validate(X_train, X_test) 57 | >>> 58 | >>> print(auc) 59 | 0.51078231 60 | >>> importance.head() 61 | feature importance 62 | col_1 231.5827204 63 | col_5 207.1837266 64 | col_7 188.6920685 65 | col_4 174.5668498 66 | col_9 170.6438643 67 | """ 68 | concat = pd.concat([X_train, X_test]).copy().reset_index(drop=True) 69 | y = np.array([1] * len(X_train) + [0] * len(X_test)) 70 | 71 | if estimator is None: 72 | requires_lightgbm() 73 | from lightgbm import LGBMClassifier 74 | estimator = LGBMClassifier(n_estimators=10000, objective='binary', importance_type=importance_type, 75 | random_state=0) 76 | else: 77 | assert is_instance(estimator, ('lightgbm.sklearn.LGBMModel', 'catboost.core.CatBoost')), \ 78 | 'Only CatBoostClassifier or LGBMClassifier is allowed' 79 | 80 | if cv is None: 81 | cv = Take(1, KFold(5, shuffle=True, random_state=0)) 82 | 83 | fit_params = {'verbose': -1} 84 | if categorical_feature: 85 | fit_params['categorical_feature'] = categorical_feature 86 | 87 | result = cross_validate(estimator, concat, y, None, cv=cv, 88 | eval_func=roc_auc_score, fit_params=fit_params, importance_type=importance_type) 89 | 90 | importance = pd.concat(result.importance) 91 | importance = importance.groupby('feature')['importance'].mean().reset_index() 92 | importance.sort_values(by='importance', ascending=False, inplace=True) 93 | importance.reset_index(drop=True, inplace=True) 94 | 95 | return ADVResult(result.scores[-1], importance) 96 | -------------------------------------------------------------------------------- /nyaggle/validation/cross_validate.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | import warnings 4 | from collections import namedtuple 5 | from logging import Logger, getLogger 6 | from typing import Any, Callable, Dict, Iterable, List, Optional, Union 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import sklearn.utils.multiclass as multiclass 11 | from category_encoders.utils import convert_input, convert_input_vector 12 | from sklearn.base import BaseEstimator 13 | from sklearn.model_selection import BaseCrossValidator 14 | 15 | from nyaggle.util.traits import is_gbdt_instance 16 | from nyaggle.validation.split import check_cv 17 | 18 | CVResult = namedtuple('CVResult', ['oof_prediction', 'test_prediction', 'scores', 'importance']) 19 | 20 | 21 | def cross_validate(estimator: Union[BaseEstimator, 
List[BaseEstimator]], 22 | X_train: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], 23 | X_test: Union[pd.DataFrame, np.ndarray] = None, 24 | cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, 25 | groups: Optional[pd.Series] = None, 26 | eval_func: Optional[Callable] = None, logger: Optional[Logger] = None, 27 | on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None, 28 | fit_params: Optional[Union[Dict[str, Any], Callable]] = None, 29 | importance_type: str = 'gain', 30 | early_stopping: bool = True, 31 | type_of_target: str = 'auto') -> CVResult: 32 | """ 33 | Evaluate metrics by cross-validation. It also records out-of-fold prediction and test prediction. 34 | 35 | Args: 36 | estimator: 37 | The object to be used in cross-validation. For list inputs, ``estimator[i]`` is trained on i-th fold. 38 | X_train: 39 | Training data 40 | y: 41 | Target 42 | X_test: 43 | Test data (Optional). If specified, prediction on the test data is performed using ensemble of models. 44 | cv: 45 | int, cross-validation generator or an iterable which determines the cross-validation splitting strategy. 46 | 47 | - None, to use the default ``KFold(5, random_state=0, shuffle=True)``, 48 | - integer, to specify the number of folds in a ``(Stratified)KFold``, 49 | - CV splitter (the instance of ``BaseCrossValidator``), 50 | - An iterable yielding (train, test) splits as arrays of indices. 51 | groups: 52 | Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``). 53 | eval_func: 54 | Function used for logging and returning scores 55 | logger: 56 | logger 57 | on_each_fold: 58 | called for each fold with (idx_fold, model, X_fold, y_fold) 59 | fit_params: 60 | Parameters passed to the fit method of the estimator 61 | importance_type: 62 | The type of feature importance to be used to calculate result. 63 | Used only in ``LGBMClassifier`` and ``LGBMRegressor``. 64 | early_stopping: 65 | If ``True``, ``eval_set`` will be added to ``fit_params`` for each fold. 66 | ``early_stopping_rounds = 100`` will also be appended to fit_params if it does not already have one. 67 | type_of_target: 68 | The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``. 69 | Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported. 70 | Returns: 71 | Namedtuple with following members 72 | 73 | * oof_prediction (numpy array, shape (len(X_train),)): 74 | The predicted value on out-of-fold validation data. 75 | * test_prediction (numpy array, shape (len(X_test),)): 76 | The predicted value on test data. ``None`` if X_test is ``None``. 77 | * scores (list of float, shape (nfolds+1,)): 78 | ``scores[i]`` denotes validation score in i-th fold. 79 | ``scores[-1]`` is the overall score. ``None`` if ``eval_func`` is not specified. 80 | * importance (list of pandas DataFrame, shape (nfolds,)): 81 | ``importance[i]`` denotes feature importance in i-th fold model. 82 | If the estimator is not GBDT, an empty array is returned. 
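Note: ``fit_params`` may also be a callable. As the fold loop below shows, it is then invoked per fold as ``fit_params(fold_index, train_indices, valid_indices)`` and must return the dict of parameters for that fold's ``fit`` call.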
83 | 84 | Example: 85 | >>> from sklearn.datasets import make_regression 86 | >>> from sklearn.linear_model import Ridge 87 | >>> from sklearn.metrics import mean_squared_error 88 | >>> from nyaggle.validation import cross_validate 89 | 90 | >>> X, y = make_regression(n_samples=8) 91 | >>> model = Ridge(alpha=1.0) 92 | >>> pred_oof, pred_test, scores, _ = \ 93 | >>> cross_validate(model, 94 | >>> X_train=X[:3, :], 95 | >>> y=y[:3], 96 | >>> X_test=X[3:, :], 97 | >>> cv=3, 98 | >>> eval_func=mean_squared_error) 99 | >>> print(pred_oof) 100 | [-101.1123267 , 26.79300693, 17.72635528] 101 | >>> print(pred_test) 102 | [-10.65095894 -12.18909059 -23.09906427 -17.68360714 -20.08218267] 103 | >>> print(scores) 104 | [71912.80290003832, 15236.680239881942, 15472.822033121925, 34207.43505768073] 105 | """ 106 | cv = check_cv(cv, y) 107 | n_output_cols = 1 108 | if type_of_target == 'auto': 109 | type_of_target = multiclass.type_of_target(y) 110 | if type_of_target == 'multiclass': 111 | n_output_cols = y.nunique(dropna=True) 112 | 113 | if isinstance(estimator, list): 114 | assert len(estimator) == cv.get_n_splits(), "Number of estimators should be same to nfolds." 115 | 116 | X_train = convert_input(X_train) 117 | y = convert_input_vector(y, X_train.index) 118 | if X_test is not None: 119 | X_test = convert_input(X_test) 120 | 121 | if not isinstance(estimator, list): 122 | estimator = [estimator] * cv.get_n_splits() 123 | 124 | assert len(estimator) == cv.get_n_splits() 125 | 126 | if logger is None: 127 | logger = getLogger(__name__) 128 | 129 | def _predict(model: BaseEstimator, x: pd.DataFrame, _type_of_target: str): 130 | if _type_of_target in ('binary', 'multiclass'): 131 | if hasattr(model, "predict_proba"): 132 | proba = model.predict_proba(x) 133 | elif hasattr(model, "decision_function"): 134 | warnings.warn('Since {} does not have predict_proba method, ' 135 | 'decision_function is used for the prediction instead.'.format(type(model))) 136 | proba = model.decision_function(x) 137 | else: 138 | raise RuntimeError('Estimator in classification problem should have ' 139 | 'either predict_proba or decision_function') 140 | if proba.ndim == 1: 141 | return proba 142 | else: 143 | return proba[:, 1] if proba.shape[1] == 2 else proba 144 | else: 145 | return model.predict(x) 146 | 147 | oof = np.zeros((len(X_train), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_train)) 148 | evaluated = np.full(len(X_train), False) 149 | test = None 150 | if X_test is not None: 151 | test = np.zeros((len(X_test), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_test)) 152 | 153 | scores = [] 154 | eta_all = [] 155 | importance = [] 156 | 157 | for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y, groups)): 158 | start_time = time.time() 159 | 160 | train_x, train_y = X_train.iloc[train_idx], y.iloc[train_idx] 161 | valid_x, valid_y = X_train.iloc[valid_idx], y.iloc[valid_idx] 162 | 163 | if fit_params is None: 164 | fit_params_fold = {} 165 | elif callable(fit_params): 166 | fit_params_fold = fit_params(n, train_idx, valid_idx) 167 | else: 168 | fit_params_fold = copy.copy(fit_params) 169 | 170 | if is_gbdt_instance(estimator[n], ('lgbm', 'cat', 'xgb')): 171 | if early_stopping: 172 | if 'eval_set' not in fit_params_fold: 173 | fit_params_fold['eval_set'] = [(valid_x, valid_y)] 174 | if 'early_stopping_rounds' not in fit_params_fold: 175 | fit_params_fold['early_stopping_rounds'] = 100 176 | 177 | estimator[n].fit(train_x, train_y, **fit_params_fold) 178 | else: 179 | 
estimator[n].fit(train_x, train_y, **fit_params_fold) 180 | 181 | oof[valid_idx] = _predict(estimator[n], valid_x, type_of_target) 182 | evaluated[valid_idx] = True 183 | 184 | if X_test is not None: 185 | test += _predict(estimator[n], X_test, type_of_target) 186 | 187 | if on_each_fold is not None: 188 | on_each_fold(n, estimator[n], train_x, train_y) 189 | 190 | if is_gbdt_instance(estimator[n], ('lgbm', 'cat', 'xgb')): 191 | importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type)) 192 | 193 | if eval_func is not None: 194 | score = eval_func(valid_y, oof[valid_idx]) 195 | scores.append(score) 196 | logger.info('Fold {} score: {}'.format(n, score)) 197 | 198 | elapsed = time.time() - start_time 199 | eta_all.append(elapsed) 200 | logger.debug('{:.3f} sec / fold'.format(elapsed)) 201 | 202 | if eval_func is not None: 203 | score = eval_func(y.loc[evaluated], oof[evaluated]) 204 | scores.append(score) 205 | logger.info('Overall score: {}'.format(score)) 206 | 207 | if X_test is not None: 208 | predicted = test / cv.get_n_splits(X_train, y, groups) 209 | else: 210 | predicted = None 211 | 212 | return CVResult(oof, predicted, scores, importance) 213 | 214 | 215 | def _get_gbdt_importance(gbdt_model: BaseEstimator, features: List[str], 216 | importance_type: str) -> pd.DataFrame: 217 | df = pd.DataFrame() 218 | 219 | df['feature'] = features 220 | 221 | if is_gbdt_instance(gbdt_model, 'cat'): 222 | df['importance'] = gbdt_model.get_feature_importance() 223 | elif is_gbdt_instance(gbdt_model, 'xgb'): 224 | df['importance'] = gbdt_model.feature_importances_ 225 | elif is_gbdt_instance(gbdt_model, 'lgbm'): 226 | df['importance'] = gbdt_model.booster_.feature_importance(importance_type=importance_type) 227 | 228 | return df 229 | -------------------------------------------------------------------------------- /nyaggle/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.6' 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | torch 2 | mlflow 3 | catboost 4 | lightgbm<4.0.0 5 | xgboost 6 | mecab-python3>=1.0.0 7 | flake8 8 | pytest 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders 2 | matplotlib 3 | more-itertools 4 | numpy 5 | optuna>=1.0.0 6 | pandas 7 | pyarrow 8 | seaborn 9 | scikit-learn 10 | tqdm 11 | transformers[ja] 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from codecs import open 2 | from os import path 3 | 4 | from setuptools import find_packages, setup 5 | 6 | 7 | def get_long_description(): 8 | here = path.abspath(path.dirname(__file__)) 9 | 10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | return long_description 13 | 14 | 15 | def get_version(): 16 | version_filepath = path.join(path.dirname(__file__), 'nyaggle', 'version.py') 17 | with open(version_filepath) as f: 18 | for line in f: 19 | if line.startswith('__version__'): 20 | return line.strip().split()[-1][1:-1] 21 | 22 | 23 | setup( 24 | name='nyaggle', 25 | packages=find_packages(), 26 | 27 | version=get_version(), 28 | 29 | license='MIT', 30 
| 31 | install_requires=[ 32 | 'category_encoders', 33 | 'matplotlib', 34 | 'more-itertools', 35 | 'numpy', 36 | 'optuna>=1.0.0', 37 | 'pandas', 38 | 'pyarrow', 39 | 'seaborn', 40 | 'scikit-learn', 41 | 'tqdm', 42 | 'transformers>=2.3.0', 43 | ], 44 | 45 | extras_require={ 46 | 'all': ['catboost>=0.17', 'lightgbm', 'xgboost', 'torch', 'mlflow'] 47 | }, 48 | 49 | author='nyanp', 50 | author_email='Noumi.Taiga@gmail.com', 51 | url='https://github.com/nyanp/nyaggle', 52 | description='Code for Kaggle and Offline Competitions.', 53 | long_description=get_long_description(), 54 | long_description_content_type='text/markdown', 55 | keywords='nyaggle kaggle', 56 | classifiers=[ 57 | 'License :: OSI Approved :: MIT License', 58 | 'Programming Language :: Python :: 3.8', 59 | 'Programming Language :: Python :: 3.9', 60 | 'Programming Language :: Python :: 3.10', 61 | 'Programming Language :: Python :: 3.11' 62 | ] 63 | ) 64 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyanp/nyaggle/636532292d7ce3468cd47a3337bc50d620f0d23b/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import uuid 5 | 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope='function', autouse=True) 10 | def tmpdir_name(): 11 | path = None 12 | try: 13 | path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex) 14 | yield path 15 | finally: 16 | if path: 17 | shutil.rmtree(path, ignore_errors=True) 18 | -------------------------------------------------------------------------------- /tests/ensemble/test_averaging.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as stats 2 | from numpy.testing import assert_array_almost_equal 3 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 4 | from sklearn.linear_model import Ridge, LogisticRegression 5 | from sklearn.metrics import roc_auc_score, mean_squared_error 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.svm import SVC, SVR 8 | from sklearn.utils.multiclass import type_of_target 9 | 10 | from nyaggle.ensemble import averaging, averaging_opt 11 | from nyaggle.testing import make_classification_df, make_regression_df 12 | from nyaggle.validation import cross_validate 13 | 14 | 15 | def _make_1st_stage_preds(X, y, X_test): 16 | if type_of_target(y) == 'continuous': 17 | models = [ 18 | SVR(), 19 | Ridge(random_state=0), 20 | RandomForestRegressor(n_estimators=30, random_state=0) 21 | ] 22 | else: 23 | models = [ 24 | SVC(random_state=0), 25 | LogisticRegression(random_state=0), 26 | RandomForestClassifier(n_estimators=30, random_state=0) 27 | ] 28 | 29 | results = [cross_validate(m, X, y, X_test, cv=5) for m in models] 30 | 31 | return [r.oof_prediction for r in results], [r.test_prediction for r in results] 32 | 33 | 34 | def test_averaging(): 35 | X, y = make_classification_df() 36 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 37 | 38 | _, test = _make_1st_stage_preds(X_train, y_train, X_test) 39 | 40 | result = averaging(test) 41 | 42 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 43 | assert result.score is None 44 | assert 
result.oof_prediction is None 45 | 46 | 47 | def test_averaging_with_oof(): 48 | X, y = make_classification_df() 49 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 50 | 51 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 52 | 53 | result = averaging(test, oof, y_train) 54 | 55 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 56 | assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction) 57 | assert result.score is None 58 | 59 | 60 | def test_averaging_regression(): 61 | X, y = make_regression_df() 62 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 63 | 64 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 65 | 66 | result = averaging(test, oof, y_train) 67 | 68 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 69 | assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction) 70 | assert result.score is None 71 | 72 | 73 | def test_averaging_multiclass(): 74 | X, y = make_classification_df(n_classes=5) 75 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 76 | 77 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 78 | 79 | result = averaging(test, oof, y_train) 80 | 81 | assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction) 82 | assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction) 83 | assert result.score is None 84 | 85 | 86 | def test_averaging_with_metrics(): 87 | X, y = make_classification_df() 88 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 89 | 90 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 91 | 92 | result = averaging(test, oof, y_train, eval_func=roc_auc_score) 93 | 94 | assert result.score == roc_auc_score(y_train, result.oof_prediction) 95 | 96 | 97 | def test_weight_averaging(): 98 | X, y = make_classification_df() 99 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 100 | 101 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 102 | 103 | result = averaging(test, oof, y_train, weights=[0.2, 0.4, 0.3]) 104 | 105 | assert_array_almost_equal(0.2 * test[0] + 0.4 * test[1] + 0.3 * test[2], result.test_prediction) 106 | assert_array_almost_equal(0.2 * oof[0] + 0.4 * oof[1] + 0.3 * oof[2], result.oof_prediction) 107 | assert result.score is None 108 | 109 | 110 | def test_rank_averaging(): 111 | X, y = make_classification_df(n_samples=1024) 112 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 113 | 114 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 115 | 116 | result = averaging(test, rank_averaging=True) 117 | 118 | test_rank = [stats.rankdata(t) / len(X_test) for t in test] 119 | 120 | assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction) 121 | assert result.score is None 122 | 123 | 124 | def test_rank_averaging_with_oof(): 125 | X, y = make_classification_df(n_samples=1024) 126 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 127 | 128 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 129 | 130 | result = averaging(test, oof, y_train, rank_averaging=True) 131 | 132 | oof_rank = [stats.rankdata(o) / len(X_train) for o in oof] 133 | test_rank = [stats.rankdata(t) / len(X_test) for t in test] 134 | 135 | assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, 
result.test_prediction) 136 | assert_array_almost_equal((oof_rank[0] + oof_rank[1] + oof_rank[2]) / 3, result.oof_prediction) 137 | assert result.score is None 138 | 139 | 140 | def test_averaging_opt_maximize(): 141 | X, y = make_classification_df(n_samples=1024) 142 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 143 | 144 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 145 | 146 | best_single_model = max(roc_auc_score(y_train, oof[0]), 147 | roc_auc_score(y_train, oof[1]), 148 | roc_auc_score(y_train, oof[2])) 149 | 150 | result = averaging_opt(test, oof, y_train, roc_auc_score, higher_is_better=True) 151 | 152 | assert result.score >= best_single_model 153 | 154 | result_simple_avg = averaging(test, oof, y_train, eval_func=roc_auc_score) 155 | 156 | assert result.score >= result_simple_avg.score 157 | 158 | 159 | def test_averaging_opt_minimize(): 160 | X, y = make_regression_df(n_samples=1024) 161 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 162 | 163 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 164 | 165 | best_single_model = min(mean_squared_error(y_train, oof[0]), 166 | mean_squared_error(y_train, oof[1]), 167 | mean_squared_error(y_train, oof[2])) 168 | 169 | result = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False) 170 | 171 | assert result.score <= best_single_model 172 | 173 | result_simple_avg = averaging(test, oof, y_train, eval_func=mean_squared_error) 174 | 175 | assert result.score <= result_simple_avg.score 176 | 177 | 178 | def test_averaging_opt_minimize_with_method(): 179 | X, y = make_regression_df(n_samples=1024) 180 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 181 | 182 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 183 | 184 | best_single_model = min(mean_squared_error(y_train, oof[0]), 185 | mean_squared_error(y_train, oof[1]), 186 | mean_squared_error(y_train, oof[2])) 187 | 188 | result1 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False) 189 | result2 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False, method='Nelder-Mead') 190 | result3 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False, method='SLSQP') 191 | 192 | assert result1.score != result2.score 193 | assert result1.score == result3.score 194 | 195 | assert result1.score <= best_single_model 196 | assert result2.score <= best_single_model 197 | 198 | 199 | def test_rank_averaging_opt_maximize(): 200 | X, y = make_classification_df(n_samples=1024) 201 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 202 | 203 | oof, test = _make_1st_stage_preds(X_train, y_train, X_test) 204 | 205 | best_single_model = max(roc_auc_score(y_train, oof[0]), 206 | roc_auc_score(y_train, oof[1]), 207 | roc_auc_score(y_train, oof[2])) 208 | 209 | result = averaging_opt(test, oof, y_train, roc_auc_score, higher_is_better=True, rank_averaging=True) 210 | 211 | assert result.score >= best_single_model 212 | 213 | result_simple_avg = averaging(test, oof, y_train, eval_func=roc_auc_score, rank_averaging=True) 214 | 215 | assert result.score >= result_simple_avg.score 216 | -------------------------------------------------------------------------------- /tests/ensemble/test_stacking.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 2 | from 
--------------------------------------------------------------------------------
/tests/ensemble/test_stacking.py:
--------------------------------------------------------------------------------
1 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
2 | from sklearn.linear_model import Ridge, LogisticRegression
3 | from sklearn.metrics import mean_squared_error, roc_auc_score
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.svm import SVC, SVR
6 | from sklearn.utils.multiclass import type_of_target
7 |
8 | from nyaggle.ensemble import stacking
9 | from nyaggle.testing import make_classification_df, make_regression_df
10 | from nyaggle.validation import cross_validate
11 |
12 |
13 | def _make_1st_stage_preds(X, y, X_test):
14 |     if type_of_target(y) == 'continuous':
15 |         models = [
16 |             SVR(),
17 |             Ridge(random_state=0),
18 |             RandomForestRegressor(n_estimators=30, random_state=0)
19 |         ]
20 |     else:
21 |         models = [
22 |             SVC(random_state=0),
23 |             LogisticRegression(random_state=0),
24 |             RandomForestClassifier(n_estimators=30, random_state=0)
25 |         ]
26 |
27 |     results = [cross_validate(m, X, y, X_test, cv=5) for m in models]
28 |
29 |     return [r.oof_prediction for r in results], [r.test_prediction for r in results]
30 |
31 |
32 | def test_stacking_classification():
33 |     X, y = make_classification_df()
34 |     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
35 |
36 |     oof, test = _make_1st_stage_preds(X_train, y_train, X_test)
37 |
38 |     worst_base_roc = min(roc_auc_score(y_train, _oof) for _oof in oof)
39 |
40 |     result = stacking(test, oof, y_train, eval_func=roc_auc_score)
41 |
42 |     assert roc_auc_score(y_train, result.oof_prediction) > worst_base_roc
43 |
44 |
45 | def test_stacking_regression():
46 |     X, y = make_regression_df()
47 |     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
48 |
49 |     oof, test = _make_1st_stage_preds(X_train, y_train, X_test)
50 |
51 |     worst_base_mse = max(mean_squared_error(y_train, _oof) for _oof in oof)
52 |
53 |     result = stacking(test, oof, y_train, eval_func=mean_squared_error)
54 |
55 |     assert mean_squared_error(y_train, result.oof_prediction) < worst_base_mse
56 |
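For reference, the stacking pattern these tests exercise: concatenate the first-stage out-of-fold predictions into a meta-feature matrix, fit a second-stage model on it, and apply that model to the stacked test predictions. A hedged sketch with plain scikit-learn (the second-stage learner nyaggle uses may differ):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def stack_binary(test, oof, y):
    X_meta_train = np.column_stack(oof)   # (n_train, n_models)
    X_meta_test = np.column_stack(test)   # (n_test, n_models)
    meta = LogisticRegression()
    # out-of-fold predictions of the meta model itself, for honest scoring
    oof_pred = cross_val_predict(meta, X_meta_train, y, cv=5,
                                 method='predict_proba')[:, 1]
    meta.fit(X_meta_train, y)
    return meta.predict_proba(X_meta_test)[:, 1], oof_pred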
--------------------------------------------------------------------------------
/tests/experiment/test_experiment.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pytest
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
8 | from nyaggle.experiment import Experiment
9 |
10 |
11 | def test_log_params(tmpdir_name):
12 |     with Experiment(tmpdir_name) as e:
13 |         e.log_param('x', 1)
14 |         e.log_param('x', 2)
15 |         e.log_params({
16 |             'y': 'ABC',
17 |             'z': None,
18 |         })
19 |
20 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
21 |         params = json.load(f)
22 |
23 |     expected = {
24 |         'x': 2,  # if the key is duplicated, the latter one is stored
25 |         'y': 'ABC',
26 |         'z': 'None'  # all non-numerical values are cast to string before logging
27 |     }
28 |     assert params == expected
29 |
30 |
31 | def test_log_params_empty(tmpdir_name):
32 |     with Experiment(tmpdir_name):
33 |         pass
34 |
35 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
36 |         params = json.load(f)
37 |         assert params == {}
38 |
39 |
40 | def test_log_metrics(tmpdir_name):
41 |     with Experiment(tmpdir_name) as e:
42 |         e.log_metric('x', 1)
43 |         e.log_metric('x', 2)
44 |         e.log_metrics({
45 |             'y': 3,
46 |             'z': 4,
47 |         })
48 |
49 |     with open(os.path.join(tmpdir_name, 'metrics.json'), 'r') as f:
50 |         params = json.load(f)
51 |
52 |     expected = {
53 |         'x': 2,
54 |         'y': 3,
55 |         'z': 4,
56 |     }
57 |     assert params == expected
58 |
59 |
60 | def test_log_metrics_empty(tmpdir_name):
61 |     with Experiment(tmpdir_name):
62 |         pass
63 |
64 |     with open(os.path.join(tmpdir_name, 'metrics.json'), 'r') as f:
65 |         params = json.load(f)
66 |         assert params == {}
67 |
68 |
69 | def test_log_dict(tmpdir_name):
70 |     with Experiment(tmpdir_name) as e:
71 |         e.log_dict('foo', {'a': 1, 'b': 'foo', 'c': {'d': 'e', 'f': {}, 'g': {'h': 'i'}, 'j': None}})
72 |
73 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
74 |         params = json.load(f)
75 |         assert params == {
76 |             'foo.a': 1,
77 |             'foo.b': 'foo',
78 |             'foo.c.d': 'e',
79 |             'foo.c.f': '{}',
80 |             'foo.c.g.h': 'i',
81 |             'foo.c.j': 'None'
82 |         }
83 |
84 |
85 | def test_error_while_experiment(tmpdir_name):
86 |     try:
87 |         with Experiment(tmpdir_name) as e:
88 |             e.log_metric('x', 0.5)
89 |             e.log_param('foo', 'bar')
90 |             e.log_numpy('np', np.zeros(100))
91 |             e.log_dataframe('df', pd.DataFrame({'a': [1, 2, 3]}))
92 |
93 |             raise KeyboardInterrupt()
94 |     except KeyboardInterrupt:
95 |         pass
96 |
97 |     # all logs are saved even if an error is raised inside the experiment
98 |     with open(os.path.join(tmpdir_name, 'metrics.json'), 'r') as f:
99 |         metrics = json.load(f)
100 |         assert metrics == {'x': 0.5}
101 |
102 |     with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
103 |         params = json.load(f)
104 |         assert params == {'foo': 'bar'}
105 |
106 |     assert os.path.exists(os.path.join(tmpdir_name, 'np.npy'))
107 |     assert os.path.exists(os.path.join(tmpdir_name, 'df.f'))
108 |
109 |
110 | def test_experiment_duplicated_error(tmpdir_name):
111 |     with Experiment(tmpdir_name) as e:
112 |         e.log_metric('CV', 0.97)
113 |
114 |     with pytest.raises(ValueError):
115 |         with Experiment(tmpdir_name):
116 |             pass
117 |
118 |     with pytest.raises(ValueError):
119 |         with Experiment(tmpdir_name, if_exists='error'):
120 |             pass
121 |
122 |
123 | def test_experiment_duplicated_replace(tmpdir_name):
124 |     with Experiment(tmpdir_name) as e:
125 |         e.log_metric('CV', 0.97)
126 |
127 |     with Experiment(tmpdir_name, if_exists='replace') as e:
128 |         e.log_metric('LB', 0.95)
129 |
130 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
131 |         metrics = json.load(f)
132 |
133 |     # replaced by the new result
134 |     assert 'LB' in metrics
135 |     assert 'CV' not in metrics
136 |
137 |
138 | def test_experiment_duplicated_append(tmpdir_name):
139 |     with Experiment(tmpdir_name) as e:
140 |         e.log_metric('CV', 0.97)
141 |
142 |     with Experiment(tmpdir_name, if_exists='append') as e:
143 |         e.log_metric('LB', 0.95)
144 |
145 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
146 |         metrics = json.load(f)
147 |
148 |     # appended to the existing result
149 |     assert 'LB' in metrics
150 |     assert 'CV' in metrics
151 |
152 |
153 | def test_experiment_duplicated_rename(tmpdir_name):
154 |     with Experiment(tmpdir_name) as e:
155 |         e.log_metric('CV', 0.97)
156 |
157 |     with Experiment(tmpdir_name, if_exists='rename') as e:
158 |         e.log_metric('LB', 0.95)
159 |
160 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
161 |         metrics = json.load(f)
162 |         assert 'LB' not in metrics
163 |         assert 'CV' in metrics
164 |
165 |     with open(os.path.join(tmpdir_name + '_1', 'metrics.json')) as f:
166 |         metrics = json.load(f)
167 |         assert 'LB' in metrics
168 |         assert 'CV' not in metrics
169 |
170 |
171 | def test_experiment_duplicated_replace_mlflow(tmpdir_name):
172 |     import mlflow
173 |
174 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
175 |         e.log_metric('CV', 0.97)
176 |         run_id_old = e.mlflow_run_id
177 |
178 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='replace') as e:
179 |         e.log_metric('LB', 0.95)
180 |         run_id_new = e.mlflow_run_id
181 |
182 |     assert run_id_old != run_id_new
183 |
184 |     client = mlflow.tracking.MlflowClient()
185 |     old_run = client.get_run(run_id_old)
186 |     new_run = client.get_run(run_id_new)
187 |     assert old_run.info.lifecycle_stage == 'deleted'
188 |     assert new_run.info.lifecycle_stage == 'active'
189 |
190 |
191 | def test_experiment_duplicated_append_mlflow(tmpdir_name):
192 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
193 |         e.log_metric('CV', 0.97)
194 |         run_id_old = e.mlflow_run_id
195 |
196 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='append') as e:
197 |         e.log_metric('LB', 0.95)
198 |         run_id_new = e.mlflow_run_id
199 |
200 |     with open(os.path.join(tmpdir_name, 'metrics.json')) as f:
201 |         metrics = json.load(f)
202 |
203 |     # appended to the existing result
204 |     assert 'LB' in metrics
205 |     assert 'CV' in metrics
206 |
207 |     assert run_id_old == run_id_new
208 |
209 |     import mlflow
210 |     client = mlflow.tracking.MlflowClient()
211 |     old_run = client.get_run(run_id_old)
212 |     assert old_run.info.lifecycle_stage == 'active'
213 |
214 |
215 | def test_experiment_duplicated_rename_mlflow(tmpdir_name):
216 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
217 |         e.log_metric('CV', 0.97)
218 |         run_id_old = e.mlflow_run_id
219 |
220 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='rename') as e:
221 |         e.log_metric('LB', 0.95)
222 |         run_id_new = e.mlflow_run_id
223 |
224 |     assert run_id_old != run_id_new
225 |
226 |
227 | def test_experiment_continue(tmpdir_name):
228 |     with Experiment(tmpdir_name, with_mlflow=True) as e:
229 |         e.log_metric('CV', 0.97)
230 |
231 |     # appending to existing local & mlflow result
232 |     with Experiment.continue_from(tmpdir_name, with_mlflow=True) as e:
233 |         e.log_metric('LB', 0.95)
234 |
235 |         metric_file = os.path.join(tmpdir_name, 'metrics.json')
236 |
237 |         import mlflow
238 |
239 |         client = mlflow.tracking.MlflowClient()
240 |         data = client.get_run(mlflow.active_run().info.run_id).data
241 |         assert data.metrics['CV'] == 0.97
242 |         assert data.metrics['LB'] == 0.95
243 |
244 |     with open(metric_file, 'r') as f:
245 |         obj = json.load(f)
246 |         assert obj['CV'] == 0.97
247 |         assert obj['LB'] == 0.95
248 |
249 |     with Experiment(tmpdir_name, with_mlflow=True, if_exists='append') as e:
250 |         e.log_metric('X', 1.1)
251 |
252 |         import mlflow
253 |
254 |         client = mlflow.tracking.MlflowClient()
255 |         data = client.get_run(mlflow.active_run().info.run_id).data
256 |         assert data.metrics['CV'] == 0.97
257 |         assert data.metrics['LB'] == 0.95
258 |         assert data.metrics['X'] == 1.1
259 |
260 |     # stop logging to mlflow, still continue logging on local dir
261 |     with Experiment.continue_from(tmpdir_name, with_mlflow=False) as e:
262 |         e.log_metric('Y', 1.1)
263 |         import mlflow
264 |         assert mlflow.active_run() is None
265 |
266 |     with open(metric_file, 'r') as f:
267 |         obj = json.load(f)
268 |         assert 'Y' in obj
269 |
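Taken together, the tests above document the four if_exists modes of Experiment. A usage sketch built only from the behavior the tests assert (the directory name is illustrative):

from nyaggle.experiment import Experiment

# default behavior ('error'): reusing a directory that already holds logs raises ValueError
with Experiment('logs/run1') as e:
    e.log_metric('CV', 0.97)

# 'append' merges into the same metrics.json (and, with mlflow, the same run);
# 'replace' deletes the old run first; 'rename' logs to 'logs/run1_1' instead
with Experiment('logs/run1', if_exists='append') as e:
    e.log_metric('LB', 0.95)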
--------------------------------------------------------------------------------
/tests/experiment/test_hyperparameter_tuner.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn import datasets
3 |
4 | from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter
5 |
6 |
7 | def _check_parameter_tunes(params, x, y):
8 |     best_params = find_best_lgbm_parameter(params, x, y)
9 |     # parameters tuned by optuna's stepwise LightGBM tuner
10 |     tuned_params = {
11 |         'num_leaves', 'feature_fraction', 'bagging_fraction', 'bagging_freq',
12 |         'lambda_l1', 'lambda_l2', 'min_child_samples'
13 |     }
14 |     intersection = set(best_params.keys()) & tuned_params
15 |     assert intersection == tuned_params
16 |
17 |
18 | def test_regression_problem_parameter_tunes():
19 |     x, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
20 |     params = {
21 |         'objective': 'regression',
22 |         'metric': 'rmse',
23 |         'verbosity': -1,
24 |     }
25 |     _check_parameter_tunes(params, x, y)
26 |
27 |
28 | def test_binary_classification_parameter_tunes():
29 |     dataset = datasets.load_breast_cancer()
30 |     x = pd.DataFrame(dataset.data, columns=dataset.feature_names)
31 |     y = pd.Series(dataset.target)
32 |     params = {
33 |         'objective': 'binary',
34 |         'metric': 'binary_logloss',
35 |         'verbosity': -1,
36 |     }
37 |     _check_parameter_tunes(params, x, y)
38 |
39 |
40 | def test_multi_classification_parameter_tunes():
41 |     dataset = datasets.load_wine()
42 |     x = pd.DataFrame(dataset.data, columns=dataset.feature_names)
43 |     y = pd.Series(dataset.target)
44 |     params = {
45 |         'objective': 'multiclass',
46 |         'num_class': 3,
47 |         'verbosity': -1,
48 |     }
49 |     _check_parameter_tunes(params, x, y)
50 |
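The seven keys asserted in _check_parameter_tunes are exactly the parameters that optuna's stepwise LightGBM tuner searches, which suggests find_best_lgbm_parameter wraps it. A hedged sketch of using that tuner directly, assuming optuna's integration API rather than nyaggle's wrapper:

import lightgbm as lgb
from optuna.integration.lightgbm import LightGBMTunerCV

def tune(params, x, y):
    dtrain = lgb.Dataset(x, label=y)
    # stepwise-tunes num_leaves, feature/bagging fraction, lambdas, etc.
    tuner = LightGBMTunerCV(params, dtrain, nfold=5)
    tuner.run()
    return tuner.best_params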
--------------------------------------------------------------------------------
/tests/feature/category_encoder/test_target_encoder.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import category_encoders as ce
4 | import numpy.testing as npt
5 | import pandas as pd
6 | from pandas.testing import assert_frame_equal
7 | from sklearn.model_selection import KFold
8 |
9 | from nyaggle.feature.category_encoder import TargetEncoder
10 |
11 |
12 | def _test_target_encoder(X_train, y_train, X_test, **kw):
13 |     cv = KFold(n_splits=2, random_state=42, shuffle=True)
14 |
15 |     te = TargetEncoder(cv.split(X_train), **kw)
16 |
17 |     ret_train = te.fit_transform(X_train, y_train)
18 |     ret_test = te.transform(X_test)
19 |
20 |     ret_train2 = copy.deepcopy(X_train)
21 |     ret_test2 = copy.deepcopy(X_test)
22 |
23 |     for train_idx, test_idx in cv.split(X_train):
24 |         te2 = ce.TargetEncoder(**kw)
25 |
26 |         if isinstance(X_train, pd.DataFrame):
27 |             te2.fit(X_train.loc[train_idx, :], y_train.loc[train_idx])
28 |             ret_train2.loc[test_idx] = te2.transform(ret_train2.loc[test_idx])
29 |         else:
30 |             te2.fit(X_train[train_idx, :], y_train[train_idx])
31 |             ret_train2[test_idx] = te2.transform(ret_train2[test_idx])
32 |
33 |     ret_train2 = ret_train2.astype(float)
34 |
35 |     if isinstance(ret_train, pd.DataFrame):
36 |         assert_frame_equal(ret_train, ret_train2)
37 |     else:
38 |         npt.assert_array_equal(ret_train, ret_train2)
39 |
40 |     te2 = ce.TargetEncoder(**kw)
41 |     te2.fit(X_train, y_train)
42 |
43 |     ret_test2 = te2.transform(ret_test2)
44 |
45 |     if isinstance(ret_train, pd.DataFrame):
46 |         assert_frame_equal(ret_test, ret_test2)
47 |     else:
48 |         npt.assert_array_equal(ret_test, ret_test2)
49 |
50 |
51 | def test_target_encoder_fit_transform():
52 |     X_train = pd.DataFrame({
53 |         'x': ['A', 'A', 'A', 'B', 'B', 'C'],
54 |         'a': [1, 2, 3, 1, 2, 3]
55 |
56 |     })
57 |     y_train = pd.Series([0, 0, 1, 0, 1, 1])
58 |     X_test = pd.DataFrame({
59 |         'x': ['A', 'B', 'C', 'D'],
60 |         'a': [1, 2, 3, 4]
61 |     })
62 |
63 |     X = pd.concat([X_train, X_test])
64 |     y = pd.concat([y_train, pd.Series([None] * 4)]).astype(float)
65 |
66 |     ce1 = TargetEncoder(cols=['x'])
67 |     ce1.fit(X_train, y_train)
68 |     ret1 = ce1.transform(X_test)
69 |
70 |     ce2 = TargetEncoder(cols=['x'])
71 |     ret2 = ce2.fit_transform(X, y).iloc[6:, :]
72 |
73 |     assert_frame_equal(ret1, ret2)
74 |
75 |
76 | def test_target_encoder():
77 |     X_train = pd.DataFrame({
78 |         'x': ['A', 'A', 'A', 'B', 'B', 'C'],
79 |
80 |     })
81 |     y_train = pd.Series([0, 0, 1, 0, 1, 1])
82 |     X_test = pd.DataFrame({
83 |         'x': ['A', 'B', 'C', 'D']
84 |     })
85 |
86 |     _test_target_encoder(X_train, y_train, X_test)
87 |
88 |
89 | def test_target_encoder_ndarray():
90 |     X_train = pd.DataFrame({
91 |         'x': ['A', 'A', 'A', 'B', 'B', 'C'],
92 |
93 |     })
94 |     y_train = pd.Series([0, 0, 1, 0, 1, 1])
95 |     X_test = pd.DataFrame({
96 |         'x': ['A', 'B', 'C', 'D']
97 |     })
98 |
99 |     _test_target_encoder(X_train.values, y_train.values, X_test.values)
100 |
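The fold loop in _test_target_encoder replicates what nyaggle's TargetEncoder does internally: each row is encoded with target statistics computed on the other folds, so no row ever sees its own label. A pandas-only sketch of that K-fold scheme (hypothetical helper, not nyaggle's code):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def kfold_target_encode(x: pd.Series, y: pd.Series, n_splits: int = 5) -> pd.Series:
    encoded = pd.Series(np.nan, index=x.index)
    for train_idx, test_idx in KFold(n_splits, shuffle=True, random_state=0).split(x):
        # category -> mean target, computed on the training folds only
        means = y.iloc[train_idx].groupby(x.iloc[train_idx]).mean()
        encoded.iloc[test_idx] = x.iloc[test_idx].map(means).values
    return encoded.fillna(y.mean())  # unseen categories fall back to the global prior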
--------------------------------------------------------------------------------
/tests/feature/nlp/test_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import numpy.testing as npt
4 | import pandas as pd
5 | import pytest
6 | from pandas.testing import assert_frame_equal
7 |
8 | from nyaggle.feature.nlp import BertSentenceVectorizer
9 |
10 | _TEST_SENTENCE_EN = [
11 |     'This is a pen.',
12 |     'A quick brown fox',
13 |     'Redistribution and use in source and binary forms, with or without modification.',
14 |     'BERT is the state of the art NLP model.',
15 |     'This is a pen.',
16 |     'THIS IS A PEN.',
17 | ]
18 |
19 | _TEST_SENTENCE_JP = [
20 |     '金メダルが5枚欲しい。',
21 |     '私は昨日から風邪をひいています。',
22 |     'これはペンです。',
23 |     'BERTは最新の自然言語処理モデルです。',
24 |     '金メダルが5枚欲しい。',
25 |     '金メダルが 5枚 欲しい。',
26 | ]
27 |
28 |
29 | def _under_py35():
30 |     return not (sys.version_info.major == 3 and sys.version_info.minor >= 6)
31 |
32 |
33 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
34 | def test_bert_fit():
35 |     bert = BertSentenceVectorizer(use_cuda=False)
36 |
37 |     X = pd.DataFrame({
38 |         'id': [0, 1, 2, 3, 4, 5],
39 |         'sentence': _TEST_SENTENCE_EN
40 |     })
41 |
42 |     bert.fit(X)
43 |     ret = bert.transform(X)
44 |
45 |     assert ret.shape[0] == 6
46 |     assert ret.shape[1] == 768 + 1  # id + embed
47 |
48 |     ret.drop('id', axis=1, inplace=True)
49 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values)
50 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values)
51 |
52 |
53 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
54 | def test_bert_fit_transform():
55 |     X = pd.DataFrame({
56 |         'id': [0, 1, 2, 3, 4, 5],
57 |         'sentence': _TEST_SENTENCE_EN
58 |     })
59 |
60 |     bert = BertSentenceVectorizer(use_cuda=False)
61 |     ret = bert.fit_transform(X)
62 |
63 |     bert = BertSentenceVectorizer(use_cuda=False)
64 |     bert.fit(X)
65 |     ret2 = bert.fit_transform(X)
66 |
67 |     assert_frame_equal(ret, ret2)
68 |
69 |
70 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
71 | def test_bert_en_svd():
72 |     n_components = 3
73 |     bert = BertSentenceVectorizer(n_components=n_components, use_cuda=False)
74 |
75 |     X = pd.DataFrame({
76 |         'id': [0, 1, 2, 3, 4, 5],
77 |         'sentence': _TEST_SENTENCE_EN
78 |     })
79 |
80 |     ret = bert.fit_transform(X)
81 |
82 |     assert ret.shape[0] == 6
83 |     assert ret.shape[1] == n_components + 1
84 |
85 |     ret.drop('id', axis=1, inplace=True)
86 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values, decimal=3)
87 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values, decimal=3)
88 |
89 |
90 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
91 | def test_bert_en_svd_multicol():
92 |     bert = BertSentenceVectorizer(use_cuda=False)
93 |
94 |     X = pd.DataFrame({
95 |         'id': [0, 1, 2, 3, 4, 5],
96 |         'sentence': _TEST_SENTENCE_EN,
97 |         'sentence2': _TEST_SENTENCE_EN
98 |     })
99 |
100 |     ret = bert.fit_transform(X)
101 |
102 |     assert ret.shape[0] == 6
103 |     assert ret.shape[1] == 2 * 768 + 1
104 |
105 |     ret.drop('id', axis=1, inplace=True)
106 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values, decimal=3)
107 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values, decimal=3)
108 |
109 |
110 | @pytest.mark.skipif(_under_py35(), reason="BertSentenceVectorizer is not supported under Python <= 3.5")
111 | def test_bert_jp():
112 |     bert = BertSentenceVectorizer(use_cuda=False, lang='jp')
113 |
114 |     X = pd.DataFrame({
115 |         'id': [0, 1, 2, 3, 4, 5],
116 |         'sentence': _TEST_SENTENCE_JP
117 |     })
118 |
119 |     ret = bert.fit_transform(X)
120 |
121 |     assert ret.shape[0] == 6
122 |     assert ret.shape[1] == 768 + 1
123 |
124 |     ret.drop('id', axis=1, inplace=True)
125 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values)
126 |     npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values)
127 |
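What these tests pin down: one 768-dimensional vector per sentence (the BERT-base hidden size), one block of columns per text column, optional SVD compression to n_components, and lang='jp' switching to a Japanese BERT model. A hedged sketch of the underlying computation with huggingface transformers; the checkpoint and pooling nyaggle actually uses may differ:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def sentence_vector(text: str) -> torch.Tensor:
    tokens = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state  # (1, seq_len, 768)
    return hidden.mean(dim=1).squeeze(0)            # mean-pool to a 768-dim vector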
--------------------------------------------------------------------------------
/tests/feature/test_groupby.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from sklearn import datasets
5 |
6 | from nyaggle.feature.groupby import aggregation
7 |
8 |
9 | @pytest.fixture
10 | def iris_dataframe():
11 |     iris = datasets.load_iris()
12 |     df = pd.DataFrame(np.concatenate([iris.data,
13 |                                       iris.target.reshape((iris.target.shape[0], 1))], axis=1))
14 |     df.columns = ['sl', 'sw', 'pl', 'pw', 'species']
15 |     group_key = 'species'
16 |     group_values = ['sl', 'sw', 'pl', 'pw']
17 |     return df, group_key, group_values
18 |
19 |
20 | def custom_function(x):
21 |     return np.sum(x)
22 |
23 |
24 | def test_return_type_by_aggregation(iris_dataframe):
25 |     df, group_key, group_values = iris_dataframe
26 |     agg_methods = ["max", np.sum, custom_function]
27 |     new_df, new_cols = aggregation(df, group_key, group_values,
28 |                                    agg_methods)
29 |     assert isinstance(new_df, pd.DataFrame)
30 |     assert isinstance(new_cols, list)
31 |
32 |
33 | @pytest.mark.parametrize('agg_method', [[int], [lambda x: np.max(x)]])
34 | def test_assert_by_aggregation(iris_dataframe, agg_method):
35 |     df, group_key, group_values = iris_dataframe
36 |     with pytest.raises(ValueError):
37 |         aggregation(df, group_key, group_values, agg_method)
38 |
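aggregation() as covered above accepts string, numpy, or plain-callable aggregators, rejects aggregators it cannot derive a stable column name from (the parametrized test shows types and lambdas raising ValueError), and returns both the widened frame and the list of new column names. A pandas-only sketch of that shape (hypothetical re-implementation; the column naming is illustrative):

import pandas as pd

def aggregate(df: pd.DataFrame, key: str, values: list, methods: list):
    new_cols = []
    for m in methods:
        name = m if isinstance(m, str) else m.__name__
        # one aggregated column per value column, broadcast back by group key
        agg = df.groupby(key)[values].agg(m)
        agg.columns = [f'agg_{name}_{v}_grpby_{key}' for v in values]
        new_cols += list(agg.columns)
        df = df.merge(agg, how='left', left_on=key, right_index=True)
    return df, new_cols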
--------------------------------------------------------------------------------
/tests/feature_store/test_feature_store.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 | from pandas.testing import assert_frame_equal
7 |
8 | import nyaggle.feature_store as fs
9 | from nyaggle.testing import get_temp_directory
10 |
11 |
12 | def test_save_feature():
13 |     df = pd.DataFrame()
14 |
15 |     df['a'] = np.arange(100)
16 |
17 |     with get_temp_directory() as tmp:
18 |         fs.save_feature(df, 0, tmp)
19 |
20 |         assert os.path.exists(os.path.join(tmp, '0.f'))
21 |
22 |
23 | def test_load_feature():
24 |     df = pd.DataFrame()
25 |
26 |     df['a'] = np.arange(100)
27 |
28 |     with get_temp_directory() as tmp:
29 |         fs.save_feature(df, 0, tmp)
30 |
31 |         df_loaded = fs.load_feature(0, tmp)
32 |         assert_frame_equal(df, df_loaded)
33 |
34 |
35 | def test_multi_columns():
36 |     df = pd.DataFrame()
37 |
38 |     df['a'] = np.arange(100)
39 |     df['b'] = None
40 |
41 |     with get_temp_directory() as tmp:
42 |         fs.save_feature(df, 0, tmp)
43 |
44 |         df_loaded = fs.load_feature(0, tmp)
45 |         assert_frame_equal(df, df_loaded)
46 |
47 |
48 | def test_various_dtypes():
49 |     df = pd.DataFrame()
50 |
51 |     df['a'] = np.arange(100).astype(float)
52 |     df['b'] = np.arange(100).astype(int)
53 |     df['c'] = np.arange(100).astype(np.uint8)
54 |     df['d'] = np.arange(100).astype(np.uint16)
55 |     df['e'] = np.arange(100).astype(np.uint32)
56 |     df['f'] = np.arange(100).astype(np.int8)
57 |     df['g'] = np.arange(100).astype(np.int16)
58 |     df['h'] = np.arange(100).astype(np.int32)
59 |     df['i'] = np.arange(100).astype(np.int64)
60 |
61 |     with get_temp_directory() as tmp:
62 |         fs.save_feature(df, 0, tmp)
63 |
64 |         df_loaded = fs.load_feature(0, tmp)
65 |         assert_frame_equal(df, df_loaded)
66 |
67 |
68 | def test_load_features():
69 |     df = pd.DataFrame()
70 |
71 |     df['a'] = np.arange(100).astype(float)
72 |     df['b'] = np.arange(100).astype(int)
73 |     df['c'] = np.arange(100).astype(int)
74 |
75 |     with get_temp_directory() as tmp:
76 |         fs.save_feature(df[['b']], 0, tmp)
77 |         fs.save_feature(df[['c']], 1, tmp)
78 |
79 |         df_loaded = fs.load_features(df[['a']], [0, 1], tmp)
80 |         assert_frame_equal(df, df_loaded)
81 |
82 |
83 | def test_load_features_no_base():
84 |     df = pd.DataFrame()
85 |
86 |     df['a'] = np.arange(100).astype(float)
87 |     df['b'] = np.arange(100).astype(int)
88 |     df['c'] = np.arange(100).astype(int)
89 |
90 |     with get_temp_directory() as tmp:
91 |         fs.save_feature(df[['b']], 0, tmp)
92 |         fs.save_feature(df[['c']], 1, tmp)
93 |         fs.save_feature(df[['a']], '2', tmp)
94 |
95 |         df_loaded = fs.load_features(None, [0, 1, '2'], tmp)
96 |         assert list(df_loaded.columns) == ['b', 'c', 'a']
97 |
98 |
99 | def test_load_feature_ignore_columns():
100 |     df = pd.DataFrame()
101 |
102 |     df['a'] = np.arange(100).astype(float)
103 |     df['b'] = np.arange(100).astype(int)
104 |     df['c'] = np.arange(100).astype(int)
105 |
106 |     with get_temp_directory() as tmp:
107 |         fs.save_feature(df, 0, tmp)
108 |
109 |         # just skip irrelevant column names
110 |         df_loaded = fs.load_feature(0, tmp, ignore_columns=['b', 'X'])
111 |
112 |         assert_frame_equal(df_loaded, df.drop('b', axis=1))
113 |
114 |
115 | def test_load_feature_ignore_all_columns():
116 |     df = pd.DataFrame()
117 |
118 |     df['a'] = np.arange(100).astype(float)
119 |     df['b'] = np.arange(100).astype(int)
120 |     df['c'] = np.arange(100).astype(int)
121 |
122 |     with get_temp_directory() as tmp:
123 |         fs.save_feature(df, 0, tmp)
124 |
125 |         df_loaded = fs.load_feature(0, tmp, ignore_columns=['a', 'b', 'c', 'X'])
126 |
127 |         assert_frame_equal(df_loaded, df.drop(['a', 'b', 'c'], axis=1))
128 |
129 |
130 | def test_load_features_duplicate_col_name():
131 |     df = pd.DataFrame()
132 |
133 |     df['a'] = np.arange(100).astype(float)
134 |     df['b'] = np.arange(100).astype(int)
135 |     df['c'] = np.arange(100).astype(int)
136 |
137 |     with get_temp_directory() as tmp:
138 |         fs.save_feature(df[['a', 'b']], 0, tmp)
139 |         fs.save_feature(df[['b', 'c']], 1, tmp)
140 |         fs.save_feature(df[['b', 'a']], 'X', tmp)
141 |
142 |         df_loaded = fs.load_features(None, [0, 1, 'X'], tmp, rename_duplicate=True)
143 |         assert list(df_loaded.columns) == ['a', 'b', 'b_1', 'c', 'b_X', 'a_X']
144 |
145 |         df_loaded = fs.load_features(None, [0, 1, 'X'], tmp, rename_duplicate=False)
146 |         assert list(df_loaded.columns) == ['a', 'b', 'b', 'c', 'b', 'a']
147 |
148 |
149 | def test_invalid_feature():
150 |     df = pd.DataFrame({
151 |         'a': [1, 2, 3, 4, 5] + [None] * 5,
152 |         'b': np.random.randint(0, 10, size=10)
153 |     })
154 |     y = pd.Series([1, 0, 1, 0, 1])
155 |
156 |     with get_temp_directory() as tmp:
157 |         with pytest.raises(RuntimeError):
158 |             fs.save_feature(df[['a']], 0, reference_target_variable=y, directory=tmp)
159 |         with pytest.raises(RuntimeError):
160 |             fs.save_feature(df, 0, reference_target_variable=y, directory=tmp)
161 |
162 |         # ok
163 |         fs.save_feature(df[['b']], 0, reference_target_variable=y, directory=tmp)
164 |
165 |
166 | def test_feature_exists():
167 |     df = pd.DataFrame({
168 |         'a': [1, 2, 3, 4, 5] + [None] * 5
169 |     })
170 |
171 |     with get_temp_directory() as tmp:
172 |         fs.save_feature(df[['a']], 0, directory=tmp)
173 |         with pytest.raises(RuntimeError):
174 |             fs.save_feature(df, 0, overwrite=False, directory=tmp)
175 |
176 |
177 | def test_decorator():
178 |     with get_temp_directory() as tmp:
179 |         @fs.cached_feature('x', tmp)
180 |         def make_feature_x():
181 |             return pd.DataFrame({'a': [1, 2, 3, 4, 5]})
182 |
183 |         @fs.cached_feature('y', tmp)
184 |         def make_feature_y(n: int):
185 |             return pd.DataFrame({'b': np.arange(n)})
186 |
187 |         x = make_feature_x()
188 |         assert make_feature_x.__name__ == "make_feature_x"
189 |         assert os.path.exists(os.path.join(tmp, "x.f"))
190 |         x2 = make_feature_x()
191 |         assert_frame_equal(x, x2)
192 |
193 |         y = make_feature_y(100)
194 |         assert len(y) == 100
195 |         assert os.path.exists(os.path.join(tmp, "y.f"))
196 |         y2 = make_feature_y(100)
197 |         assert_frame_equal(y, y2)
198 |
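A typical flow for the feature store, assembled from the calls the tests above make (the paths and feature ids are illustrative):

import pandas as pd
import nyaggle.feature_store as fs

# save each feature block once, keyed by an int or str id
fs.save_feature(pd.DataFrame({'f1': [1, 2, 3]}), 0, './features/')
fs.save_feature(pd.DataFrame({'f2': [4, 5, 6]}), 'f2', './features/')

# later: join the stored blocks onto a base frame in one call
base = pd.DataFrame({'target': [0, 1, 0]})
train = fs.load_features(base, [0, 'f2'], './features/')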
--------------------------------------------------------------------------------
/tests/validation/test_adversarial_validate.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split
2 |
3 | from nyaggle.testing import make_classification_df
4 | from nyaggle.validation import adversarial_validate
5 |
6 |
7 | def test_adv():
8 |     X, y = make_classification_df(1024)
9 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
10 |
11 |     X_train['target'] = 0
12 |     X_test['target'] = 1
13 |
14 |     auc, importance = adversarial_validate(X_train, X_test)
15 |
16 |     assert importance['feature'][0] == 'target'
17 |     assert auc >= 0.9
18 |
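The test above relies on the core trick of adversarial validation: label train rows 0 and test rows 1, fit a classifier, and read the result. An AUC near 0.5 means the two sets are indistinguishable; here the injected 'target' column perfectly separates them, so it drives the AUC above 0.9 and tops the importance list. A hedged sklearn-only sketch of the procedure:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def adversarial_auc(X_train: pd.DataFrame, X_test: pd.DataFrame) -> float:
    X = pd.concat([X_train, X_test], ignore_index=True)
    y = np.r_[np.zeros(len(X_train)), np.ones(len(X_test))]
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    # mean CV AUC of the train-vs-test classifier
    return cross_val_score(clf, X, y, cv=5, scoring='roc_auc').mean()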
--------------------------------------------------------------------------------
/tests/validation/test_cross_validate.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import numpy as np
4 |
5 | from catboost import CatBoostClassifier
6 | from lightgbm import LGBMClassifier
7 | from sklearn.datasets import make_classification, make_regression
8 | from sklearn.linear_model import RidgeClassifier, Ridge
9 | from sklearn.metrics import roc_auc_score, r2_score
10 | from sklearn.model_selection import train_test_split, KFold
11 |
12 | from nyaggle.experiment import autoprep_gbdt
13 | from nyaggle.testing import make_classification_df
14 | from nyaggle.validation import cross_validate, Take
15 |
16 |
17 | def test_cv_sklearn_binary():
18 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
19 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
20 |
21 |     model = RidgeClassifier(alpha=1.0)
22 |
23 |     pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)
24 |
25 |     assert len(scores) == 5 + 1
26 |     assert scores[-1] >= 0.85  # overall auc
27 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
28 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test score
29 |
30 |
31 | def test_cv_sklearn_regression():
32 |     X, y = make_regression(n_samples=1024, n_features=20, random_state=0)
33 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
34 |
35 |     model = Ridge(alpha=1.0)
36 |
37 |     pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=r2_score)
38 |
39 |     print(scores)
40 |     assert len(scores) == 5 + 1
41 |     assert scores[-1] >= 0.95  # overall r2
42 |     assert r2_score(y_train, pred_oof) == scores[-1]
43 |     assert r2_score(y_test, pred_test) >= 0.95  # test r2
44 |
45 |
46 | def test_cv_lgbm():
47 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
48 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
49 |
50 |     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
51 |
52 |     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
53 |                                                              eval_func=roc_auc_score,
54 |                                                              fit_params={'early_stopping_rounds': 200})
55 |
56 |     print(scores)
57 |     assert len(scores) == 5 + 1
58 |     assert scores[-1] >= 0.85  # overall roc_auc
59 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
60 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
61 |     assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85  # make sure models are trained
62 |     assert len(importance) == 5
63 |     assert list(importance[0].columns) == ['feature', 'importance']
64 |     assert len(importance[0]) == 20
65 |
66 |
67 | def test_cv_lgbm_df():
68 |     X, y = make_classification_df(n_samples=1024, n_num_features=20, n_cat_features=1, class_sep=0.98, random_state=0)
69 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
70 |
71 |     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
72 |
73 |     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
74 |                                                              eval_func=roc_auc_score)
75 |
76 |     print(scores)
77 |     assert len(scores) == 5 + 1
78 |     assert scores[-1] >= 0.85  # overall roc_auc
79 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
80 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
81 |     assert roc_auc_score(y_test, models[0].predict_proba(X_test)[:, 1]) >= 0.85  # make sure models are trained
82 |     assert len(importance) == 5
83 |     assert list(importance[0].columns) == ['feature', 'importance']
84 |     assert len(importance[0]) == 20 + 1
85 |     assert models[0].booster_.num_trees() < 300  # making sure early stopping worked
86 |
87 |
88 | def test_cv_cat_df():
89 |     X, y = make_classification_df(n_samples=1024, n_num_features=20, n_cat_features=1, class_sep=0.98, random_state=0)
90 |     X, _ = autoprep_gbdt('cat', X, None)
91 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
92 |
93 |     models = [CatBoostClassifier(n_estimators=300) for _ in range(5)]
94 |
95 |     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
96 |                                                              eval_func=roc_auc_score,
97 |                                                              fit_params={'cat_features': ['cat_0']})
98 |
99 |     print(scores)
100 |     assert len(scores) == 5 + 1
101 |     assert scores[-1] >= 0.85  # overall roc_auc
102 |     assert roc_auc_score(y_train, pred_oof) == scores[-1]
103 |     assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
104 |     assert roc_auc_score(y_test, models[0].predict_proba(X_test)[:, 1]) >= 0.85  # make sure models are trained
105 |     assert len(importance) == 5
106 |     assert list(importance[0].columns) == ['feature', 'importance']
107 |     assert len(importance[0]) == 20 + 1
108 |     assert models[0].tree_count_ < 300  # making sure early stopping worked
109 |
110 |
111 | def test_cv_partial_evaluate():
112 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
113 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
114 |
115 |     model = RidgeClassifier(alpha=1.0)
116 |
117 |     n = 0
118 |
119 |     def _fold_count(*args):
120 |         nonlocal n
121 |         n += 1
122 |
123 |     cv = Take(2, KFold(5))
124 |
125 |     pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=cv, eval_func=roc_auc_score,
126 |                                                     on_each_fold=_fold_count)
127 |
128 |     assert len(scores) == 2 + 1
129 |     assert scores[-1] >= 0.8  # overall auc
130 |     assert n == 2
131 |
132 |
133 | def test_fit_params_callback():
134 |     X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
135 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
136 |
137 |     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
138 |
139 |     sample_weights = np.random.randint(1, 10, size=len(X_train))
140 |     sample_weights = sample_weights / sample_weights.sum()
141 |
142 |     def fit_params(n: int, train_index: List[int], valid_index: List[int]):
143 |         return {
144 |             'early_stopping_rounds': 100,
145 |             'sample_weight': list(sample_weights[train_index]),
146 |             'eval_sample_weight': [list(sample_weights[valid_index])]
147 |         }
148 |
149 |     result_w_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
150 |                                      eval_func=roc_auc_score, fit_params=fit_params)
151 |
152 |     result_wo_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
153 |                                       eval_func=roc_auc_score, fit_params={'early_stopping_rounds': 50})
154 |
155 |     assert result_w_weight.scores[-1] != result_wo_weight.scores[-1]
156 |
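Underneath all of these tests is the standard out-of-fold loop: train one model per fold, predict the held-out fold into pred_oof, and average the per-fold test predictions into pred_test. scores carries one entry per fold plus the overall score last, which is why every test checks len(scores) == cv + 1 and reads scores[-1]. A hedged sketch of that loop for binary classification, assuming plain numpy arrays:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

def oof_predict(model, X, y, X_test, n_splits=5):
    oof = np.zeros(len(X))
    test = np.zeros(len(X_test))
    for train_idx, valid_idx in KFold(n_splits).split(X):
        m = clone(model)
        m.fit(X[train_idx], y[train_idx])
        oof[valid_idx] = m.predict_proba(X[valid_idx])[:, 1]
        test += m.predict_proba(X_test)[:, 1] / n_splits  # fold-averaged test preds
    return oof, test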
--------------------------------------------------------------------------------
/tests/validation/test_split.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | import pytest
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn.model_selection import KFold
7 |
8 | import nyaggle.validation.split as split
9 |
10 |
11 | def _random_uniform_dates(start_date: str, n_days: int, size: int):
12 |     return pd.to_datetime(start_date) + pd.to_timedelta(np.random.randint(0, n_days, size=size), 'd')
13 |
14 |
15 | def test_take():
16 |     df = pd.DataFrame()
17 |     df['id'] = np.arange(10)
18 |
19 |     folds = split.Take(2, KFold(5)).split(df)
20 |
21 |     train_index, test_index = next(folds)
22 |     assert np.array_equal(test_index, np.array([0, 1]))
23 |
24 |     train_index, test_index = next(folds)
25 |     assert np.array_equal(test_index, np.array([2, 3]))
26 |
27 |     with pytest.raises(StopIteration):
28 |         next(folds)
29 |
30 |
31 | def test_take_over():
32 |     df = pd.DataFrame()
33 |     df['id'] = np.arange(10)
34 |
35 |     # k > base_validator.n_splits
36 |     folds = split.Take(3, KFold(2)).split(df)
37 |
38 |     train_index, test_index = next(folds)
39 |     assert np.array_equal(test_index, np.array([0, 1, 2, 3, 4]))
40 |
41 |     train_index, test_index = next(folds)
42 |     assert np.array_equal(test_index, np.array([5, 6, 7, 8, 9]))
43 |
44 |     with pytest.raises(StopIteration):
45 |         next(folds)
46 |
47 |
48 | def test_skip():
49 |     df = pd.DataFrame()
50 |     df['id'] = np.arange(10)
51 |
52 |     kf = split.Skip(2, KFold(5))
53 |     folds = kf.split(df)
54 |
55 |     assert kf.get_n_splits() == 3
56 |
57 |     train_index, test_index = next(folds)
58 |     assert np.array_equal(test_index, np.array([4, 5]))
59 |
60 |     train_index, test_index = next(folds)
61 |     assert np.array_equal(test_index, np.array([6, 7]))
62 |
63 |     train_index, test_index = next(folds)
64 |     assert np.array_equal(test_index, np.array([8, 9]))
65 |
66 |     with pytest.raises(StopIteration):
67 |         next(folds)
68 |
69 |
70 | def test_nth():
71 |     df = pd.DataFrame()
72 |     df['id'] = np.arange(10)
73 |
74 |     kf = split.Nth(3, KFold(5))
75 |     folds = kf.split(df)
76 |
77 |     assert kf.get_n_splits() == 1
78 |
79 |     train_index, test_index = next(folds)
80 |     assert np.array_equal(test_index, np.array([4, 5]))
81 |
82 |     with pytest.raises(StopIteration):
83 |         next(folds)
84 |
85 |     kf = split.Nth(1, KFold(5))
86 |     folds = kf.split(df)
87 |
88 |     assert kf.get_n_splits() == 1
89 |
90 |     train_index, test_index = next(folds)
91 |     assert np.array_equal(test_index, np.array([0, 1]))
92 |
93 |     with pytest.raises(StopIteration):
94 |         next(folds)
95 |
96 |
97 | def test_time_series_split():
98 |     df = pd.DataFrame()
99 |     df['time'] = pd.date_range(start='2018/1/1', periods=5)
100 |
101 |     folds = split.TimeSeriesSplit('time',
102 |                                   [(('2018-01-01', '2018-01-02'), ('2018-01-02', '2018-01-04')),
103 |                                    (('2018-01-02', '2018-01-03'), ('2018-01-04', '2018-01-06'))])
104 |
105 |     assert folds.get_n_splits() == 2
106 |
107 |     splits = folds.split(df)
108 |
109 |     train_index, test_index = next(splits)
110 |     assert np.array_equal(train_index, np.array([0]))
111 |     assert np.array_equal(test_index, np.array([1, 2]))
112 |
113 |     train_index, test_index = next(splits)
114 |     assert np.array_equal(train_index, np.array([1]))
115 |     assert np.array_equal(test_index, np.array([3, 4]))
116 |
117 |     with pytest.raises(StopIteration):
118 |         next(splits)
119 |
120 |
121 | def test_time_series_open_range():
122 |     df = pd.DataFrame()
123 |     df['x'] = [1, 2, 3, 4, 5]
124 |     df['time'] = pd.date_range(start='2018/1/1', periods=5)
125 |
126 |     folds = split.TimeSeriesSplit(df['time'],
127 |                                   [((None, '2018-01-03'), ('2018-01-03', None))])
128 |     splits = folds.split(df)
129 |
130 |     train_index, test_index = next(splits)
131 |     assert np.array_equal(train_index, np.array([0, 1]))
132 |     assert np.array_equal(test_index, np.array([2, 3, 4]))
133 |
134 |
135 | def test_time_series_add_folds():
136 |     df = pd.DataFrame()
137 |     df['x'] = [1, 2, 3, 4, 5]
138 |     df['time'] = pd.date_range(start='2018/1/1', periods=5)
139 |
140 |     folds = split.TimeSeriesSplit(df['time'])
141 |
142 |     assert folds.get_n_splits() == 0
143 |
144 |     folds.add_fold((None, '2018-01-03'), ('2018-01-03', None))
145 |
146 |     assert folds.get_n_splits() == 1
147 |
148 |
149 | def test_sliding_window_split():
150 |     window = split.SlidingWindowSplit('time',
151 |                                       train_from='2018-01-20',
152 |                                       train_to='2018-01-23',
153 |                                       test_from='2018-01-27',
154 |                                       test_to='2018-01-31',
155 |                                       n_windows=3,
156 |                                       stride=pd.to_timedelta(2, 'd'))
157 |
158 |     #        train            test
159 |     # fold1: 01/16 - 01/19    01/23 - 01/27  (backtest 2)
160 |     # fold2: 01/18 - 01/21    01/25 - 01/29  (backtest 1)
161 |     # fold3: 01/20 - 01/23    01/27 - 01/31  (base window)
162 |
163 |     expected = [
164 |         ((datetime(2018, 1, 16), datetime(2018, 1, 19)), (datetime(2018, 1, 23), datetime(2018, 1, 27))),
165 |         ((datetime(2018, 1, 18), datetime(2018, 1, 21)), (datetime(2018, 1, 25), datetime(2018, 1, 29))),
166 |         ((datetime(2018, 1, 20), datetime(2018, 1, 23)), (datetime(2018, 1, 27), datetime(2018, 1, 31)))
167 |     ]
168 |
169 |     assert window.times == expected
170 |
171 |
172 | def test_stratified_group_kfold_one_class_per_grp():
173 |     sgf = split.StratifiedGroupKFold(2, shuffle=False)
174 |
175 |     df = pd.DataFrame()
176 |     df['group'] = [1, 1, 2, 2, 3, 3, 4, 4]
177 |     df['y'] = [0, 0, 1, 1, 0, 0, 1, 1]
178 |     df['x'] = [0, 1, 2, 3, 4, 5, 6, 7]
179 |
180 |     assert sgf.get_n_splits(df, df['y'], df['group']) == 2
181 |
182 |     splits = sgf.split(df, df['y'], df['group'])
183 |
184 |     train_index, test_index = next(splits)
185 |     assert np.array_equal(train_index, np.array([2, 3, 4, 5]))
186 |     assert np.array_equal(test_index, np.array([0, 1, 6, 7]))
187 |
188 |     train_index, test_index = next(splits)
189 |     assert np.array_equal(train_index, np.array([0, 1, 6, 7]))
190 |     assert np.array_equal(test_index, np.array([2, 3, 4, 5]))
191 |
192 |
193 | def test_stratified_group_kfold_multi_class_per_fold():
194 |     sgf = split.StratifiedGroupKFold(2, shuffle=False)
195 |
196 |     df = pd.DataFrame()
197 |     df['group'] = [1, 1, 2, 2, 3, 3, 4, 4]
198 |     df['y'] = [0, 1, 0, 1, 1, 1, 1, 1]
199 |     df['x'] = [0, 1, 2, 3, 4, 5, 6, 7]
200 |
201 |     assert sgf.get_n_splits(df, df['y'], df['group']) == 2
202 |
203 |     splits = sgf.split(df, df['y'], df['group'])
204 |
205 |     train_index, test_index = next(splits)
206 |     assert np.array_equal(train_index, np.array([0, 1, 4, 5]))
207 |     assert np.array_equal(test_index, np.array([2, 3, 6, 7]))
208 |
209 |     train_index, test_index = next(splits)
210 |     assert np.array_equal(train_index, np.array([2, 3, 6, 7]))
211 |     assert np.array_equal(test_index, np.array([0, 1, 4, 5]))
212 |
213 |
214 | def test_stratified_group_kfold_imbalanced_group():
215 |     sgf = split.StratifiedGroupKFold(2, shuffle=False)
216 |
217 |     df = pd.DataFrame()
218 |     df['group'] = [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4]
219 |     df['y'] = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
220 |     df['x'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
221 |
222 |     assert sgf.get_n_splits(df, df['y'], df['group']) == 2
223 |
224 |     splits = sgf.split(df, df['y'], df['group'])
225 |
226 |     train_index, test_index = next(splits)
227 |     assert np.array_equal(train_index, np.array([8, 9, 10, 11]))
228 |     assert np.array_equal(test_index, np.array([0, 1, 2, 3, 4, 5, 6, 7]))
229 |
230 |     train_index, test_index = next(splits)
231 |     assert np.array_equal(train_index, np.array([0, 1, 2, 3, 4, 5, 6, 7]))
232 |     assert np.array_equal(test_index, np.array([8, 9, 10, 11]))
233 |
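Take, Skip, and Nth as tested above are thin adapters that slice another validator's fold stream, which is what makes Take(2, KFold(5)) useful for cheap partial evaluation (see test_cv_partial_evaluate earlier). A hypothetical re-implementation of Take, just to show the shape:

import itertools

class Take:
    def __init__(self, n, base_validator):
        self.n = n
        self.base = base_validator

    def get_n_splits(self, X=None, y=None, groups=None):
        return min(self.n, self.base.get_n_splits(X, y, groups))

    def split(self, X, y=None, groups=None):
        # yield only the first n folds of the wrapped validator
        return itertools.islice(self.base.split(X, y, groups), self.n)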
--------------------------------------------------------------------------------