├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── VERSION.txt
├── codecov.yml
├── docs
│   ├── Makefile
│   ├── authors.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── features.rst
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── introduction.rst
│   ├── logos
│   │   ├── logo.png
│   │   ├── small_logo.ico
│   │   └── small_logo.png
│   ├── make.bat
│   └── webinars
│       ├── auto-ML.pdf
│       └── features.pdf
├── examples
│   ├── classification
│   │   ├── classification.py
│   │   ├── example.ipynb
│   │   ├── test_classification.csv
│   │   └── train_classification.csv
│   └── regression
│       ├── example.ipynb
│       ├── regression.py
│       ├── test_regression.csv
│       └── train_regression.csv
├── mlbox
│   ├── __init__.py
│   ├── encoding
│   │   ├── __init__.py
│   │   ├── categorical_encoder.py
│   │   └── na_encoder.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   ├── classifier.py
│   │   │   ├── feature_selector.py
│   │   │   └── stacking_classifier.py
│   │   └── regression
│   │       ├── __init__.py
│   │       ├── feature_selector.py
│   │       ├── regressor.py
│   │       └── stacking_regressor.py
│   ├── optimisation
│   │   ├── __init__.py
│   │   └── optimiser.py
│   ├── prediction
│   │   ├── __init__.py
│   │   └── predictor.py
│   └── preprocessing
│       ├── __init__.py
│       ├── drift
│       │   ├── __init__.py
│       │   ├── drift_estimator.py
│       │   └── drift_threshold.py
│       ├── drift_thresholder.py
│       └── reader.py
├── requirements.txt
├── setup.py
└── tests
    ├── .DS_Store
    ├── __init__.py
    ├── data_for_tests
    │   ├── clean_target.csv
    │   ├── clean_test.csv
    │   ├── clean_train.csv
    │   ├── inplace_test.csv
    │   ├── inplace_train.csv
    │   ├── test.csv
    │   ├── test_regression.csv
    │   ├── train.csv
    │   ├── train.h5
    │   ├── train.json
    │   ├── train.xls
    │   └── train_regression.csv
    ├── test_categorical_encoder.py
    ├── test_classification_feature_selector.py
    ├── test_classifier.py
    ├── test_drift_estimator.py
    ├── test_drift_threshold.py
    ├── test_drift_thresholder.py
    ├── test_na_encoder.py
    ├── test_optimiser.py
    ├── test_predictor.py
    ├── test_reader.py
    ├── test_regression_feature_selector.py
    ├── test_regressor.py
    ├── test_stacking_classifer.py
    └── test_stacking_regressor.py
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .pytest_cache/
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pycharm
76 | .idea
77 | .DS_Store
78 |
79 | # pyenv
80 | .python-version
81 |
82 | # celery beat schedule file
83 | celerybeat-schedule
84 |
85 | # SageMath parsed files
86 | *.sage.py
87 |
88 | # dotenv
89 | .env
90 |
91 | # virtualenv
92 | .venv
93 | venv/
94 | ENV/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 |
109 | # save folders
110 | *save/
111 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | matrix:
3 | include:
4 | - os: linux
5 | python: '3.5'
6 | - os: linux
7 | python: '3.6'
8 | - os: linux
9 | python: '3.7'
10 | - os: osx
11 | language: generic
12 | python: '3.5'
13 | before_install:
14 | - brew install libomp
15 | - brew upgrade pyenv
16 | - brew install pyenv-virtualenv
17 | - pyenv install 3.5.6
18 | - eval "$(pyenv init -)"
19 | - pyenv virtualenv 3.5.6 venv
20 | - pyenv activate venv
21 | - os: osx
22 | language: generic
23 | python: '3.6'
24 | before_install:
25 | - brew install libomp
26 | - brew upgrade pyenv
27 | - brew install pyenv-virtualenv
28 | - pyenv install 3.6.7
29 | - eval "$(pyenv init -)"
30 | - pyenv virtualenv 3.6.7 venv
31 | - pyenv activate venv
32 | - os: osx
33 | language: generic
34 | python: '3.7'
35 | before_install:
36 | - brew install libomp
37 | - brew upgrade pyenv
38 | - brew install pyenv-virtualenv
39 | - pyenv install 3.7.2
40 | - eval "$(pyenv init -)"
41 | - pyenv virtualenv 3.7.2 venv
42 | - pyenv activate venv
43 | - os: windows
44 | language: sh
45 | python: '3.5'
46 | before_install:
47 | - choco install python --version 3.5.4
48 | - export PATH="/c/Python35:/c/Python35/Scripts:$PATH"
49 | - os: windows
50 | language: sh
51 | python: '3.6'
52 | before_install:
53 | - choco install python --version 3.6.7
54 | - export PATH="/c/Python36:/c/Python36/Scripts:$PATH"
55 | - os: windows
56 | language: sh
57 | python: '3.7'
58 | before_install:
59 | - choco install python --version 3.7.2
60 | - export PATH="/c/Python37:/c/Python37/Scripts:$PATH"
61 | install:
62 | - pip install coverage
63 | - pip install codecov
64 | - pip install -U pytest
65 | - pip install --upgrade setuptools wheel
66 | script:
67 | - python setup.py install
68 | - cd tests
69 | - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.7" ] ; then
70 | coverage run -m --source=../mlbox/ pytest; fi
71 | - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.7" ] ; then
72 | pytest; fi
73 | - if [ "$TRAVIS_OS_NAME" = "osx" ] ; then pytest; fi
74 | - if [ "$TRAVIS_OS_NAME" = "windows" ] ; then pytest; fi
75 | - cd ..
76 | after_success:
77 | - codecov
78 | deploy:
79 | provider: pypi
80 | user: AxeldeRomblay
81 | password:
82 | secure: l4S5cjkkjhj82j3Tq51/zkBEkjOfSl9xaISu9rmcQNQUbsqp1qrLiKmcMVm0mirNezhTnNdeeCWRyeuvXBNpbRq37KKM6NGScmbAPdCKZeDw6/wDOwjzaMpsnzynq7EiowrgrawwffTa1kP6dgzkG4U/ftjd1jNdNMmOz5MyMnkS2cVv2Uy0o/g7MPQ1hIVAGpoLtnjJ+iGZrQrCWGOr9zp6k003T0xGlS9oEPLM1yid1s1Aeeq8p8Jaee2gGbhpOZ8fySHPcBX2e7TThgoqwfN/wvDzBwko5VPHTaWiVa9FW4zirwyE9EK8LmjAuodF63QOBujO5YTCf1ja5iC5czxZrjNsZCznXmsVqZlyetF2aMofDk++0T0zCmXpMRjivmLV0O/ZSl/HDkMua1TdPuink+FKdGrwCH/IzyeAfT95yVisiRpmgNAhn8/IW/U8v87voquy+YoVL6egSjoB5EyEnzSoojK7qyRPCPmFmKcJHK3aoT3yocwgOSgClqX1gbrYrXAKkXR8lPp7VlZdNKIbKQLu6TILAOVILsAU2MFJbomMAREL/kM9tB3jOj34gKl0qghMOM10BUnWZ3L+MrNamm/0nrnFhlsI8OIVB47ahOnhVZsLk1H2LGZDwBvJTv2gzEG0mUaQaA45/dxJWvR9IZpObEu6T/U/e+uKI+g=
83 | skip_existing: true
84 | skip_cleanup: true
85 | on:
86 | condition: $TRAVIS_OS_NAME != "windows"
87 | repo: AxeldeRomblay/MLBox
88 | branch: master
89 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD-3 License
2 | Copyright (c) 2017, Axel ARONIO DE ROMBLAY
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 | * Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 | * Neither the name of MLBox nor the names of its contributors may be used
14 | to endorse or promote products derived from this software without specific
15 | prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL AXEL ARONIO DE ROMBLAY BE LIABLE FOR ANY
21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md
2 | include *.rst
3 | include *.txt
4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts.
2 |
3 | clean-build: ## remove build artifacts
4 | rm -fr build/
5 | rm -fr dist/
6 | rm -fr .eggs/
7 | find . -name '*.egg-info' -exec rm -fr {} +
8 | find . -name '*.egg' -exec rm -f {} +
9 |
10 | clean-pyc: ## remove Python file artifacts
11 | find . -name '*.pyc' -exec rm -f {} +
12 | find . -name '*.pyo' -exec rm -f {} +
13 | find . -name '*~' -exec rm -f {} +
14 | find . -name '__pycache__' -exec rm -fr {} +
15 |
16 | clean-test: ## remove test and coverage artifacts
17 | cd tests/; \
18 | rm -fr .tox/; \
19 | rm -f .coverage; \
20 | rm -fr htmlcov/
21 |
22 | test: ## run tests quickly with the default Python
23 | cd tests/; \
24 | pytest
25 |
26 | coverage: ## check code coverage quickly with the default Python
27 | cd tests/; \
28 | coverage run -m --source=../mlbox/ pytest;\
29 | coverage html;\
30 | $(BROWSER) htmlcov/index.html
31 |
32 | docs: ## generate Sphinx HTML documentation, including API docs
33 | rm -f docs/mlbox.rst
34 | rm -f docs/modules.rst
35 | sphinx-apidoc -o docs/ mlbox
36 | $(MAKE) -C docs clean
37 | $(MAKE) -C docs html
38 | $(BROWSER) docs/_build/html/index.html
39 |
40 | release: ## package and upload a release
41 | python setup.py sdist upload
42 | python setup.py bdist_wheel upload
43 |
44 | dist: ## builds source and wheel package
45 | python setup.py sdist
46 | python setup.py bdist_wheel
47 | ls -l dist
48 |
49 | install: ## install the package to the active Python's site-packages
50 | python setup.py install
51 |
52 | develop: ## install the package to the active Python's site-packages in developer mode
53 | python setup.py develop
54 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: docs/logos/logo.png
2 |
3 | |Documentation Status| |PyPI version| |Build Status| |GitHub Issues| |codecov| |License| |Downloads| |Python Versions|
4 |
5 | -----------------------
6 |
7 | **MLBox is a powerful Automated Machine Learning python library.** It provides the following features:
8 |
9 |
10 | * Fast reading and distributed data preprocessing/cleaning/formatting
11 | * Highly robust feature selection and leak detection
12 | * Accurate hyper-parameter optimization in high-dimensional space
 13 | * State-of-the-art predictive models for classification and regression (Deep Learning, Stacking, LightGBM,...)
14 | * Prediction with models interpretation
15 |
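A minimal sketch of a typical run with the default pipeline (the file paths and the target column name below are placeholders to adapt to your own datasets):

.. code-block:: python

    from mlbox.preprocessing import *
    from mlbox.optimisation import *
    from mlbox.prediction import *

    paths = ["train.csv", "test.csv"]  # placeholder paths
    target_name = "target"             # placeholder target column

    data = Reader(sep=",").train_test_split(paths, target_name)  # read & preprocess
    data = Drift_thresholder().fit_transform(data)               # remove drifting variables
    Optimiser().evaluate(None, data)                             # evaluate the default pipeline
    Predictor().fit_predict(None, data)                          # train & predict with defaults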
16 |
17 | **For more details**, please refer to the `official documentation `__
18 |
19 |
20 | --------------------------
21 |
22 | How to Contribute
23 | =================
24 |
 25 | MLBox has been developed and used by many active community members. Your help is very valuable in making it better for everyone.
 26 |
 27 | - Check out `call for contributions `__ to see what can be improved, or open an issue if you would like a feature.
 28 | - Contribute to the `tests `__ to make it more reliable.
 29 | - Contribute to the `documents `__ to make them clearer for everyone.
 30 | - Contribute to the `examples `__ to share your experience with other users.
 31 | - Open an `issue `__ if you meet problems during development.
32 |
33 | For more details, please refer to `CONTRIBUTING `__.
34 |
35 | .. |Documentation Status| image:: https://readthedocs.org/projects/mlbox/badge/?version=latest
36 | :target: https://mlbox.readthedocs.io/en/latest/
37 | .. |PyPI version| image:: https://badge.fury.io/py/mlbox.svg
38 | :target: https://pypi.python.org/pypi/mlbox
39 | .. |Build Status| image:: https://travis-ci.org/AxeldeRomblay/MLBox.svg?branch=master
40 | :target: https://travis-ci.org/AxeldeRomblay/MLBox
41 | .. |GitHub Issues| image:: https://img.shields.io/github/issues/AxeldeRomblay/MLBox.svg
42 | :target: https://github.com/AxeldeRomblay/MLBox/issues
43 | .. |codecov| image:: https://codecov.io/gh/AxeldeRomblay/MLBox/branch/master/graph/badge.svg
44 | :target: https://codecov.io/gh/AxeldeRomblay/MLBox
45 | .. |License| image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg
46 | :target: https://github.com/AxeldeRomblay/MLBox/blob/master/LICENSE
47 | .. |Downloads| image:: https://pepy.tech/badge/mlbox
48 | :target: https://pepy.tech/project/mlbox
49 | .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/mlbox.svg
50 | :target: https://pypi.org/project/mlbox
51 |
--------------------------------------------------------------------------------
/VERSION.txt:
--------------------------------------------------------------------------------
1 | 0.8.5
2 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 | token: 989a47e4-aa64-4cbd-8516-52d00e1eb129
3 | notify:
4 | require_ci_to_pass: yes
5 | coverage:
6 | precision: 2
7 | round: up
8 | range: "50...100"
9 | status:
10 | project:
11 | default:
12 | # Commits pushed to master should not make the overall
13 | # project coverage decrease by more than 1%
14 | target: auto
15 | threshold: 1%
16 | patch:
17 | default:
18 | # Be tolerant on slight code coverage diff on PRs to limit
19 | # noisy red coverage status on github PRs.
20 | target: auto
21 | threshold: 1%
22 | changes: no
23 | parsers:
24 | gcov:
25 | branch_detection:
26 | conditional: yes
27 | loop: yes
28 | method: no
29 | macro: no
30 |
31 | comment:
32 | layout: "header, diff"
33 | behavior: default
34 | require_changes: no
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
23 |
24 | help:
25 | @echo "Please use \`make ' where is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and a HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 |
49 | clean:
50 | rm -rf $(BUILDDIR)/*
51 |
52 | html:
53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
54 | @echo
55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
56 |
57 | dirhtml:
58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
59 | @echo
60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
61 |
62 | singlehtml:
63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
64 | @echo
65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
66 |
67 | pickle:
68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
69 | @echo
70 | @echo "Build finished; now you can process the pickle files."
71 |
72 | json:
73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
74 | @echo
75 | @echo "Build finished; now you can process the JSON files."
76 |
77 | htmlhelp:
78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
79 | @echo
80 | @echo "Build finished; now you can run HTML Help Workshop with the" \
81 | ".hhp project file in $(BUILDDIR)/htmlhelp."
82 |
83 | qthelp:
84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
85 | @echo
86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mlbox.qhcp"
89 | @echo "To view the help file:"
90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mlbox.qhc"
91 |
92 | devhelp:
93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
94 | @echo
95 | @echo "Build finished."
96 | @echo "To view the help file:"
97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/mlbox"
98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mlbox"
99 | @echo "# devhelp"
100 |
101 | epub:
102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | @echo
104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 |
106 | latex:
107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | @echo
109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | "(use \`make latexpdf' here to do that automatically)."
112 |
113 | latexpdf:
114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | @echo "Running LaTeX files through pdflatex..."
116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 |
119 | latexpdfja:
120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | @echo "Running LaTeX files through platex and dvipdfmx..."
122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 |
125 | text:
126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | @echo
128 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
129 |
130 | man:
131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | @echo
133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 |
135 | texinfo:
136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | @echo
138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | @echo "Run \`make' in that directory to run these through makeinfo" \
140 | "(use \`make info' here to do that automatically)."
141 |
142 | info:
143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | @echo "Running Texinfo files through makeinfo..."
145 | make -C $(BUILDDIR)/texinfo info
146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 |
148 | gettext:
149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | @echo
151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 |
153 | changes:
154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | @echo
156 | @echo "The overview file is in $(BUILDDIR)/changes."
157 |
158 | linkcheck:
159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | @echo
161 | @echo "Link check complete; look for any errors in the above output " \
162 | "or in $(BUILDDIR)/linkcheck/output.txt."
163 |
164 | doctest:
165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | @echo "Testing of doctests in the sources finished, look at the " \
167 | "results in $(BUILDDIR)/doctest/output.txt."
168 |
169 | xml:
170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | @echo
172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 |
174 | pseudoxml:
175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | @echo
177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 |
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | =======
2 | Authors
3 | =======
4 |
5 | Development Lead
6 | ----------------
7 |
8 | * Axel ARONIO DE ROMBLAY
9 |
10 | * email:
11 | * linkedin:
12 |
13 | Contributors
14 | ------------
15 |
16 | * Nicolas CHEREL
17 | * Mohamed MASKANI
18 | * Henri GERARD
19 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # mlbox documentation build configuration file, created by
5 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | import sys
17 | import os
 16 | from mock import Mock as MagicMock  # mock heavy dependencies so autodoc can import mlbox without them
19 |
20 | class Mock(MagicMock):
21 | @classmethod
22 | def __getattr__(cls, name):
23 | return MagicMock()
24 |
25 | MOCK_MODULES = ['numpy',
26 | 'matplotlib',
27 | 'matplotlib.pyplot',
28 | 'hyperopt',
29 | 'joblib',
30 | 'pandas',
31 | 'sklearn',
32 | 'sklearn.ensemble',
33 | 'sklearn.metrics',
34 | 'sklearn.impute',
35 | 'sklearn.linear_model',
36 | 'sklearn.model_selection',
37 | 'sklearn.tree',
38 | 'sklearn.pipeline',
39 | 'sklearn.preprocessing',
40 | 'tensorflow',
41 | 'tensorflow.keras.layers',
42 | 'tensorflow.keras.models',
43 | 'lightgbm'
44 | ]
45 |
46 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
47 |
48 |
49 | # If extensions (or modules to document with autodoc) are in another
50 | # directory, add these directories to sys.path here. If the directory is
51 | # relative to the documentation root, use os.path.abspath to make it
52 | # absolute, like shown here.
53 | #sys.path.insert(0, os.path.abspath('.'))
54 |
55 | # Get the project root dir, which is the parent dir of this
56 | cwd = os.getcwd()
57 | project_root = os.path.dirname(cwd)
58 |
59 | # Insert the project root dir as the first element in the PYTHONPATH.
60 | # This lets us ensure that the source package is imported, and that its
61 | # version is used.
62 |
63 | sys.path.insert(0, project_root)
64 |
65 | #import mlbox
66 |
67 | # -- General configuration ---------------------------------------------
68 |
69 | # If your documentation needs a minimal Sphinx version, state it here.
70 | #needs_sphinx = '1.0'
71 |
72 | # Add any Sphinx extension module names here, as strings. They can be
73 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
74 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon']
75 | napoleon_numpy_docstring = True
76 |
77 | # Add any paths that contain templates here, relative to this directory.
78 | templates_path = ['_templates']
79 |
80 | # The suffix of source filenames.
81 | source_suffix = '.rst'
82 |
83 | # The encoding of source files.
84 | #source_encoding = 'utf-8-sig'
85 |
86 | # The master toctree document.
87 | master_doc = 'index'
88 |
89 | # General information about the project.
90 | project = u'MLBox'
91 | copyright = u"2017, Axel ARONIO DE ROMBLAY"
92 |
93 | # The version info for the project you're documenting, acts as replacement
94 | # for |version| and |release|, also used in various other places throughout
95 | # the built documents.
96 | #
97 | # The short X.Y version.
98 | #version = mlbox.__version__
99 | # The full version, including alpha/beta/rc tags.
100 | #release = mlbox.__version__
101 |
102 | # The language for content autogenerated by Sphinx. Refer to documentation
103 | # for a list of supported languages.
104 | #language = None
105 |
106 | # There are two options for replacing |today|: either, you set today to
107 | # some non-false value, then it is used:
108 | #today = ''
109 | # Else, today_fmt is used as the format for a strftime call.
110 | #today_fmt = '%B %d, %Y'
111 |
112 | # List of patterns, relative to source directory, that match files and
113 | # directories to ignore when looking for source files.
114 | exclude_patterns = ['_build']
115 |
116 | # The reST default role (used for this markup: `text`) to use for all
117 | # documents.
118 | #default_role = None
119 |
120 | # If true, '()' will be appended to :func: etc. cross-reference text.
121 | #add_function_parentheses = True
122 |
123 | # If true, the current module name will be prepended to all description
124 | # unit titles (such as .. function::).
125 | #add_module_names = True
126 |
127 | # If true, sectionauthor and moduleauthor directives will be shown in the
128 | # output. They are ignored by default.
129 | #show_authors = False
130 |
131 | # The name of the Pygments (syntax highlighting) style to use.
132 | pygments_style = 'sphinx'
133 |
134 | # A list of ignored prefixes for module index sorting.
135 | #modindex_common_prefix = []
136 |
137 | # If true, keep warnings as "system message" paragraphs in the built
138 | # documents.
139 | #keep_warnings = False
140 |
141 |
142 | # -- Options for HTML output -------------------------------------------
143 |
144 | # The theme to use for HTML and HTML Help pages. See the documentation for
145 | # a list of builtin themes.
146 | html_theme = 'default'
147 |
148 | # Theme options are theme-specific and customize the look and feel of a
149 | # theme further. For a list of options available for each theme, see the
150 | # documentation.
151 | #html_theme_options = {}
152 |
153 | # Add any paths that contain custom themes here, relative to this directory.
154 | #html_theme_path = []
155 |
156 | # The name for this set of Sphinx documents. If None, it defaults to
157 | # " v documentation".
158 | html_title = "MLBox Documentation"
159 |
160 | # A shorter title for the navigation bar. Default is the same as
161 | # html_title.
162 | html_short_title = "MLBox Documentation"
163 |
164 | # The name of an image file (relative to this directory) to place at the
165 | # top of the sidebar.
166 | html_logo = "logos/small_logo.png"
167 |
168 | # The name of an image file (within the static path) to use as favicon
169 | # of the docs. This file should be a Windows icon file (.ico) being
170 | # 16x16 or 32x32 pixels large.
171 | html_favicon = "logos/small_logo.ico"
172 |
173 | # Add any paths that contain custom static files (such as style sheets)
174 | # here, relative to this directory. They are copied after the builtin
175 | # static files, so a file named "default.css" will overwrite the builtin
176 | # "default.css".
177 | html_static_path = ['_static']
178 |
179 | # If not '', a 'Last updated on:' timestamp is inserted at every page
180 | # bottom, using the given strftime format.
181 | #html_last_updated_fmt = '%b %d, %Y'
182 |
183 | # If true, SmartyPants will be used to convert quotes and dashes to
184 | # typographically correct entities.
185 | #html_use_smartypants = True
186 |
187 | # Custom sidebar templates, maps document names to template names.
188 | #html_sidebars = {}
189 |
190 | # Additional templates that should be rendered to pages, maps page names
191 | # to template names.
192 | #html_additional_pages = {}
193 |
194 | # If false, no module index is generated.
195 | #html_domain_indices = True
196 |
197 | # If false, no index is generated.
198 | #html_use_index = True
199 |
200 | # If true, the index is split into individual pages for each letter.
201 | #html_split_index = False
202 |
203 | # If true, links to the reST sources are added to the pages.
204 | #html_show_sourcelink = True
205 |
206 | # If true, "Created using Sphinx" is shown in the HTML footer.
207 | # Default is True.
208 | #html_show_sphinx = True
209 |
210 | # If true, "(C) Copyright ..." is shown in the HTML footer.
211 | # Default is True.
212 | html_show_copyright = True
213 |
214 | # If true, an OpenSearch description file will be output, and all pages
215 | # will contain a <link> tag referring to it. The value of this option
216 | # must be the base URL from which the finished HTML is served.
217 | #html_use_opensearch = ''
218 |
219 | # This is the file name suffix for HTML files (e.g. ".xhtml").
220 | #html_file_suffix = None
221 |
222 | # Output file base name for HTML help builder.
223 | htmlhelp_basename = 'mlboxdoc'
224 |
225 |
226 | # -- Options for LaTeX output ------------------------------------------
227 |
228 | latex_elements = {
229 | # The paper size ('letterpaper' or 'a4paper').
230 | #'papersize': 'letterpaper',
231 |
232 | # The font size ('10pt', '11pt' or '12pt').
233 | #'pointsize': '10pt',
234 |
235 | # Additional stuff for the LaTeX preamble.
236 | #'preamble': '',
237 | }
238 |
239 | # Grouping the document tree into LaTeX files. List of tuples
240 | # (source start file, target name, title, author, documentclass
241 | # [howto/manual]).
242 | latex_documents = [
243 | ('index', 'mlbox.tex',
244 | u'MLBox Documentation',
245 | u'Axel ARONIO DE ROMBLAY', 'manual'),
246 | ]
247 |
248 | # The name of an image file (relative to this directory) to place at
249 | # the top of the title page.
250 | #latex_logo = None
251 |
252 | # For "manual" documents, if this is true, then toplevel headings
253 | # are parts, not chapters.
254 | #latex_use_parts = False
255 |
256 | # If true, show page references after internal links.
257 | #latex_show_pagerefs = False
258 |
259 | # If true, show URL addresses after external links.
260 | #latex_show_urls = False
261 |
262 | # Documents to append as an appendix to all manuals.
263 | #latex_appendices = []
264 |
265 | # If false, no module index is generated.
266 | #latex_domain_indices = True
267 |
268 |
269 | # -- Options for manual page output ------------------------------------
270 |
271 | # One entry per manual page. List of tuples
272 | # (source start file, name, description, authors, manual section).
273 | man_pages = [
274 | ('index', 'mlbox',
275 | u'MLBox Documentation',
276 | [u'Axel ARONIO DE ROMBLAY'], 1)
277 | ]
278 |
279 | # If true, show URL addresses after external links.
280 | #man_show_urls = False
281 |
282 |
283 | # -- Options for Texinfo output ----------------------------------------
284 |
285 | # Grouping the document tree into Texinfo files. List of tuples
286 | # (source start file, target name, title, author,
287 | # dir menu entry, description, category)
288 | texinfo_documents = [
289 | ('index', 'mlbox',
290 | u'MLBox Documentation',
291 | u'Axel ARONIO DE ROMBLAY',
292 | 'mlbox',
293 | 'MLBox is a powerful Automated Machine Learning python library.',
294 | 'Miscellaneous'),
295 | ]
296 |
297 | # Documents to append as an appendix to all manuals.
298 | #texinfo_appendices = []
299 |
300 | # If false, no module index is generated.
301 | #texinfo_domain_indices = True
302 |
303 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
304 | #texinfo_show_urls = 'footnote'
305 |
306 | # If true, do not generate a @detailmenu in the "Top" node's menu.
307 | #texinfo_no_detailmenu = False
308 |
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Contributing
3 | ============
4 |
5 | Contributions are welcome, and they are greatly appreciated! Every
6 | little bit helps, and credit will always be given.
7 |
8 | You can contribute in many ways:
9 |
10 | Types of Contributions
11 | ----------------------
12 |
13 | Report Bugs
14 | ~~~~~~~~~~~
15 |
16 | Report bugs at https://github.com/AxeldeRomblay/mlbox/issues.
17 |
18 | If you are reporting a bug, please include:
19 |
20 | * Your operating system name and version.
21 | * Any details about your local setup that might be helpful in troubleshooting.
22 | * The smallest possible example to reproduce the bug.
23 |
24 | Fix Bugs
25 | ~~~~~~~~
26 |
27 | Look through the GitHub issues for bugs. Anything tagged with "bug"
28 | and "help wanted" is open to whoever wants to implement it.
29 |
30 | Implement Features
31 | ~~~~~~~~~~~~~~~~~~
32 |
33 | Look through the GitHub issues for features. Anything tagged with "enhancement"
34 | and "help wanted" is open to whoever wants to implement it.
35 |
36 | Write Documentation
37 | ~~~~~~~~~~~~~~~~~~~
38 |
39 | MLBox could always use more documentation, whether as part of the
40 | official MLBox docs, in docstrings, or even on the web in blog posts,
41 | articles, and such.
42 |
43 | Submit Feedback
44 | ~~~~~~~~~~~~~~~
45 |
46 | The best way to send feedback is to file an issue at https://github.com/AxeldeRomblay/mlbox/issues.
47 |
48 | If you are proposing a feature:
49 |
50 | * Explain in detail how it would work.
51 | * Keep the scope as narrow as possible, to make it easier to implement.
52 | * Remember that this is a volunteer-driven project, and that contributions
53 | are welcome :)
54 |
55 | Get Started!
56 | ------------
57 |
58 | Ready to contribute? Here's how to set up `mlbox` for local development.
59 |
60 | 1. Fork the `mlbox` repo on GitHub.
61 |
62 | 2. Clone your fork::
63 |
64 | $ git clone git@github.com:your_name_here/mlbox.git
65 |
 66 | 3. If you already have virtualenv installed, skip this step. Otherwise, run the following::
67 |
68 | $ pip install virtualenv
69 |
 70 | 4. Install your local copy into a virtualenv using the following commands to set up your fork for local development::
71 |
72 | $ cd MLBox
73 | $ virtualenv env
74 | $ source env/bin/activate
75 | $ python setup.py develop
76 |
 77 | If you have any trouble with the setup, please refer to the `installation guide `__
78 |
79 | 5. Create a branch for local development::
80 |
81 | $ git checkout -b name-of-your-bugfix-or-feature
82 |
83 | **Now you're set, you can make your changes locally.**
84 |
 85 | NOTE: each time you work on your branch, you will need to activate the virtualenv: ``$ source env/bin/activate``. To deactivate it, simply run: ``$ deactivate``.
86 |
87 | 6. When you're done making changes, check that your changes pass the tests.
88 |
 89 | NOTE: you need to install **pytest** before running the tests::
90 |
91 | $ cd tests
92 | $ pytest
93 |
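While iterating, you can also point pytest at a single test module, e.g.::

    $ pytest test_reader.py
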
94 | 7. Commit your changes and push your branch to GitHub::
95 |
96 | $ git add .
97 | $ git commit -m "Your detailed description of your changes."
98 | $ git push origin name-of-your-bugfix-or-feature
99 |
100 | 8. Submit a pull request through the GitHub website.
101 |
102 | Pull Request Guidelines
103 | -----------------------
104 |
105 | Before you submit a pull request, check that it meets these guidelines:
106 |
107 | 1. The pull request should include tests.
108 | 2. If the pull request adds functionality, the docs should be updated. Put
109 | your new functionality into a function with a docstring.
110 | 3. The pull request should work for all supported Python versions and for PyPy. Check
111 | https://travis-ci.org/AxeldeRomblay/MLBox/pull_requests
112 | and make sure that the tests pass for all supported Python versions.
113 |
--------------------------------------------------------------------------------
/docs/features.rst:
--------------------------------------------------------------------------------
1 | Preprocessing
2 | =============
3 |
4 | Reading
5 | -------
6 |
7 | .. autoclass:: mlbox.preprocessing.Reader
8 | :members:
9 |
10 | Drift thresholding
11 | ------------------
12 |
13 | .. autoclass:: mlbox.preprocessing.Drift_thresholder
14 | :members:
15 |
16 | Encoding
17 | ========
18 |
19 | Missing values
20 | --------------
21 |
22 | .. autoclass:: mlbox.encoding.NA_encoder
23 | :members:
24 |
25 | Categorical features
26 | --------------------
27 |
28 | .. autoclass:: mlbox.encoding.Categorical_encoder
29 | :members:
30 |
31 | Model
32 | =====
33 |
34 | Classification
35 | --------------
36 |
37 | Feature selection
38 | ~~~~~~~~~~~~~~~~~
39 |
40 | .. autoclass:: mlbox.model.classification.Clf_feature_selector
41 | :members:
42 |
43 | Classification
44 | ~~~~~~~~~~~~~~
45 |
46 | .. autoclass:: mlbox.model.classification.Classifier
47 | :members:
48 |
49 | Stacking
50 | ~~~~~~~~
51 |
52 | .. autoclass:: mlbox.model.classification.StackingClassifier
53 | :members:
54 |
55 | Regression
56 | ----------
57 |
58 | Feature selection
59 | ~~~~~~~~~~~~~~~~~
60 |
61 | .. autoclass:: mlbox.model.regression.Reg_feature_selector
62 | :members:
63 |
64 | Regression
65 | ~~~~~~~~~~
66 |
67 | .. autoclass:: mlbox.model.regression.Regressor
68 | :members:
69 |
70 | Stacking
71 | ~~~~~~~~
72 |
73 | .. autoclass:: mlbox.model.regression.StackingRegressor
74 | :members:
75 |
76 |
77 | Optimisation
78 | ============
79 |
80 | .. autoclass:: mlbox.optimisation.Optimiser
81 | :members:
82 |
83 | Prediction
84 | ==========
85 |
86 | .. autoclass:: mlbox.prediction.Predictor
87 | :members:
88 |
--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | History
2 | =======
3 |
4 | 0.1.0 (2017-02-09)
5 | ------------------
6 | * First non-official release.
7 |
8 | 0.1.1 (2017-02-23)
9 | ------------------
 10 | * added several estimators: Random Forest, Extra Trees, Logistic Regression, ...
 11 | * improved verbose mode for the reader.
 12 |
 13 | 0.1.2 (2017-03-02)
 14 | ------------------
 15 | * added dropout for entity embeddings.
 16 | * improved the optimiser.
 17 |
 18 | 0.2.0 (2017-03-22)
 19 | ------------------
 20 | * added feature importances for base learners.
 21 | * added leak detection.
 22 | * added a stacking meta-model.
 23 | * improved verbose mode for the optimiser (folds variance).
 24 |
 25 | 0.2.1 (2017-04-26)
 26 | ------------------
 27 | * added feature importances for bagging and boosting meta-models.
 28 |
 29 | 0.2.2 (first official release: 2017-06-13)
 30 | -------------------------------------------
 31 | * updated dependencies (Keras 2.0, ...).
 32 | * added the LightGBM model.
 33 |
 34 | 0.3.0 (2017-07-11)
 35 | ------------------
 36 | * Python 2.7 & Python 3.4-3.6 compatibility.
 37 |
 38 | 0.3.1 (2017-07-12)
 39 | ------------------
 40 | * Availability on PyPI.
 41 |
 42 | 0.4.0 (2017-07-18)
 43 | ------------------
 44 | * added pipeline memory.
 45 |
 46 | 0.4.1 (2017-07-21)
 47 | ------------------
 48 | * improved verbose mode for the reader (display missing values).
 49 |
 50 | 0.4.2 (2017-07-25)
 51 | ------------------
 52 | * updated dependencies.
 53 |
 54 | 0.4.3 (2017-07-26)
 55 | ------------------
 56 | * improved verbose mode for the predictor (display feature importances).
 57 | * wait until modules and engines are imported.
 58 |
 59 | 0.4.4 (2017-08-04)
 60 | ------------------
 61 | * PEP 8 style.
 62 | * normalisation of drift coefficients.
 63 | * warning on the size of the 'save' folder.
 64 |
 65 | 0.5.0 (2017-08-24)
 66 | ------------------
 67 | * improved verbose mode.
 68 | * added new date features.
 69 | * added a new strategy for missing categorical values.
 70 | * new parallel computing.
 71 |
 72 | 0.5.1 (2017-08-25)
 73 | ------------------
 74 | * improved verbose mode for the reader (display target quantiles for regression).
 75 |
 76 | 0.6.0 (2019-04-26)
 77 | ------------------
 78 | * removed the xgboost installation.
 79 |
 80 | 0.7.0 (2019-06-26)
 81 | ------------------
 82 | * added support for macOS & Windows.
 83 | * updated supported Python versions.
 84 | * improved setup.
 85 | * added tests.
 86 | * improved documentation & examples.
 87 | * minor changes in the package architecture.
 88 |
 89 | 0.8.0 (2019-07-29)
 90 | ------------------
 91 | * removed support for Python 2.7.
 92 |
 93 | 0.8.1 (2019-08-29)
 94 | ------------------
 95 | * added Python 3.7 support.
 96 | * updated package dependencies.
 97 |
 98 | 0.8.4 (2020-04-13)
 99 | ------------------
100 | * updated package dependencies.
101 |
102 | 0.8.5 (2020-08-25)
103 | ------------------
104 | * minor fix (package dependencies).
105 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Home - Welcome to MLBox's official documentation
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 |
4 | ------------------
5 |
6 | .. image:: logos/logo.png
7 |
8 |
9 | **MLBox is a powerful Automated Machine Learning python library.**
10 | It provides the following features:
11 |
12 | * Fast reading and distributed data preprocessing/cleaning/formatting.
13 | * Highly robust feature selection and leak detection.
14 | * Accurate hyper-parameter optimization in high-dimensional space.
 15 | * State-of-the-art predictive models for classification and regression (Deep Learning, Stacking, LightGBM,...).
16 | * Prediction with models interpretation.
17 |
18 | -------------------
19 |
20 | Links
21 | ~~~~~
22 |
23 | * **Performance experiments:**
24 | * `Kaggle competition "Two Sigma Connect: Rental Listing Inquiries" `__ (rank: **85/2488**)
25 | * `Kaggle competition "Sberbank Russian Housing Market" `__ (rank: **190/3274**)
26 |
27 | * **Examples & demos:**
28 | * `Kaggle kernel on "Titanic" dataset `__ (classification)
29 | * `Kaggle kernel on "House Prices" dataset `__ (regression)
30 |
31 | * **Articles, books & tutorials from users:**
32 | * `Tutorial on Automated Machine Learning using MLBox `__ (Analytics Vidhya article)
33 | * `MLBox: a short regression tutorial `__ (user blog)
34 | * `Implementing Auto-ML Systems with Open Source Tools `__ (KDnuggets article)
35 | * `Hands-On Automated Machine Learning `__ (O'Reilly book)
36 | * `Automatic Machine Learning `__ (Youtube tutorial)
37 | * `Automated Machine Learning with MLBox `__ (user blog)
38 | * `Introduction to AutoML with MLBox `__ (user blog)
39 |
40 | * **Webinars & conferences:**
41 | * `Paris ML Hors Série #13: Automated Machine Learning `__
42 | * `Analytics Vidhya: Automated Machine Learning using MLBox python package `__
43 | * `DataHack Summit 2017 by Analytics Vidhya `__
44 |
45 | * **References:**
46 | * `AutoML.org `__
47 | * `Skymind AI Wiki `__
48 | * `TPOT github `__
49 | * `Towards Data Science `__
50 |
51 |
52 | .. toctree::
53 | :maxdepth: 1
54 | :caption: Tutorials
55 | :hidden:
56 |
57 | installation
58 | introduction
59 |
60 | .. toctree::
61 | :maxdepth: 3
62 | :caption: Features
63 | :hidden:
64 |
65 | features
66 |
67 | .. toctree::
68 | :maxdepth: 1
69 | :caption: Contribution
70 | :hidden:
71 |
72 | authors
73 | history
74 | contributing
75 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | Installation guide
2 | ==================
3 |
4 | |Documentation Status| |PyPI version| |Build Status| |GitHub Issues| |codecov| |License| |Downloads| |Python Versions|
5 |
6 |
7 | Compatibilities
8 | ---------------
9 |
10 | * *Operating systems:* **Linux**, **MacOS** & **Windows**.
 11 | * *Python versions:* **3.5** - **3.7**, **64-bit** only (32-bit Python is not supported)
12 |
13 |
14 | Basic requirements
15 | ------------------
16 |
 17 | We assume that `pip `__ is already installed.
18 |
19 | Also, please make sure you have `setuptools `__ and `wheel `__ installed, which is usually the case if pip is installed.
20 | If not, you can install both by running the following commands respectively: ``pip install setuptools`` and ``pip install wheel``.
21 |
22 |
23 | Preparation (MacOS only)
24 | ------------------------
25 |
 26 | For **MacOS** users only, **OpenMP** is required. You can install it with the following command: ``brew install libomp``.
27 |
28 |
29 | Installation
30 | ------------
31 |
 32 | You can choose to install MLBox either from pip or from GitHub.
33 |
34 |
35 | Install from pip
36 | ~~~~~~~~~~~~~~~~
37 |
38 | Official releases of MLBox are available on **PyPI**, so you only need to run the following command:
39 |
40 | .. code-block:: console
41 |
42 | $ pip install mlbox
43 |
44 |
 45 | Install from GitHub
 46 | ~~~~~~~~~~~~~~~~~~~~~~~
 47 |
 48 | If you want to get the latest features, you can also install MLBox from GitHub.
49 |
50 | * **The sources for MLBox can be downloaded** from the `Github repo`_.
51 |
52 | * You can either clone the public repository:
53 |
54 | .. code-block:: console
55 |
56 | $ git clone git://github.com/AxeldeRomblay/mlbox
57 |
58 | * Or download the `tarball`_:
59 |
60 | .. code-block:: console
61 |
62 | $ curl -OL https://github.com/AxeldeRomblay/mlbox/tarball/master
63 |
64 |
65 | * Once you have a copy of the source, **you can install it**:
66 |
67 | .. code-block:: console
68 |
69 | $ cd MLBox
70 | $ python setup.py install
71 |
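* As a quick sanity check that the package is importable (assuming ``mlbox`` exposes ``__version__``, which is read from ``VERSION.txt``):

.. code-block:: console

    $ python -c "import mlbox; print(mlbox.__version__)"
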
72 |
73 | Issues
74 | ------
75 |
 76 | If you run into any trouble during installation, you can refer to the `issues `__.
77 |
78 | **Please first check that there are no similar issues opened before opening one**.
79 |
80 |
81 | .. _Github repo: https://github.com/AxeldeRomblay/mlbox
82 |
83 | .. _tarball: https://github.com/AxeldeRomblay/mlbox/tarball/master
84 |
85 | .. |Documentation Status| image:: https://readthedocs.org/projects/mlbox/badge/?version=latest
86 | :target: http://mlbox.readthedocs.io/en/latest/?badge=latest
87 | .. |PyPI version| image:: https://badge.fury.io/py/mlbox.svg
88 | :target: https://pypi.python.org/pypi/mlbox
89 | .. |Build Status| image:: https://travis-ci.org/AxeldeRomblay/MLBox.svg?branch=master
90 | :target: https://travis-ci.org/AxeldeRomblay/MLBox
91 | .. |GitHub Issues| image:: https://img.shields.io/github/issues/AxeldeRomblay/MLBox.svg
92 | :target: https://github.com/AxeldeRomblay/MLBox/issues
93 | .. |codecov| image:: https://codecov.io/gh/AxeldeRomblay/MLBox/branch/master/graph/badge.svg
94 | :target: https://codecov.io/gh/AxeldeRomblay/MLBox
95 | .. |License| image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg
96 | :target: https://github.com/AxeldeRomblay/MLBox/blob/master/LICENSE
97 | .. |Downloads| image:: https://pepy.tech/badge/mlbox
98 | :target: https://pepy.tech/project/mlbox
99 | .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/mlbox.svg
100 | :target: https://pypi.org/project/mlbox
101 |
--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
1 | Getting started: 30 seconds to MLBox
2 | ====================================
3 |
 4 | The MLBox main package contains 3 sub-packages: **preprocessing**, **optimisation** and **prediction**. They are respectively aimed at reading and preprocessing data, testing or optimising a wide range of learners, and predicting the target on a test dataset.
5 |
 6 | **Here are a few lines to import MLBox:**
7 |
8 | .. code-block:: python
9 |
10 | from mlbox.preprocessing import *
11 | from mlbox.optimisation import *
12 | from mlbox.prediction import *
13 |
14 |
 15 | **Then, all you need to provide is:**
 16 |
 17 | * the list of paths to your train and test datasets
 18 | * the name of the target you are trying to predict (classification or regression)
19 |
20 | .. code-block:: python
21 |
22 | paths = [".csv", ".csv", ..., ".csv"] #to modify
23 | target_name = "" #to modify
24 |
25 |
 26 | **Now, let MLBox do the job!**
 27 |
 28 | ... to read and preprocess your files:
29 |
30 | .. code-block:: python
31 |
32 | data = Reader(sep=",").train_test_split(paths, target_name) #reading
33 | data = Drift_thresholder().fit_transform(data) #deleting non-stable variables
34 |
 35 | ... to evaluate models (here with the default configuration):
36 |
37 | .. code-block:: python
38 |
39 | Optimiser().evaluate(None, data)
40 |
41 |
 42 | ... or to test and optimise the whole pipeline [**OPTIONAL**], composed of the following steps:
43 |
44 | * missing data encoder, aka 'ne'
45 | * categorical variables encoder, aka 'ce'
46 | * feature selector, aka 'fs'
47 | * meta-features stacker, aka 'stck'
48 | * final estimator, aka 'est'
49 |
 50 | **NB**: please have a look at all the possibilities you have to configure the pipeline (steps, parameters and values...)
51 |
52 | .. code-block:: python
53 |
54 | space = {
55 |
56 | 'ne__numerical_strategy' : {"space" : [0, 'mean']},
57 |
58 | 'ce__strategy' : {"space" : ["label_encoding", "random_projection", "entity_embedding"]},
59 |
60 | 'fs__strategy' : {"space" : ["variance", "rf_feature_importance"]},
61 | 'fs__threshold': {"search" : "choice", "space" : [0.1, 0.2, 0.3]},
62 |
63 | 'est__strategy' : {"space" : ["LightGBM"]},
64 | 'est__max_depth' : {"search" : "choice", "space" : [5,6]},
65 | 'est__subsample' : {"search" : "uniform", "space" : [0.6,0.9]}
66 |
67 | }
68 |
 69 | best = Optimiser().optimise(space, data, max_evals=5)
70 |
 71 | ... finally to predict on the test set with the best parameters (or ``None`` for the default configuration):
72 |
73 | .. code-block:: python
74 |
75 | Predictor().fit_predict(best, data)
76 |
77 |
 78 | **That's all!** You can have a look at the "save" folder, where you can find:
79 |
80 | * your predictions
81 | * feature importances
82 | * drift coefficients of your variables (0.5 = very stable, 1. = not stable at all)
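Putting the steps above together, a minimal end-to-end script might look like this (the file paths and the target column name are placeholders to adapt):

.. code-block:: python

    from mlbox.preprocessing import *
    from mlbox.optimisation import *
    from mlbox.prediction import *

    paths = ["train.csv", "test.csv"]  # placeholder paths
    target_name = "target"             # placeholder target column

    data = Reader(sep=",").train_test_split(paths, target_name)
    data = Drift_thresholder().fit_transform(data)

    space = {'est__strategy': {"space": ["LightGBM"]},
             'est__max_depth': {"search": "choice", "space": [5, 6]}}

    best = Optimiser().optimise(space, data, max_evals=5)
    Predictor().fit_predict(best, data)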
83 |
--------------------------------------------------------------------------------
/docs/logos/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/logo.png
--------------------------------------------------------------------------------
/docs/logos/small_logo.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/small_logo.ico
--------------------------------------------------------------------------------
/docs/logos/small_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/small_logo.png
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
 20 | echo.Please use `make ^<target^>` where ^<target^> is one of
21 | echo. html to make standalone HTML files
22 | echo. dirhtml to make HTML files named index.html in directories
23 | echo. singlehtml to make a single large HTML file
24 | echo. pickle to make pickle files
25 | echo. json to make JSON files
26 | echo. htmlhelp to make HTML files and a HTML help project
27 | echo. qthelp to make HTML files and a qthelp project
28 | echo. devhelp to make HTML files and a Devhelp project
29 | echo. epub to make an epub
30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | echo. text to make text files
32 | echo. man to make manual pages
33 | echo. texinfo to make Texinfo files
34 | echo. gettext to make PO message catalogs
35 | echo. changes to make an overview over all changed/added/deprecated items
36 | echo. xml to make Docutils-native XML files
37 | echo. pseudoxml to make pseudoxml-XML files for display purposes
38 | echo. linkcheck to check all external links for integrity
39 | echo. doctest to run all doctests embedded in the documentation if enabled
40 | goto end
41 | )
42 |
43 | if "%1" == "clean" (
44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
45 | del /q /s %BUILDDIR%\*
46 | goto end
47 | )
48 |
49 |
50 | %SPHINXBUILD% 2> nul
51 | if errorlevel 9009 (
52 | echo.
53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
54 | echo.installed, then set the SPHINXBUILD environment variable to point
55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
56 | echo.may add the Sphinx directory to PATH.
57 | echo.
58 | echo.If you don't have Sphinx installed, grab it from
59 | echo.http://sphinx-doc.org/
60 | exit /b 1
61 | )
62 |
63 | if "%1" == "html" (
64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
65 | if errorlevel 1 exit /b 1
66 | echo.
67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
68 | goto end
69 | )
70 |
71 | if "%1" == "dirhtml" (
72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
73 | if errorlevel 1 exit /b 1
74 | echo.
75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
76 | goto end
77 | )
78 |
79 | if "%1" == "singlehtml" (
80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
81 | if errorlevel 1 exit /b 1
82 | echo.
83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
84 | goto end
85 | )
86 |
87 | if "%1" == "pickle" (
88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
89 | if errorlevel 1 exit /b 1
90 | echo.
91 | echo.Build finished; now you can process the pickle files.
92 | goto end
93 | )
94 |
95 | if "%1" == "json" (
96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
97 | if errorlevel 1 exit /b 1
98 | echo.
99 | echo.Build finished; now you can process the JSON files.
100 | goto end
101 | )
102 |
103 | if "%1" == "htmlhelp" (
104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
105 | if errorlevel 1 exit /b 1
106 | echo.
107 | echo.Build finished; now you can run HTML Help Workshop with the ^
108 | .hhp project file in %BUILDDIR%/htmlhelp.
109 | goto end
110 | )
111 |
112 | if "%1" == "qthelp" (
113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
114 | if errorlevel 1 exit /b 1
115 | echo.
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
117 | .qhcp project file in %BUILDDIR%/qthelp, like this:
118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\mlbox.qhcp
119 | echo.To view the help file:
120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\mlbox.qhc
121 | goto end
122 | )
123 |
124 | if "%1" == "devhelp" (
125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
126 | if errorlevel 1 exit /b 1
127 | echo.
128 | echo.Build finished.
129 | goto end
130 | )
131 |
132 | if "%1" == "epub" (
133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
134 | if errorlevel 1 exit /b 1
135 | echo.
136 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
137 | goto end
138 | )
139 |
140 | if "%1" == "latex" (
141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
142 | if errorlevel 1 exit /b 1
143 | echo.
144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
145 | goto end
146 | )
147 |
148 | if "%1" == "latexpdf" (
149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
150 | cd %BUILDDIR%/latex
151 | make all-pdf
152 | cd %BUILDDIR%/..
153 | echo.
154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
155 | goto end
156 | )
157 |
158 | if "%1" == "latexpdfja" (
159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
160 | cd %BUILDDIR%/latex
161 | make all-pdf-ja
162 | cd %BUILDDIR%/..
163 | echo.
164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
165 | goto end
166 | )
167 |
168 | if "%1" == "text" (
169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
170 | if errorlevel 1 exit /b 1
171 | echo.
172 | echo.Build finished. The text files are in %BUILDDIR%/text.
173 | goto end
174 | )
175 |
176 | if "%1" == "man" (
177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
178 | if errorlevel 1 exit /b 1
179 | echo.
180 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
181 | goto end
182 | )
183 |
184 | if "%1" == "texinfo" (
185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
186 | if errorlevel 1 exit /b 1
187 | echo.
188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | goto end
190 | )
191 |
192 | if "%1" == "gettext" (
193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
194 | if errorlevel 1 exit /b 1
195 | echo.
196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
197 | goto end
198 | )
199 |
200 | if "%1" == "changes" (
201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
202 | if errorlevel 1 exit /b 1
203 | echo.
204 | echo.The overview file is in %BUILDDIR%/changes.
205 | goto end
206 | )
207 |
208 | if "%1" == "linkcheck" (
209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
210 | if errorlevel 1 exit /b 1
211 | echo.
212 | echo.Link check complete; look for any errors in the above output ^
213 | or in %BUILDDIR%/linkcheck/output.txt.
214 | goto end
215 | )
216 |
217 | if "%1" == "doctest" (
218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
219 | if errorlevel 1 exit /b 1
220 | echo.
221 | echo.Testing of doctests in the sources finished, look at the ^
222 | results in %BUILDDIR%/doctest/output.txt.
223 | goto end
224 | )
225 |
226 | if "%1" == "xml" (
227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
228 | if errorlevel 1 exit /b 1
229 | echo.
230 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | goto end
232 | )
233 |
234 | if "%1" == "pseudoxml" (
235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
236 | if errorlevel 1 exit /b 1
237 | echo.
238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
239 | goto end
240 | )
241 |
242 | :end
243 |
--------------------------------------------------------------------------------
/docs/webinars/auto-ML.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/webinars/auto-ML.pdf
--------------------------------------------------------------------------------
/docs/webinars/features.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/webinars/features.pdf
--------------------------------------------------------------------------------
/examples/classification/classification.py:
--------------------------------------------------------------------------------
1 | """A classification example using mlbox."""
2 | from mlbox.preprocessing import Reader
3 | from mlbox.preprocessing import Drift_thresholder
4 | from mlbox.optimisation import Optimiser
5 | from mlbox.prediction import Predictor
6 |
7 | # Paths to the train set and the test set.
8 | paths = ["train_classification.csv", "test_classification.csv"]
9 | # Name of the feature to predict.
10 | # This column should only be present in the train set.
11 | target_name = "Survived"
12 |
13 | # Reading and cleaning all files
14 | # Declare a reader for csv files
15 | rd = Reader(sep=',')
16 | # Returns a dictionary containing three entries:
17 | # dict["train"] contains the training samples without the target column
18 | # dict["test"] contains the test samples without the target column
19 | # dict["target"] contains the target column for the training samples.
20 | data = rd.train_test_split(paths, target_name)
21 |
22 | dft = Drift_thresholder()
23 | data = dft.fit_transform(data)
24 |
25 | # Tuning
26 | # Declare an optimiser. Available scoring metrics for classification:
27 | # {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"}
28 | opt = Optimiser(scoring='accuracy', n_folds=3)
29 | opt.evaluate(None, data)
30 |
31 | # Space of hyperparameters
32 | # The keys must respect the following syntax: "enc__param".
33 | # "enc" = "ne" for the NA encoder
34 | # "enc" = "ce" for the categorical encoder
35 | # "enc" = "fs" for the feature selector [OPTIONAL]
36 | # "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
37 | # "enc" = "est" for the final estimator
38 | # "param": a valid parameter of the corresponding step.
39 | # Ex: "max_depth" for "enc"="est", ...
40 | # The values must respect the syntax: {"search": strategy, "space": list}
41 | # "strategy" = "choice" or "uniform". Default = "choice"
42 | # list: a list of values to be tested if strategy = "choice".
43 | # Else, list = [value_min, value_max].
44 | # Available strategies for ne__numerical_strategy: an integer, a float,
45 | # or one of {"mean", "median", "most_frequent"}
46 | # Available strategies for ce__strategy:
47 | # {"label_encoding", "dummification", "random_projection", "entity_embedding"}
48 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
49 | 'ce__strategy': {"search": "choice",
50 | "space": ["label_encoding",
51 | "random_projection",
52 | "entity_embedding"]},
53 | 'fs__threshold': {"search": "uniform",
54 | "space": [0.01, 0.3]},
55 | 'est__max_depth': {"search": "choice",
56 | "space": [3, 4, 5, 6, 7]}
57 |
58 | }
59 |
60 | # Optimises hyper-parameters of the whole Pipeline with a given scoring
61 | # function. Algorithm used to optimize : Tree Parzen Estimator.
62 | #
63 | # IMPORTANT : Try to avoid dependent parameters and to set one feature
64 | # selection strategy and one estimator strategy at a time.
65 | best = opt.optimise(space, data, 15)
66 |
67 | # Make prediction and save the results in save folder.
68 | prd = Predictor()
69 | prd.fit_predict(best, data)
70 |
--------------------------------------------------------------------------------
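A minimal follow-up sketch, meant to be appended to the script above: `best` (the dict of tuned hyper-parameters returned by `opt.optimise`) can be passed back to `opt.evaluate` to re-score the tuned pipeline before predicting. This assumes `evaluate` returns the mean CV score, as its use with `None` above suggests:

    # Re-score the tuned pipeline with the same 3-fold CV.
    score = opt.evaluate(best, data)
    print(score)
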
/examples/regression/regression.py:
--------------------------------------------------------------------------------
1 | """A regression example using mlbox."""
2 | import numpy as np
3 |
4 | from mlbox.preprocessing import Reader
5 | from mlbox.preprocessing import Drift_thresholder
6 | from mlbox.optimisation import make_scorer
7 | from mlbox.optimisation import Optimiser
8 | from mlbox.prediction import Predictor
9 |
10 | # Paths to the train set and the test set.
11 | paths = ["train_regression.csv", "test_regression.csv"]
12 | # Name of the feature to predict.
13 | # This column should only be present in the train set.
14 | target_name = "SalePrice"
15 |
16 | # Reading and cleaning all files
17 | # Declare a reader for csv files
18 | rd = Reader(sep=',')
19 | # Returns a dictionary containing three entries:
20 | # dict["train"] contains the training samples without the target column
21 | # dict["test"] contains the test samples without the target column
22 | # dict["target"] contains the target column for the training samples.
23 | data = rd.train_test_split(paths, target_name)
24 |
25 | dft = Drift_thresholder()
26 | data = dft.fit_transform(data)
27 |
28 | # Tuning
29 | mape = make_scorer(lambda y_true,
30 | y_pred: 100*np.sum(
31 | np.abs(y_true-y_pred)/y_true
32 | )/len(y_true),
33 | greater_is_better=False,
34 | needs_proba=False)
35 | # Declare an optimiser. You can define your own scorer,
36 | # as shown above, or use one of:
37 | # {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error", "r2"}
38 | opt = Optimiser(scoring=mape, n_folds=3)
39 | opt.evaluate(None, data)
40 |
41 | # Space of hyperparameters
42 | # The keys must respect the following syntax: "enc__param".
43 | # "enc" = "ne" for the NA encoder
44 | # "enc" = "ce" for the categorical encoder
45 | # "enc" = "fs" for the feature selector [OPTIONAL]
46 | # "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
47 | # "enc" = "est" for the final estimator
48 | # "param": a valid parameter of the corresponding step.
49 | # Ex: "max_depth" for "enc"="est", ...
50 | # The values must respect the syntax: {"search": strategy, "space": list}
51 | # "strategy" = "choice" or "uniform". Default = "choice"
52 | # list: a list of values to be tested if strategy = "choice".
53 | # Else, list = [value_min, value_max].
54 | # Available strategies for ne__numerical_strategy: an integer, a float,
55 | # or one of {"mean", "median", "most_frequent"}
56 | # Available strategies for ce__strategy:
57 | # {"label_encoding", "dummification", "random_projection", "entity_embedding"}
58 | space = {
59 | 'ne__numerical_strategy': {"search": "choice",
60 | "space": [0]},
61 | 'ce__strategy': {"search": "choice",
62 | "space": ["label_encoding",
63 | "random_projection",
64 | "entity_embedding"]},
65 | 'fs__threshold': {"search": "uniform",
66 | "space": [0.01, 0.3]},
67 | 'est__max_depth': {"search": "choice",
68 | "space": [3, 4, 5, 6, 7]}
69 |
70 | }
71 |
72 | # Optimises hyper-parameters of the whole Pipeline with a given scoring
73 | # function. Algorithm used to optimize : Tree Parzen Estimator.
74 | #
75 | # IMPORTANT : Try to avoid dependent parameters and to set one feature
76 | # selection strategy and one estimator strategy at a time.
77 | best = opt.optimise(space, data, 15)
78 |
79 | # Make prediction and save the results in save folder.
80 | prd = Predictor()
81 | prd.fit_predict(best, data)
82 |
--------------------------------------------------------------------------------
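The custom scorer above is the mean absolute percentage error (MAPE). A quick numeric check of the formula on toy values, independent of mlbox:

    import numpy as np

    y_true = np.array([100.0, 200.0, 400.0])
    y_pred = np.array([110.0, 180.0, 400.0])
    # (10% + 10% + 0%) / 3 ~= 6.67
    mape = 100 * np.sum(np.abs(y_true - y_pred) / y_true) / len(y_true)
    print(mape)
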
/mlbox/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | __author__ = """Axel ARONIO DE ROMBLAY"""
4 | __email__ = 'axelderomblay@gmail.com'
5 |
6 | from .preprocessing import *
7 | from .encoding import *
8 | from .optimisation import *
9 | from .prediction import *
10 | from .model import *
11 |
--------------------------------------------------------------------------------
/mlbox/encoding/__init__.py:
--------------------------------------------------------------------------------
1 | from .na_encoder import *
2 | from .categorical_encoder import *
3 |
--------------------------------------------------------------------------------
/mlbox/encoding/na_encoder.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Author: Axel ARONIO DE ROMBLAY
3 | # License: BSD 3 clause
4 |
5 | import pandas as pd
6 | import warnings
7 |
8 | from sklearn.impute import SimpleImputer
9 |
10 |
11 | class NA_encoder():
12 | """Encodes missing values for both numerical and categorical features.
13 |
14 | Several strategies are possible in each case.
15 |
16 | Parameters
17 | ----------
18 |     numerical_strategy : str or float or int, default = "mean"
19 |         The strategy to encode NA for numerical features.
20 |         Available strategies = "mean", "median",
21 |         "most_frequent" or a float/int value
22 | 
23 |     categorical_strategy : str, default = ''
24 |         The strategy to encode NA for categorical features.
25 |         Available strategies = a constant string to fill NAs with, or "most_frequent"
26 |
27 | """
28 |
29 | def __init__(self,
30 | numerical_strategy='mean',
31 | categorical_strategy=''):
32 | """Init a NA_encoder.
33 |
34 | User can choose numerical strategy and categorical strategy.
35 |
36 | Parameters
37 | ----------
38 |         numerical_strategy : str or float or int, default = "mean"
39 | The strategy to encode NA for numerical features.
40 |
41 | categorical_strategy : str, default = ''
42 | The strategy to encode NA for categorical features.
43 |
44 | """
45 | self.numerical_strategy = numerical_strategy
46 | self.categorical_strategy = categorical_strategy
47 | self.__Lcat = []
48 | self.__Lnum = []
49 | self.__imp = None
50 | self.__mode = dict()
51 | self.__fitOK = False
52 |
53 | def get_params(self, deep=True):
54 | """Get parameters of a NA_encoder object."""
55 | return {'numerical_strategy': self.numerical_strategy,
56 | 'categorical_strategy': self.categorical_strategy}
57 |
58 | def set_params(self, **params):
59 | """Set parameters for a NA_encoder object.
60 |
61 | Set numerical strategy and categorical strategy.
62 |
63 | Parameters
64 | ----------
65 |         numerical_strategy : str or float or int, default = "mean"
66 | The strategy to encode NA for numerical features.
67 |
68 | categorical_strategy : str, default = ''
69 | The strategy to encode NA for categorical features.
70 |
71 | """
72 | self.__fitOK = False
73 |
74 | for k, v in params.items():
75 | if k not in self.get_params():
76 | warnings.warn("Invalid parameter(s) for encoder NA_encoder. "
77 | "Parameter(s) IGNORED. "
78 | "Check the list of available parameters with "
79 | "`encoder.get_params().keys()`")
80 | else:
81 | setattr(self, k, v)
82 |
83 | def fit(self, df_train, y_train=None):
84 | """Fits NA Encoder.
85 |
86 | Parameters
87 | ----------
88 | df_train : pandas dataframe of shape = (n_train, n_features)
89 | The train dataset with numerical and categorical features.
90 |
91 | y_train : pandas series of shape = (n_train, ), default = None
92 | The target for classification or regression tasks.
93 |
94 | Returns
95 | -------
96 | object
97 | self
98 |
99 | """
100 | self.__Lcat = df_train.dtypes[df_train.dtypes == 'object'].index
101 | self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index
102 |
103 | # Dealing with numerical features
104 |
105 | if (self.numerical_strategy in ['mean', 'median', "most_frequent"]):
106 |
107 | self.__imp = SimpleImputer(strategy=self.numerical_strategy)
108 |
109 | if (len(self.__Lnum) != 0):
110 | self.__imp.fit(df_train[self.__Lnum])
111 | else:
112 | pass
113 |
114 | elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)):
115 |
116 | pass
117 |
118 | else:
119 |
120 | raise ValueError("Numerical strategy for NA encoding is not valid")
121 |
122 | # Dealing with categorical features
123 |
124 | if (type(self.categorical_strategy) == str):
125 |
126 | if (self.categorical_strategy == "most_frequent"):
127 |
128 | na_count = df_train[self.__Lcat].isnull().sum()
129 |
130 | for col in na_count[na_count>0].index:
131 |
132 | try:
133 | self.__mode[col] = df_train[col].mode()[0]
134 |                     except Exception:
135 | self.__mode[col] = ""
136 |
137 | else:
138 | pass
139 |
140 | else:
141 | raise ValueError("Categorical strategy for NA encoding is not valid")
142 |
143 | self.__fitOK = True
144 |
145 | return self
146 |
147 | def fit_transform(self, df_train, y_train=None):
148 | """Fits NA Encoder and transforms the dataset.
149 |
150 | Parameters
151 | ----------
152 | df_train : pandas.Dataframe of shape = (n_train, n_features)
153 | The train dataset with numerical and categorical features.
154 |
155 | y_train : pandas.Series of shape = (n_train, ), default = None
156 | The target for classification or regression tasks.
157 |
158 | Returns
159 | -------
160 | pandas.Dataframe of shape = (n_train, n_features)
161 | The train dataset with no missing values.
162 |
163 | """
164 | self.fit(df_train, y_train)
165 |
166 | return self.transform(df_train)
167 |
168 | def transform(self, df):
169 | """Transform the dataset.
170 |
171 | Parameters
172 | ----------
173 | df : pandas.Dataframe of shape = (n, n_features)
174 | The dataset with numerical and categorical features.
175 |
176 | Returns
177 | -------
178 | pandas.Dataframe of shape = (n, n_features)
179 | The dataset with no missing values.
180 |
181 | """
182 | if(self.__fitOK):
183 |
184 | if(len(self.__Lnum) == 0):
185 |
186 | if (self.categorical_strategy != "most_frequent"):
187 | return df[self.__Lcat].fillna(self.categorical_strategy)
188 |
189 | else:
190 | return df[self.__Lcat].fillna(self.__mode)
191 |
192 | else:
193 |
194 | if (self.numerical_strategy in ['mean',
195 | 'median',
196 | "most_frequent"]):
197 |
198 | if (len(self.__Lcat) != 0):
199 |
200 | if (self.categorical_strategy != "most_frequent"):
201 |
202 | return pd.concat(
203 | (pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
204 | columns=self.__Lnum,
205 | index=df.index),
206 | df[self.__Lcat].fillna(self.categorical_strategy)
207 | ),
208 | axis=1)[df.columns]
209 |
210 | else:
211 |
212 | return pd.concat(
213 | (pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
214 | columns=self.__Lnum,
215 | index=df.index),
216 | df[self.__Lcat].fillna(self.__mode)
217 | ),
218 | axis=1)[df.columns]
219 |
220 | else:
221 |
222 | return pd.DataFrame(
223 | self.__imp.transform(df[self.__Lnum]),
224 | columns=self.__Lnum,
225 | index=df.index
226 | )
227 |
228 | elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)):
229 |
230 | if (len(self.__Lcat) != 0):
231 |
232 | if (self.categorical_strategy != "most_frequent"):
233 |
234 | return pd.concat(
235 | (df[self.__Lnum].fillna(self.numerical_strategy),
236 | df[self.__Lcat].fillna(self.categorical_strategy)
237 | ),
238 | axis=1)[df.columns]
239 |
240 | else:
241 |
242 | return pd.concat(
243 | (df[self.__Lnum].fillna(self.numerical_strategy),
244 | df[self.__Lcat].fillna(self.__mode)
245 | ),
246 | axis=1)[df.columns]
247 | else:
248 |
249 | return df[self.__Lnum].fillna(self.numerical_strategy)
250 |
251 | else:
252 |
253 | raise ValueError("Call fit or fit_transform function before")
254 |
--------------------------------------------------------------------------------
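A minimal sketch of NA_encoder on a toy frame, using only the API documented above (numerical "median" imputation plus categorical mode filling):

    import numpy as np
    import pandas as pd
    from mlbox.encoding import NA_encoder

    df = pd.DataFrame({"num": [1.0, np.nan, 3.0],
                       "cat": ["a", None, "a"]})
    enc = NA_encoder(numerical_strategy="median",
                     categorical_strategy="most_frequent")
    # "num" NaN -> median (2.0); "cat" None -> mode ("a")
    print(enc.fit_transform(df))
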
/mlbox/model/__init__.py:
--------------------------------------------------------------------------------
1 | from . import classification
2 | from . import regression
3 |
4 | __all__ = ['classification', 'regression']
5 |
--------------------------------------------------------------------------------
/mlbox/model/classification/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_selector import Clf_feature_selector
2 | from .classifier import Classifier
3 | from .stacking_classifier import StackingClassifier
4 |
5 | __all__ = ['Clf_feature_selector', 'Classifier', 'StackingClassifier']
6 |
--------------------------------------------------------------------------------
/mlbox/model/classification/feature_selector.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Author: Axel ARONIO DE ROMBLAY
3 | # License: BSD 3 clause
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.linear_model import LogisticRegression
8 | from sklearn.ensemble import RandomForestClassifier
9 | import warnings
10 |
11 |
12 | class Clf_feature_selector():
13 |
14 | """Selects useful features.
15 |
16 | Several strategies are possible (filter and wrapper methods).
17 | Works for classification problems only (multiclass or binary).
18 |
19 | Parameters
20 | ----------
21 |     strategy : str, default = "l1"
22 |         The strategy to select features.
23 |         Available strategies = {"variance", "l1", "rf_feature_importance"}
24 | 
25 |     threshold : float, default = 0.3
26 |         The percentage of variables to discard according to the strategy.
27 |         Must be between 0. and 1.
28 | """
29 |
30 | def __init__(self, strategy='l1', threshold=0.3):
31 |
32 | # 'variance','l1, 'rf_feature_importance'
33 | self.strategy = strategy
34 |         # a float between 0. and 1. default = 0.3, i.e. we drop 30% of the features
35 | self.threshold = threshold
36 | self.__fitOK = False
37 | self.__to_discard = []
38 |
39 |
40 | def get_params(self, deep=True):
41 |
42 | return {'strategy': self.strategy,
43 | 'threshold': self.threshold}
44 |
45 |
46 | def set_params(self, **params):
47 |
48 | self.__fitOK = False
49 |
50 | for k, v in params.items():
51 | if k not in self.get_params():
52 |                 warnings.warn("Invalid parameter(s) for feature selector "
53 |                               "Clf_feature_selector. Parameter(s) IGNORED. "
54 |                               "Check the list of available parameters with "
55 |                               "`feature_selector.get_params().keys()`")
56 | else:
57 | setattr(self, k, v)
58 |
59 |
60 | def fit(self, df_train, y_train):
61 |
62 | """Fits Clf_feature_selector
63 |
64 | Parameters
65 | ----------
66 | df_train : pandas dataframe of shape = (n_train, n_features)
67 | The train dataset with numerical features and no NA
68 |
69 | y_train : pandas series of shape = (n_train, )
70 | The target for classification task. Must be encoded.
71 |
72 | Returns
73 | -------
74 | object
75 | self
76 | """
77 |
78 | # sanity checks
79 | if((type(df_train) != pd.SparseDataFrame) and
80 | (type(df_train) != pd.DataFrame)):
81 | raise ValueError("df_train must be a DataFrame")
82 |
83 | if (type(y_train) != pd.core.series.Series):
84 | raise ValueError("y_train must be a Series")
85 |
86 | if(self.strategy == 'variance'):
87 | coef = df_train.std()
88 | abstract_threshold = np.percentile(coef, 100. * self.threshold)
89 | self.__to_discard = coef[coef < abstract_threshold].index
90 | self.__fitOK = True
91 |
92 | elif(self.strategy == 'l1'):
93 | model = LogisticRegression(C=0.01, penalty='l1', solver="saga",
94 | n_jobs=-1, random_state=0) # to be tuned
95 | model.fit(df_train, y_train)
96 | coef = np.mean(np.abs(model.coef_), axis=0)
97 | abstract_threshold = np.percentile(coef, 100. * self.threshold)
98 | self.__to_discard = df_train.columns[coef < abstract_threshold]
99 | self.__fitOK = True
100 |
101 | elif(self.strategy == 'rf_feature_importance'):
102 | model = RandomForestClassifier(n_estimators=50, n_jobs=-1,
103 | random_state=0) # to be tuned
104 | model.fit(df_train, y_train)
105 | coef = model.feature_importances_
106 | abstract_threshold = np.percentile(coef, 100. * self.threshold)
107 | self.__to_discard = df_train.columns[coef < abstract_threshold]
108 | self.__fitOK = True
109 |
110 | else:
111 | raise ValueError("Strategy invalid. Please choose between "
112 | "'variance', 'l1' or 'rf_feature_importance'")
113 |
114 | return self
115 |
116 |
117 | def transform(self, df):
118 |
119 | """Transforms the dataset
120 |
121 | Parameters
122 | ----------
123 | df : pandas dataframe of shape = (n, n_features)
124 | The dataset with numerical features and no NA
125 |
126 | Returns
127 | -------
128 |         pandas dataframe of shape = (n, n_features*(1-threshold))
129 |             The dataset with relevant features
130 | """
131 |
132 | if(self.__fitOK):
133 |
134 | # sanity checks
135 | if((type(df) != pd.SparseDataFrame) and
136 | (type(df) != pd.DataFrame)):
137 | raise ValueError("df must be a DataFrame")
138 |
139 | return df.drop(self.__to_discard, axis=1)
140 | else:
141 | raise ValueError("call fit or fit_transform function before")
142 |
143 |
144 | def fit_transform(self, df_train, y_train):
145 |
146 | """Fits Clf_feature_selector and transforms the dataset
147 |
148 | Parameters
149 | ----------
150 | df_train : pandas dataframe of shape = (n_train, n_features)
151 | The train dataset with numerical features and no NA
152 |
153 | y_train : pandas series of shape = (n_train, ).
154 | The target for classification task. Must be encoded.
155 |
156 | Returns
157 | -------
158 | pandas dataframe of shape = (n_train, n_features*(1-threshold))
159 | The train dataset with relevant features
160 | """
161 |
162 | self.fit(df_train, y_train)
163 |
164 | return self.transform(df_train)
165 |
--------------------------------------------------------------------------------
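A minimal sketch of Clf_feature_selector with the "variance" strategy on toy data; threshold=0.3 discards roughly the 30% of features with the lowest standard deviation (here the near-constant "flat" column):

    import numpy as np
    import pandas as pd
    from mlbox.model.classification import Clf_feature_selector

    rng = np.random.RandomState(0)
    X = pd.DataFrame({"noisy": rng.randn(100) * 5.0,
                      "flat": 1.0 + rng.randn(100) * 0.01,
                      "mid": rng.randn(100)})
    y = pd.Series(rng.randint(0, 2, 100))

    fs = Clf_feature_selector(strategy="variance", threshold=0.3)
    print(fs.fit_transform(X, y).columns.tolist())  # ['noisy', 'mid']
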
/mlbox/model/regression/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_selector import Reg_feature_selector
2 | from .regressor import Regressor
3 | from .stacking_regressor import StackingRegressor
4 |
5 | __all__ = ['Reg_feature_selector', 'Regressor', 'StackingRegressor']
6 |
--------------------------------------------------------------------------------
/mlbox/model/regression/feature_selector.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Author: Axel ARONIO DE ROMBLAY
3 | # License: BSD 3 clause
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.linear_model import Lasso
8 | from sklearn.ensemble import RandomForestRegressor
9 | import warnings
10 |
11 |
12 | class Reg_feature_selector():
13 |
14 | """Selects useful features.
15 |
16 | Several strategies are possible (filter and wrapper methods).
17 | Works for regression problems only.
18 |
19 | Parameters
20 | ----------
21 |     strategy : str, default = "l1"
22 |         The strategy to select features.
23 |         Available strategies = {"variance", "l1", "rf_feature_importance"}
24 | 
25 |     threshold : float, default = 0.3
26 |         The percentage of variables to discard according to the strategy.
27 |         Must be between 0. and 1.
28 | """
29 |
30 | def __init__(self, strategy='l1', threshold=0.3):
31 | self.strategy = strategy
32 | self.threshold = threshold
33 | self.__fitOK = False
34 | self.__to_discard = []
35 |
36 |
37 | def get_params(self, deep=True):
38 | return {'strategy': self.strategy,
39 | 'threshold': self.threshold}
40 |
41 |
42 | def set_params(self, **params):
43 | self.__fitOK = False
44 |
45 | for k, v in params.items():
46 | if k not in self.get_params():
47 |                 warnings.warn("Invalid parameter(s) for feature selector "
48 |                               "Reg_feature_selector. Parameter(s) IGNORED. "
49 |                               "Check the list of available parameters with "
50 |                               "`feature_selector.get_params().keys()`")
51 | else:
52 | setattr(self, k, v)
53 |
54 |
55 | def fit(self, df_train, y_train):
56 |
57 | """Fits Reg_feature_selector.
58 |
59 | Parameters
60 | ----------
61 | df_train : pandas dataframe of shape = (n_train, n_features)
62 | The train dataset with numerical features and no NA
63 |
64 | y_train : pandas series of shape = (n_train, ).
65 | The target for regression task.
66 |
67 | Returns
68 | -------
69 |         object
70 | self
71 | """
72 |
73 | # sanity checks
74 | if((type(df_train) != pd.SparseDataFrame) and
75 | (type(df_train) != pd.DataFrame)):
76 | raise ValueError("df_train must be a DataFrame")
77 |
78 | if (type(y_train) != pd.core.series.Series):
79 | raise ValueError("y_train must be a Series")
80 |
81 | if(self.strategy == 'variance'):
82 | coef = df_train.std()
83 | abstract_threshold = np.percentile(coef, 100. * self.threshold)
84 | self.__to_discard = coef[coef < abstract_threshold].index
85 | self.__fitOK = True
86 |
87 | elif(self.strategy == 'l1'):
88 | model = Lasso(alpha=100.0, random_state=0) # to be tuned
89 | model.fit(df_train, y_train)
90 | coef = np.abs(model.coef_)
91 | abstract_threshold = np.percentile(coef, 100. * self.threshold)
92 | self.__to_discard = df_train.columns[coef < abstract_threshold]
93 | self.__fitOK = True
94 |
95 | elif(self.strategy == 'rf_feature_importance'):
96 | model = RandomForestRegressor(n_estimators=50,
97 | n_jobs=-1,
98 | random_state=0) # to be tuned
99 | model.fit(df_train, y_train)
100 | coef = model.feature_importances_
101 | abstract_threshold = np.percentile(coef, 100. * self.threshold)
102 | self.__to_discard = df_train.columns[coef < abstract_threshold]
103 | self.__fitOK = True
104 |
105 | else:
106 | raise ValueError("Strategy invalid. Please choose between "
107 | "'variance', 'l1' or 'rf_feature_importance'")
108 |
109 | return self
110 |
111 |
112 | def transform(self, df):
113 |
114 | """Transforms the dataset
115 |
116 | Parameters
117 | ----------
118 | df : pandas dataframe of shape = (n, n_features)
119 | The dataset with numerical features and no NA
120 |
121 | Returns
122 | -------
123 |         pandas dataframe of shape = (n, n_features*(1-threshold))
124 |             The dataset with relevant features
125 | """
126 |
127 | if(self.__fitOK):
128 |
129 | # sanity checks
130 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
131 | raise ValueError("df must be a DataFrame")
132 |
133 | return df.drop(self.__to_discard, axis=1)
134 | else:
135 | raise ValueError("call fit or fit_transform function before")
136 |
137 |
138 | def fit_transform(self, df_train, y_train):
139 |
140 | """Fits Reg_feature_selector and transforms the dataset
141 |
142 | Parameters
143 | ----------
144 | df_train : pandas dataframe of shape = (n_train, n_features)
145 | The train dataset with numerical features and no NA
146 |
147 | y_train : pandas series of shape = (n_train, ).
148 | The target for regression task.
149 |
150 | Returns
151 | -------
152 | pandas dataframe of shape = (n_train, n_features*(1-threshold))
153 | The train dataset with relevant features
154 | """
155 |
156 | self.fit(df_train, y_train)
157 |
158 | return self.transform(df_train)
159 |
--------------------------------------------------------------------------------
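Both selectors above share the same percentile-threshold step: coefficients (or importances) below the `100 * threshold` percentile are discarded. The step in isolation, on made-up coefficients:

    import numpy as np

    coef = np.array([0.50, 0.05, 0.20, 0.01, 0.90])  # e.g. |lasso coef| values
    threshold = 0.3
    abstract_threshold = np.percentile(coef, 100.0 * threshold)  # ~0.08 here
    print(coef < abstract_threshold)  # the two smallest are flagged for discard
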
/mlbox/model/regression/regressor.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # License: BSD 3 clause
5 |
6 | import warnings
7 | from copy import copy
8 |
9 | import numpy as np
10 | import pandas as pd
11 | from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor,
12 | ExtraTreesRegressor, RandomForestRegressor)
13 | from sklearn.linear_model import Ridge
14 | from sklearn.tree import DecisionTreeRegressor
15 | from lightgbm import LGBMRegressor
16 |
17 |
18 | class Regressor():
19 |     """Wraps scikit-learn regressors.
20 |
21 | Parameters
22 | ----------
23 | strategy : str, default = "LightGBM"
24 | The choice for the regressor.
25 | Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
26 | "Tree", "Bagging", "AdaBoost" or "Linear"}
27 |
28 | **params : default = None
29 | Parameters of the corresponding regressor.
30 | Examples : n_estimators, max_depth...
31 |
32 | """
33 |
34 | def __init__(self, **params):
35 | """Init Regressor object where user can pass a strategy."""
36 | if ("strategy" in params):
37 | self.__strategy = params["strategy"]
38 | else:
39 | self.__strategy = "LightGBM"
40 |
41 | self.__regress_params = {}
42 |
43 | self.__regressor = None
44 | self.__set_regressor(self.__strategy)
45 | self.__col = None
46 |
47 | self.set_params(**params)
48 | self.__fitOK = False
49 |
50 | def get_params(self, deep=True):
51 | """Get parameters of Regressor object."""
52 | params = {}
53 | params["strategy"] = self.__strategy
54 | params.update(self.__regress_params)
55 |
56 | return params
57 |
58 | def set_params(self, **params):
59 | """Set parameters of Regressor object."""
60 | self.__fitOK = False
61 |
62 | if 'strategy' in params.keys():
63 | self.__set_regressor(params['strategy'])
64 |
65 | for k, v in self.__regress_params.items():
66 | if k not in self.get_params().keys():
67 | warnings.warn("Invalid parameter for regressor "
68 | + str(self.__strategy)
69 | + ". Parameter IGNORED. Check the list of "
70 | "available parameters with "
71 | "`regressor.get_params().keys()`")
72 | else:
73 | setattr(self.__regressor, k, v)
74 |
75 | for k, v in params.items():
76 | if(k == "strategy"):
77 | pass
78 | else:
79 | if k not in self.__regressor.get_params().keys():
80 | warnings.warn("Invalid parameter for regressor "
81 | + str(self.__strategy)
82 | + ". Parameter IGNORED. Check the list of "
83 | "available parameters with "
84 | "`regressor.get_params().keys()`")
85 | else:
86 | setattr(self.__regressor, k, v)
87 | self.__regress_params[k] = v
88 |
89 | def __set_regressor(self, strategy):
90 | """Set strategy of a regressor object."""
91 | self.__strategy = strategy
92 |
93 | if(strategy == 'RandomForest'):
94 | self.__regressor = RandomForestRegressor(
95 | n_estimators=400, max_depth=10, max_features='sqrt',
96 | bootstrap=True, n_jobs=-1, random_state=0)
97 |
98 | elif(strategy == "LightGBM"):
99 | self.__regressor = LGBMRegressor(
100 | n_estimators=500, learning_rate=0.05,
101 | colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0)
102 |
103 | elif(strategy == 'ExtraTrees'):
104 | self.__regressor = ExtraTreesRegressor(
105 | n_estimators=400, max_depth=10, max_features='sqrt',
106 | bootstrap=True, n_jobs=-1, random_state=0)
107 |
108 | elif(strategy == 'Tree'):
109 | self.__regressor = DecisionTreeRegressor(
110 | criterion='mse', splitter='best', max_depth=None,
111 | min_samples_split=2, min_samples_leaf=1,
112 | min_weight_fraction_leaf=0.0, max_features=None,
113 | random_state=0, max_leaf_nodes=None, presort=False)
114 |
115 | elif(strategy == "Bagging"):
116 | self.__regressor = BaggingRegressor(
117 | base_estimator=None, n_estimators=500, max_samples=.9,
118 | max_features=.85, bootstrap=False, bootstrap_features=False,
119 | n_jobs=-1, random_state=0)
120 |
121 | elif(strategy == "AdaBoost"):
122 | self.__regressor = AdaBoostRegressor(
123 | base_estimator=None, n_estimators=400, learning_rate=.05,
124 | random_state=0)
125 |
126 | elif(strategy == "Linear"):
127 | self.__regressor = Ridge(
128 | alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
129 | max_iter=None, tol=0.001, solver='auto', random_state=0)
130 |
131 | else:
132 | raise ValueError(
133 | "Strategy invalid. Please choose between 'LightGBM'"
134 | ", 'RandomForest', 'ExtraTrees', "
135 | "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")
136 |
137 | def fit(self, df_train, y_train):
138 | """Fits Regressor.
139 |
140 | Parameters
141 | ----------
142 | df_train : pandas dataframe of shape = (n_train, n_features)
143 | The train dataset with numerical features.
144 |
145 | y_train : pandas series of shape = (n_train, )
146 | The target for regression tasks.
147 |
148 | Returns
149 | -------
150 | object
151 | self
152 |
153 | """
154 | # sanity checks
155 | if((type(df_train) != pd.SparseDataFrame) and
156 | (type(df_train) != pd.DataFrame)):
157 | raise ValueError("df_train must be a DataFrame")
158 |
159 | if (type(y_train) != pd.core.series.Series):
160 | raise ValueError("y_train must be a Series")
161 |
162 | self.__regressor.fit(df_train.values, y_train)
163 | self.__col = df_train.columns
164 | self.__fitOK = True
165 |
166 | return self
167 |
168 | def feature_importances(self):
169 | """Computes feature importances.
170 |
171 | Regressor must be fitted before.
172 |
173 | Returns
174 | -------
175 | dict
176 |             Dictionary containing a measure of feature importance (value)
177 | for each feature (key).
178 |
179 | """
180 | if self.__fitOK:
181 |
182 | if (self.get_params()["strategy"] in ["Linear"]):
183 |
184 | importance = {}
185 | f = np.abs(self.get_estimator().coef_)
186 |
187 | for i, col in enumerate(self.__col):
188 | importance[col] = f[i]
189 |
190 | elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest",
191 | "ExtraTrees", "Tree"]):
192 |
193 | importance = {}
194 | f = self.get_estimator().feature_importances_
195 |
196 | for i, col in enumerate(self.__col):
197 | importance[col] = f[i]
198 |
199 | elif (self.get_params()["strategy"] in ["AdaBoost"]):
200 |
201 | importance = {}
202 | norm = self.get_estimator().estimator_weights_.sum()
203 |
204 | try:
205 | # LGB, RF, ET, Tree and AdaBoost
206 | # TODO: Refactor this part
207 | f = sum(weight * est.feature_importances_ for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa
208 |
209 | except Exception:
210 | f = sum(weight * np.abs(est.coef_) for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa
211 |
212 | for i, col in enumerate(self.__col):
213 | importance[col] = f[i]
214 |
215 | elif (self.get_params()["strategy"] in ["Bagging"]):
216 |
217 | importance = {}
218 | importance_bag = []
219 |
220 | for i, b in enumerate(self.get_estimator().estimators_):
221 |
222 | d = {}
223 |
224 | try:
225 | # LGB, RF, ET, Tree and AdaBoost
226 | f = b.feature_importances_
227 | except Exception:
228 | f = np.abs(b.coef_) # Linear
229 |
230 | estimator = self.get_estimator()
231 | items = enumerate(estimator.estimators_features_[i])
232 | for j, c in items:
233 | d[self.__col[c]] = f[j]
234 |
235 | importance_bag.append(d.copy())
236 |
237 | for i, col in enumerate(self.__col):
238 | list_filtered = filter(lambda x: x != 0,
239 | [k[col] if col in k else 0
240 | for k in importance_bag])
241 | importance[col] = np.mean(list(list_filtered))
242 |
243 | else:
244 |
245 | importance = {}
246 |
247 | return importance
248 |
249 | else:
250 |
251 | raise ValueError("You must call the fit function before !")
252 |
253 | def predict(self, df):
254 | """Predicts the target.
255 |
256 | Parameters
257 | ----------
258 | df : pandas dataframe of shape = (n, n_features)
259 | The dataset with numerical features.
260 |
261 | Returns
262 | -------
263 | array of shape = (n, )
264 | The target to be predicted.
265 |
266 | """
267 | try:
268 | if not callable(getattr(self.__regressor, "predict")):
269 | raise ValueError("predict attribute is not callable")
270 | except Exception as e:
271 | raise e
272 |
273 | if self.__fitOK:
274 |
275 | # sanity checks
276 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
277 | raise ValueError("df must be a DataFrame")
278 |
279 | return self.__regressor.predict(df.values)
280 |
281 | else:
282 | raise ValueError("You must call the fit function before !")
283 |
284 | def transform(self, df):
285 | """Transform dataframe df.
286 |
287 | Parameters
288 | ----------
289 | df : pandas dataframe of shape = (n, n_features)
290 | The dataset with numerical features.
291 |
292 | Returns
293 | -------
294 | pandas dataframe of shape = (n, n_selected_features)
295 | The transformed dataset with its most important features.
296 |
297 | """
298 | try:
299 | if not callable(getattr(self.__regressor, "transform")):
300 | raise ValueError("transform attribute is not callable")
301 | except Exception as e:
302 | raise e
303 |
304 | if self.__fitOK:
305 |
306 | # sanity checks
307 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
308 | raise ValueError("df must be a DataFrame")
309 |
310 | return self.__regressor.transform(df.values)
311 | else:
312 | raise ValueError("You must call the fit function before !")
313 |
314 | def score(self, df, y, sample_weight=None):
315 | """Return R^2 coefficient of determination of the prediction.
316 |
317 | Parameters
318 | ----------
319 | df : pandas dataframe of shape = (n, n_features)
320 | The dataset with numerical features.
321 |
322 | y : pandas series of shape = (n,)
323 |             The target for regression tasks.
324 |
325 | Returns
326 | -------
327 | float
328 | R^2 of self.predict(df) wrt. y.
329 |
330 | """
331 | try:
332 | if not callable(getattr(self.__regressor, "score")):
333 | raise ValueError("score attribute is not callable")
334 | except Exception as e:
335 | raise e
336 |
337 | if self.__fitOK:
338 |
339 | # sanity checks
340 | if((type(df) != pd.SparseDataFrame) and
341 | (type(df) != pd.DataFrame)):
342 | raise ValueError("df must be a DataFrame")
343 |
344 | if (type(y) != pd.core.series.Series):
345 | raise ValueError("y must be a Series")
346 |
347 | return self.__regressor.score(df.values, y, sample_weight)
348 | else:
349 | raise ValueError("You must call the fit function before !")
350 |
351 | def get_estimator(self):
352 |         """Return a copy of the underlying regressor."""
353 | return copy(self.__regressor)
354 |
--------------------------------------------------------------------------------
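A minimal sketch of the Regressor wrapper (documented API only): fit on a toy linear target, then read back per-feature importances and the R^2 score:

    import numpy as np
    import pandas as pd
    from mlbox.model.regression import Regressor

    rng = np.random.RandomState(0)
    X = pd.DataFrame({"x1": rng.randn(200), "x2": rng.randn(200)})
    y = pd.Series(3.0 * X["x1"] + 0.1 * rng.randn(200))

    reg = Regressor(strategy="Linear")
    reg.fit(X, y)
    print(reg.feature_importances())  # {'x1': ~3.0, 'x2': ~0.0} (|coef| values)
    print(reg.score(X, y))            # R^2 close to 1 on this toy fit
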
/mlbox/model/regression/stacking_regressor.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Author: Axel ARONIO DE ROMBLAY
3 | # License: BSD 3 clause
4 |
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from sklearn.linear_model import LinearRegression
9 | from sklearn.model_selection import KFold, cross_val_predict
10 | from copy import copy as make_copy
11 | from .regressor import Regressor
12 | import warnings
13 |
14 |
15 | class StackingRegressor():
16 | """A Stacking regressor.
17 |
18 | A stacking regressor is a regressor that uses the predictions of
19 | several first layer estimators (generated with a cross validation method)
20 | for a second layer estimator.
21 |
22 |
23 | Parameters
24 | ----------
25 | base_estimators : list, default = [Regressor(strategy="LightGBM"),
26 | Regressor(strategy="RandomForest"),
27 | Regressor(strategy="ExtraTrees")]
28 | List of estimators to fit in the first level using a cross validation.
29 |
30 | level_estimator : object, default = LinearRegression()
31 | The estimator used in second and last level
32 |
33 | n_folds : int, default = 5
34 | Number of folds used to generate the meta features for the training set
35 |
36 | copy : bool, default = False
37 | If true, meta features are added to the original dataset
38 |
39 |     random_state : None, int or RandomState, default = 1
40 | Pseudo-random number generator state used for shuffling.
41 | If None, use default numpy RNG for shuffling.
42 |
43 | verbose : bool, default = True
44 | Verbose mode.
45 |
46 | """
47 |
48 | def __init__(self, base_estimators=[Regressor(strategy="LightGBM"),
49 | Regressor(strategy="RandomForest"),
50 | Regressor(strategy="ExtraTrees")],
51 | level_estimator=LinearRegression(), n_folds=5,
52 | copy=False, random_state=1, verbose=True):
53 | """Init method for StackingRegressor."""
54 | self.base_estimators = base_estimators
55 | if(type(base_estimators) != list):
56 | raise ValueError("base_estimators must be a list")
57 | else:
58 | for i, est in enumerate(self.base_estimators):
59 | self.base_estimators[i] = make_copy(est)
60 |
61 | self.level_estimator = level_estimator
62 |
63 | self.n_folds = n_folds
64 | if(type(n_folds) != int):
65 | raise ValueError("n_folds must be an integer")
66 |
67 | self.copy = copy
68 | if(type(copy) != bool):
69 | raise ValueError("copy must be a boolean")
70 |
71 | self.random_state = random_state
72 | if((type(self.random_state) != int)
73 | and (self.random_state is not None)):
74 | raise ValueError("random_state must be either None or an integer")
75 |
76 | self.verbose = verbose
77 | if(type(self.verbose) != bool):
78 | raise ValueError("verbose must be a boolean")
79 |
80 | self.__fitOK = False
81 | self.__fittransformOK = False
82 |
83 | def get_params(self, deep=True):
84 | """Get parameters of a StackingRegressor object."""
85 | return {'level_estimator': self.level_estimator,
86 | 'base_estimators': self.base_estimators,
87 | 'n_folds': self.n_folds,
88 | 'copy': self.copy,
89 | 'random_state': self.random_state,
90 | 'verbose': self.verbose}
91 |
92 | def set_params(self, **params):
93 | """Set parameters of a StackingRegressor object."""
94 | self.__fitOK = False
95 | self.__fittransformOK = False
96 |
97 | for k, v in params.items():
98 | if k not in self.get_params():
99 |                 warnings.warn("Invalid parameter(s) for stacking regressor "
100 |                               "StackingRegressor. Parameter(s) IGNORED. Check the"
101 | " list of available parameters with "
102 | "`stacking_regressor.get_params().keys()`")
103 | else:
104 | setattr(self, k, v)
105 |
106 | def fit_transform(self, df_train, y_train):
107 | """Create meta-features for the training dataset.
108 |
109 | Parameters
110 | ----------
111 | df_train : pandas DataFrame of shape = (n_samples, n_features)
112 | The training dataset.
113 |
114 | y_train : pandas series of shape = (n_samples, )
115 | The target
116 |
117 | Returns
118 | -------
119 | pandas DataFrame of shape = (n_samples,
120 | n_features*int(copy)+n_metafeatures)
121 | The transformed training dataset.
122 |
123 | """
124 | # sanity checks
125 | if((type(df_train) != pd.SparseDataFrame) & (type(df_train) != pd.DataFrame)):
126 | raise ValueError("df_train must be a DataFrame")
127 |
128 | if(type(y_train) != pd.core.series.Series):
129 | raise ValueError("y_train must be a Series")
130 |
131 | cv = KFold(n_splits=self.n_folds, shuffle=True,
132 | random_state=self.random_state)
133 |
134 | preds = pd.DataFrame([], index=y_train.index)
135 |
136 | if(self.verbose):
137 | print("")
138 | print("[=========================================================="
139 | "===================] LAYER [==============================="
140 | "====================================================]")
141 | print("")
142 |
143 | for c, reg in enumerate(self.base_estimators):
144 |
145 | if(self.verbose):
146 | print("> fitting estimator n°" + str(c + 1) +
147 | " : " + str(reg.get_params()) + " ...")
148 | print("")
149 |
150 | # for each base estimator, we create the meta feature on train set
151 | y_pred = cross_val_predict(estimator=reg, X=df_train, y=y_train, cv=cv)
152 | preds["est" + str(c + 1)] = y_pred
153 |
154 | # and we refit the base estimator on entire train set
155 | reg.fit(df_train, y_train)
156 |
157 | layer = 1
158 | columns = ["layer" + str(layer) + "_" + s for s in preds.columns]
159 | while(len(np.intersect1d(df_train.columns, columns)) > 0):
160 | layer = layer + 1
161 | columns = ["layer" + str(layer) + "_" + s for s in preds.columns]
162 | preds.columns = ["layer" + str(layer) + "_" + s for s in preds.columns]
163 |
164 | self.__fittransformOK = True
165 |
166 | if(self.copy):
167 | # we keep also the initial features
168 | return pd.concat([df_train, preds], axis=1)
169 |
170 | else:
171 | return preds # we keep only the meta features
172 |
173 | def transform(self, df_test):
174 | """Create meta-features for the test dataset.
175 |
176 | Parameters
177 | ----------
178 | df_test : pandas DataFrame of shape = (n_samples_test, n_features)
179 | The test dataset.
180 |
181 | Returns
182 | -------
183 | pandas DataFrame of shape = (n_samples_test,
184 | n_features*int(copy)+n_metafeatures)
185 | The transformed test dataset.
186 |
187 | """
188 | # sanity checks
189 | if((type(df_test) != pd.SparseDataFrame) and
190 | (type(df_test) != pd.DataFrame)):
191 | raise ValueError("df_test must be a DataFrame")
192 |
193 | if(self.__fittransformOK):
194 |
195 | preds_test = pd.DataFrame([], index=df_test.index)
196 |
197 | for c, reg in enumerate(self.base_estimators):
198 |
199 | # we predict the meta feature on test set
200 | y_pred_test = reg.predict(df_test)
201 | preds_test["est" + str(c + 1)] = y_pred_test
202 |
203 | layer = 1
204 | columns = ["layer" + str(layer) + "_" + s
205 | for s in preds_test.columns]
206 |
207 | while(len(np.intersect1d(df_test.columns, columns)) > 0):
208 | layer = layer + 1
209 | columns = ["layer" + str(layer) + "_" + s
210 | for s in preds_test.columns]
211 |
212 | preds_test.columns = [
213 | "layer" + str(layer) + "_" + s for s in preds_test.columns]
214 |
215 | if(self.copy):
216 | # we keep also the initial features
217 | return pd.concat([df_test, preds_test], axis=1)
218 | else:
219 | return preds_test # we keep only the meta features
220 |
221 | else:
222 | raise ValueError("Call fit_transform before !")
223 |
224 | def fit(self, df_train, y_train):
225 | """Fit the first level estimators and the second level estimator on X.
226 |
227 | Parameters
228 | ----------
229 | df_train : pandas DataFrame of shape (n_samples, n_features)
230 | Input data
231 |
232 | y_train : pandas series of shape = (n_samples, )
233 | The target
234 |
235 | Returns
236 | -------
237 | object
238 | self
239 |
240 | """
241 | # Fit the base estimators
242 | df_train = self.fit_transform(df_train, y_train)
243 |
244 | if(self.verbose):
245 | print("")
246 | print("[=========================================================="
247 | "===============] PREDICTION LAYER [========================"
248 | "====================================================]")
249 | print("")
250 | print("> fitting estimator : " +
251 | str(self.level_estimator.get_params()) + " ...")
252 | print("")
253 |
254 | # we fit the second level estimator
255 | self.level_estimator.fit(df_train.values, y_train.values)
256 |
257 | self.__fitOK = True
258 |
259 | return self
260 |
261 |
262 | def predict(self, df_test):
263 | """Predict regression target for X_test using the meta-features.
264 |
265 | Parameters
266 | ----------
267 | df_test : pandas DataFrame of shape = (n_samples_test, n_features)
268 | The testing samples
269 |
270 | Returns
271 | -------
272 | array of shape = (n_samples_test, )
273 | The predicted values.
274 |
275 | """
276 | if(self.__fitOK):
277 | # we predict the meta features on test set
278 | df_test = self.transform(df_test)
279 |
280 | # we predict the target using the meta features
281 | return self.level_estimator.predict(df_test)
282 |
283 | else:
284 | raise ValueError("Call fit before !")
285 |
--------------------------------------------------------------------------------
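A minimal sketch of StackingRegressor on toy data (documented API only): two first-layer Regressor wrappers produce out-of-fold meta-features, and the default LinearRegression level estimator combines them:

    import numpy as np
    import pandas as pd
    from mlbox.model.regression import Regressor, StackingRegressor

    rng = np.random.RandomState(1)
    X = pd.DataFrame({"x1": rng.randn(300), "x2": rng.randn(300)})
    y = pd.Series(X["x1"] ** 2 + X["x2"] + 0.1 * rng.randn(300))

    stk = StackingRegressor(base_estimators=[Regressor(strategy="Linear"),
                                             Regressor(strategy="Tree")],
                            n_folds=3, verbose=False)
    stk.fit(X, y)              # meta-features via 3-fold CV, then level estimator
    print(stk.predict(X)[:5])  # predictions from the two-layer stack
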
/mlbox/optimisation/__init__.py:
--------------------------------------------------------------------------------
1 | from .optimiser import *
2 |
--------------------------------------------------------------------------------
/mlbox/prediction/__init__.py:
--------------------------------------------------------------------------------
1 | from .predictor import *
2 |
3 |
--------------------------------------------------------------------------------
/mlbox/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from .drift_thresholder import *
2 | from .reader import *
3 |
4 |
--------------------------------------------------------------------------------
/mlbox/preprocessing/drift/__init__.py:
--------------------------------------------------------------------------------
1 | from .drift_estimator import *
2 | from .drift_threshold import *
3 |
--------------------------------------------------------------------------------
/mlbox/preprocessing/drift/drift_estimator.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Authors: Axel ARONIO DE ROMBLAY
3 | # Alexis BONDU
4 | # License: BSD 3 clause
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from sklearn.ensemble import RandomForestClassifier
9 | from sklearn.metrics import roc_auc_score
10 | from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict
11 |
12 | class DriftEstimator():
13 |
14 | """Estimates the drift between two datasets
15 |
16 |
17 | Parameters
18 | ----------
19 |     estimator : classifier, default = RandomForestClassifier(n_estimators = 50, n_jobs=-1, max_features=1., min_samples_leaf = 5, max_depth = 5)
20 |         The estimator that estimates the drift between two datasets
21 | 
22 |     n_folds : int, default = 2
23 |         Number of folds used to estimate the drift
24 | 
25 |     stratify : bool, default = True
26 |         Whether the cv is stratified (same number of train and test samples within each fold)
27 | 
28 |     random_state : int, default = 1
29 |         Random state for cv
30 | """
31 |
32 | def __init__(self,
33 | estimator=RandomForestClassifier(n_estimators=50,
34 | n_jobs=-1,
35 | max_features=1.,
36 | min_samples_leaf=5,
37 | max_depth=5),
38 | n_folds=2,
39 | stratify=True,
40 | random_state=1):
41 |
42 | self.estimator = estimator
43 | self.n_folds = n_folds
44 | self.stratify = stratify
45 | self.random_state = random_state
46 | self.__cv = None
47 | self.__pred = None
48 | self.__target = None
49 | self.__fitOK = False
50 |
51 | def get_params(self):
52 |
53 | return {'estimator': self.estimator,
54 | 'n_folds': self.n_folds,
55 | 'stratify': self.stratify,
56 | 'random_state': self.random_state}
57 |
58 | def set_params(self, **params):
59 |
60 | if('estimator' in params.keys()):
61 | self.estimator = params['estimator']
62 | if('n_folds' in params.keys()):
63 | self.n_folds = params['n_folds']
64 | if('stratify' in params.keys()):
65 | self.stratify = params['stratify']
66 | if('random_state' in params.keys()):
67 | self.random_state = params['random_state']
68 |
69 | def fit(self, df_train, df_test):
70 |
71 | """
72 | Computes the drift between the two datasets
73 |
74 | Parameters
75 | ----------
76 | df_train : pandas dataframe of shape = (n_train, p)
77 | The train set
78 |
79 | df_test : pandas dataframe of shape = (n_test, p)
80 | The test set
81 |
82 | Returns
83 | -------
84 | self : object
85 | Returns self.
86 | """
87 |
88 | df_train["target"] = 0
89 | df_test["target"] = 1
90 |
91 | self.__target = pd.concat((df_train.target, df_test.target),
92 | ignore_index=True)
93 |
94 | if self.stratify:
95 | self.__cv = StratifiedKFold(n_splits=self.n_folds,
96 | shuffle=True,
97 | random_state=self.random_state)
98 | else:
99 | self.__cv = KFold(n_splits=self.n_folds,
100 | shuffle=True,
101 | random_state=self.random_state)
102 |
103 | X_tmp = pd.concat((df_train, df_test),
104 | ignore_index=True).drop(['target'], axis=1)
105 |
106 | self.__pred = cross_val_predict(estimator=self.estimator,
107 | X=X_tmp,
108 | y=self.__target,
109 | cv=self.__cv,
110 | method="predict_proba")[:,1]
111 |
112 | del df_train["target"]
113 | del df_test["target"]
114 |
115 | self.__fitOK = True
116 |
117 | return self
118 |
119 | def score(self):
120 |
121 | """Returns the global drift measure between two datasets.
122 |
123 | 0. = No drift. 1. = Maximal Drift
124 |
125 | Returns
126 | -------
127 | float
128 | The drift measure
129 | """
130 |
131 | S = []
132 |
133 | if self.__fitOK:
134 |
135 | X_zeros = np.zeros(len(self.__target))
136 |
137 | for train_index, test_index in self.__cv.split(X=X_zeros,
138 | y=self.__target):
139 |
140 | S.append(roc_auc_score(self.__target.iloc[test_index],
141 | self.__pred[test_index]))
142 |
143 | return (max(np.mean(S), 1-np.mean(S))-0.5) * 2
144 |
145 | else:
146 | raise ValueError('Call the fit function before !')
147 |
148 | def predict(self):
149 |
150 |         """Returns the probabilities that each sample belongs to the test dataset
151 |
152 | Returns
153 | -------
154 | Array of shape = (n_train+n_test,)
155 | The probabilities
156 | """
157 |
158 | if self.__fitOK:
159 |
160 | return self.__pred
161 |
162 | else:
163 | raise ValueError('Call the fit function before !')
164 |
--------------------------------------------------------------------------------
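A minimal sketch of DriftEstimator: the score is near 0 when train and test samples come from the same distribution, and approaches 1 under a strong covariate shift (the classifier can then separate the two samples):

    import numpy as np
    import pandas as pd
    from mlbox.preprocessing.drift import DriftEstimator

    rng = np.random.RandomState(0)
    train = pd.DataFrame({"x": rng.randn(500)})
    same = pd.DataFrame({"x": rng.randn(500)})
    shifted = pd.DataFrame({"x": rng.randn(500) + 5.0})

    print(DriftEstimator().fit(train, same).score())     # close to 0
    print(DriftEstimator().fit(train, shifted).score())  # close to 1
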
/mlbox/preprocessing/drift/drift_threshold.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Authors: Axel ARONIO DE ROMBLAY
3 | # Alexis BONDU
4 | # License: BSD 3 clause
5 | import sys
6 |
7 | from joblib import Parallel, delayed
8 | from sklearn.tree import DecisionTreeClassifier
9 |
10 | from .drift_estimator import DriftEstimator
11 |
12 |
13 | def sync_fit(df_train, df_test, estimator, n_folds=2, stratify=True, random_state=1):
14 | """Compute the univariate drifts between df_train and df_test datasets.
15 |
16 |     Worker function for the parallel, per-column drift computation.
17 |
18 | Parameters
19 | ----------
20 | df_train : pandas dataframe of shape = (n_train, p)
21 | The train set
22 |
23 | df_test : pandas dataframe of shape = (n_test, p)
24 | The test set
25 |
26 |     estimator : classifier, default = RandomForestClassifier(n_estimators = 50,
27 | n_jobs=-1,
28 | max_features=1.,
29 | min_samples_leaf = 5,
30 | max_depth = 5)
31 | The estimator that estimates the drift between two datasets
32 |
33 | n_folds : int, default = 2
34 | Number of folds used to estimate the drift
35 |
36 | stratify : bool, default = True
37 | Whether the cv is stratified (same number of train and test samples
38 | within each fold)
39 |
40 | random_state : int, default = 1
41 | Random state for cv
42 |
43 | Returns
44 | -------
45 | float
46 | drift measure
47 |
48 | """
49 | # We will compute the indices of the CV in each thread
50 | de = DriftEstimator(estimator, n_folds, stratify, random_state)
51 | de.fit(df_train, df_test)
52 |
53 | return de.score()
54 |
55 |
56 | class DriftThreshold():
57 |     """Select features with low univariate drift between two datasets.
58 | 
59 |     Estimates the univariate drift of each feature between two datasets
60 |     and keeps only the features whose drift is below the threshold.
61 |
62 | Parameters
63 | ----------
64 | threshold : float, default = 0.6
65 | The drift threshold (features whose univariate drift is below it are kept).
66 | Must be between 0. and 1.
67 |
68 | subsample : float, default = 1.
69 | Subsampling parameter for the datasets.
70 | Must be between 0. and 1.
71 |
72 | estimator : classifier, default = DecisionTreeClassifier(max_depth=6)
73 | The estimator that estimates the drift between two datasets.
74 |
75 | n_folds : int, default = 2
76 | Number of folds used to estimate the drift.
77 |
78 | stratify : bool, default = True
79 | Whether the cv is stratified (same number of train and test samples
80 | within each fold)
81 |
82 | random_state : int, default = 1
83 | Seed for cv and subsampling.
84 |
85 | n_jobs : int, default = -1
86 | Number of cores used for processing (-1 for all cores)
87 |
88 | """
89 |
90 | def __init__(self,
91 | threshold=0.6,
92 | subsample=1.,
93 | estimator=DecisionTreeClassifier(max_depth=6),
94 | n_folds=2,
95 | stratify=True,
96 | random_state=1,
97 | n_jobs=-1):
98 | """Init a DriftThreshold object."""
99 | self.threshold = threshold
100 | self.subsample = subsample
101 | self.estimator = estimator
102 | self.n_folds = n_folds
103 | self.stratify = stratify
104 | self.random_state = random_state
105 | self.n_jobs = n_jobs
106 | self.__Ddrifts = dict()
107 | self.__fitOK = False
108 |
109 | def get_params(self):
110 | """Get parameters of a DriftThreshold object."""
111 | return {'threshold': self.threshold,
112 | 'subsample': self.subsample,
113 | 'estimator': self.estimator,
114 | 'n_folds': self.n_folds,
115 | 'stratify': self.stratify,
116 | 'random_state': self.random_state,
117 | 'n_jobs': self.n_jobs}
118 |
119 | def set_params(self, **params):
120 | """Set parameters of a DriftThreshold object."""
121 | if 'threshold' in params:
122 | self.threshold = params['threshold']
123 | if 'subsample' in params:
124 | self.subsample = params['subsample']
125 | if 'estimator' in params:
126 | self.estimator = params['estimator']
127 | if 'n_folds' in params:
128 | self.n_folds = params['n_folds']
129 | if 'stratify' in params:
130 | self.stratify = params['stratify']
131 | if 'random_state' in params:
132 | self.random_state = params['random_state']
133 | if 'n_jobs' in params:
134 | self.n_jobs = params['n_jobs']
135 |
136 | def fit(self, df_train, df_test):
137 | """Compute the univariate drifts between df_train and df_test datasets.
138 |
139 | Parameters
140 | ----------
141 | df_train : pandas dataframe of shape = (n_train, p)
142 | The train set
143 |
144 | df_test : pandas dataframe of shape = (n_test, p)
145 | The test set
146 |
147 | Returns
148 | -------
149 | None
150 |
151 | """
152 | self.__Ddrifts = dict()
153 |
154 | if sys.platform == 'win32':
155 | Ldrifts = [sync_fit(df_train.sample(frac=self.subsample)[[col]],
156 | df_test.sample(frac=self.subsample)[[col]],
157 | self.estimator,
158 | self.n_folds,
159 | self.stratify,
160 | self.random_state)
161 | for col in df_train.columns]
162 | else:
163 | Ldrifts = Parallel(n_jobs=self.n_jobs)(delayed(sync_fit)
164 | (df_train.sample(frac=self.subsample)[[col]],
165 | df_test.sample(frac=self.subsample)[[col]],
166 | self.estimator,
167 | self.n_folds,
168 | self.stratify,
169 | self.random_state)
170 | for col in df_train.columns)
171 |
172 | for i, col in enumerate(df_train.columns):
173 |
174 | self.__Ddrifts[col] = Ldrifts[i]
175 |
176 | del Ldrifts
177 |
178 | self.__fitOK = True
179 |
180 | def transform(self, df):
181 | """Select the features with low drift.
182 |
183 | Parameters
184 | ----------
185 | df : pandas dataframe
186 | A dataset with the same features as those seen during fit.
187 |
188 | Returns
189 | -------
190 | pandas DataFrame
191 | The transformed dataframe
192 |
193 | """
194 | if self.__fitOK:
195 |
196 | selected_col = []
197 |
198 | for col in df.columns:
199 |
200 | if (self.__Ddrifts[col] < self.threshold):
201 | selected_col.append(col)
202 |
203 | return df[selected_col]
204 |
205 | else:
206 | raise ValueError('Call the fit function first!')
207 |
208 | def get_support(self, complement=False):
209 | """Return the variables kept or dropped.
210 |
211 | Parameters
212 | ----------
213 | complement : bool, default = False
214 | If True, returns the features to drop
215 | If False, returns the features to keep
216 |
217 | Returns
218 | -------
219 | list
220 | The list of features to keep or to drop.
221 |
222 | """
223 | if self.__fitOK:
224 |
225 | keepList = []
226 | dropList = []
227 |
228 | for col in self.__Ddrifts:
229 |
230 | if (self.__Ddrifts[col] < self.threshold):
231 | keepList.append(col)
232 | else:
233 | dropList.append(col)
234 |
235 | if complement:
236 | return dropList
237 | else:
238 | return keepList
239 | else:
240 | raise ValueError('Call the fit function first!')
241 |
242 | def drifts(self):
243 | """Return the univariate drifts for all variables.
244 |
245 | Returns
246 | -------
247 | dict
248 | The dictionary of drift measures, one per feature.
249 |
250 | """
251 | if self.__fitOK:
252 |
253 | return self.__Ddrifts
254 |
255 | else:
256 | raise ValueError('Call the fit function first!')
257 |
--------------------------------------------------------------------------------
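A minimal usage sketch for DriftThreshold, mirroring the calls exercised in tests/test_drift_threshold.py. Paths are illustrative, and the columns are assumed already numeric and NA-free (which Drift_thresholder below guarantees by running the encoders first):

    import pandas as pd
    from mlbox.preprocessing.drift import DriftThreshold

    df_train = pd.read_csv("train.csv")     # illustrative paths
    df_test = pd.read_csv("test.csv")

    dt = DriftThreshold(threshold=0.6, n_jobs=-1)
    dt.fit(df_train, df_test)               # one DriftEstimator per feature, in parallel

    print(dt.drifts())                      # {feature: drift measure in [0, 1]}
    print(dt.get_support())                 # features kept (drift < threshold)
    print(dt.get_support(complement=True))  # features dropped

    df_train = dt.transform(df_train)       # keep only the stable features
    df_test = dt.transform(df_test)
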
/mlbox/preprocessing/drift_thresholder.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Author: Axel ARONIO DE ROMBLAY
3 | # License: BSD 3 clause
4 |
5 | import os
6 | import time
7 | from sklearn.pipeline import Pipeline
8 | from .drift import DriftThreshold
9 | from ..encoding.na_encoder import NA_encoder
10 | from ..encoding.categorical_encoder import Categorical_encoder
11 |
12 |
13 | class Drift_thresholder():
14 |
15 | """Automatically drops ids and drifting variables between train and test datasets.
16 |
17 | Variables are dropped from both the train and test datasets. The list of drift
18 | coefficients is saved as "drifts.txt". To get familiar with drift:
19 | https://github.com/AxeldeRomblay/MLBox/blob/master/docs/webinars/features.pdf
20 |
21 | Parameters
22 | ----------
23 | threshold : float, default = 0.6
24 | Drift threshold under which features are kept. Must be between 0. and 1.
25 | The lower the threshold, the more non-drifting/stable variables you keep:
26 | a feature with a drift measure of 0. is very stable and one with 1. is highly unstable.
27 |
28 | inplace : bool, default = False
29 | If True, train and test datasets are transformed in place and nothing is returned.
30 | Otherwise, the input datasets are left untouched and a new dictionary with the
31 | cleaned datasets is returned.
32 |
33 | verbose : bool, default = True
34 | Verbose mode
35 |
36 | to_path : str, default = "save"
37 | Name of the folder where the list of drift coefficients is saved.
38 | """
39 |
40 | def __init__(self,
41 | threshold=0.6,
42 | inplace=False,
43 | verbose=True,
44 | to_path="save"):
45 |
46 | self.threshold = threshold
47 | self.inplace = inplace
48 | self.verbose = verbose
49 | self.to_path = to_path
50 | self.__Ddrifts = {}
51 | self.__fitOK = False
52 |
53 |
54 | def fit_transform(self, df):
55 |
56 | """Fits and transforms train and test datasets
57 |
58 | Automatically drops ids and drifting variables between train and test datasets.
59 | The list of drift coefficients is saved as "drifts.txt".
60 |
61 | Parameters
62 | ----------
63 | df : dict
64 | Dictionary containing:
65 |
66 | - 'train' : pandas dataframe for train dataset
67 | - 'test' : pandas dataframe for test dataset
68 | - 'target' : pandas series for the target on train set
69 |
70 | Returns
71 | -------
72 | dict
73 | Dictionary containing:
74 |
75 | - 'train' : transformed pandas dataframe for train dataset
76 | - 'test' : transformed pandas dataframe for test dataset
77 | - 'target' : pandas series for the target on train set
78 | """
79 |
80 | ######################################################
81 | # Deleting IDs
82 | ######################################################
83 |
84 | # Exception
85 |
86 | if (df["test"].shape[0] == 0):
87 | if (self.verbose):
88 | print("")
89 | print("You have no test dataset...")
90 |
91 | return df
92 |
93 | else:
94 |
95 | start_time = time.time()
96 |
97 | ds = DriftThreshold(self.threshold)
98 | na = NA_encoder(numerical_strategy=0)
99 | ca = Categorical_encoder()
100 |
101 | pp = Pipeline([("na", na), ("ca", ca)])
102 | pp.fit(df['train'], None)
103 |
104 | # Deleting IDs with drift threshold method
105 |
106 | if (self.verbose):
107 | print("")
108 | print("computing drifts ...")
109 |
110 | ds.fit(pp.transform(df['train']), pp.transform(df['test']))
111 |
112 | if (self.verbose):
113 | print("CPU time: %s seconds" % (time.time() - start_time))
114 | print("")
115 |
116 | self.__fitOK = True
117 | self.__Ddrifts = ds.drifts()
118 | drifts_top = sorted(ds.drifts().items(),
119 | key=lambda x: x[1],
120 | reverse=True)[:10]
121 |
122 | if (self.verbose):
123 | print("> Top 10 drifts")
124 | print("")
125 | for d in drifts_top:
126 | print(d)
127 |
128 | if (self.verbose):
129 | print("")
130 | print("> Deleted "
131 | "variables : " + str(ds.get_support(complement=True)))
132 |
133 | ######################################################
134 | # Dumping Encoders into directory
135 | ######################################################
136 |
137 | if (self.to_path is not None):
138 |
139 | try:
140 | os.mkdir(self.to_path)
141 | except OSError:
142 | pass
143 |
144 | file = open(self.to_path + '/drifts.txt', "w")
145 | file.write("\n")
146 | file.write(
147 | "*******************************************"
148 | " Drifts coefficients "
149 | "*******************************************\n")
150 | file.write("\n")
151 |
152 | for var, d in sorted(ds.drifts().items(),
153 | key=lambda x: x[1],
154 | reverse=True):
155 | file.write(str(var) + " = " + str(d) + '\n')
156 |
157 | file.close()
158 |
159 | if (self.verbose):
160 | print("> Drift coefficients dumped into directory : " + self.to_path)
161 |
162 | # Returning datasets with no IDs
163 |
164 | if (self.inplace):
165 |
166 | df['train'] = ds.transform(df['train'])
167 | df['test'] = ds.transform(df['test'])
168 |
169 | else:
170 |
171 | return {'train': ds.transform(df['train']),
172 | 'test': ds.transform(df['test']),
173 | 'target': df['target']}
174 |
175 | def drifts(self):
176 |
177 | """Returns the univariate drifts for all variables.
178 |
179 | Returns
180 | -------
181 | dict
182 | Dictionary containing the drift measure for each feature.
183 | """
184 |
185 | if self.__fitOK:
186 |
187 | return self.__Ddrifts
188 | else:
189 | raise ValueError('Call the fit_transform function first!')
190 |
--------------------------------------------------------------------------------
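The higher-level wrapper is typically driven through Reader, as in tests/test_drift_thresholder.py below; a sketch assuming Titanic-style csv files with a "Survived" target column (paths illustrative):

    from mlbox.preprocessing.reader import Reader
    from mlbox.preprocessing.drift_thresholder import Drift_thresholder

    # Reader builds the {'train', 'test', 'target'} dictionary fit_transform expects.
    data = Reader(sep=",").train_test_split(Lpath=["train.csv", "test.csv"],
                                            target_name="Survived")

    dts = Drift_thresholder(threshold=0.6, to_path="save")
    data = dts.fit_transform(data)   # drops ids/drifting features, writes save/drifts.txt
    print(dts.drifts())              # drift measure per feature
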
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.18.2
2 | scipy==1.4.1
3 | matplotlib==3.0.3
4 | hyperopt==0.2.3
5 | pandas==0.25.3
6 | joblib==0.14.1
7 | scikit-learn==0.22.1
8 | tensorflow==2.0.0
9 | lightgbm==2.3.1
10 | tables==3.5.2
11 | xlrd==1.2.0
12 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """Setup file for installing mlbox package."""
2 | # !/usr/bin/env python
3 | # -*- coding: utf-8 -*-
4 |
5 | from setuptools import setup
6 |
7 |
8 | with open('requirements.txt', 'rt') as fh:
9 | requirements = fh.read().splitlines()
10 |
11 | with open('README.rst') as readme_file:
12 | readme = readme_file.read()
13 |
14 | with open('VERSION.txt') as version_file:
15 | version = version_file.read()
16 |
17 |
18 | setup(
19 | name='mlbox',
20 | version=version,
21 | description="A powerful Automated Machine Learning python library.",
22 | long_description=readme,
23 | author="Axel ARONIO DE ROMBLAY",
24 | author_email='axelderomblay@gmail.com',
25 | url='https://github.com/AxeldeRomblay/mlbox',
26 | packages=['mlbox', 'mlbox.encoding', 'mlbox.model',
27 | 'mlbox.optimisation', 'mlbox.prediction',
28 | 'mlbox.preprocessing',
29 | 'mlbox.model.classification',
30 | 'mlbox.model.regression',
31 | 'mlbox.preprocessing.drift'],
32 | package_dir={'mlbox': 'mlbox',
33 | 'mlbox.encoding': 'mlbox/encoding',
34 | 'mlbox.model': 'mlbox/model',
35 | 'mlbox.optimisation': 'mlbox/optimisation',
36 | 'mlbox.prediction': 'mlbox/prediction',
37 | 'mlbox.preprocessing': 'mlbox/preprocessing',
38 | 'mlbox.model.classification': 'mlbox/model/classification',
39 | 'mlbox.model.regression': 'mlbox/model/regression',
40 | 'mlbox.preprocessing.drift': 'mlbox/preprocessing/drift'
41 | },
42 | include_package_data=True,
43 | install_requires=requirements,
44 | zip_safe=False,
45 | license='BSD-3',
46 | keywords='mlbox auto-ml stacking pipeline optimisation',
47 | classifiers=[
48 |
49 | 'Development Status :: 5 - Production/Stable',
50 |
51 | 'Intended Audience :: Developers',
52 | 'Intended Audience :: Science/Research',
53 |
54 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
55 | 'Topic :: Scientific/Engineering :: Information Analysis',
56 | 'Topic :: Software Development :: Libraries :: Python Modules',
57 |
58 | 'License :: OSI Approved :: BSD License',
59 |
60 | 'Natural Language :: English',
61 |
62 | 'Operating System :: MacOS',
63 | 'Operating System :: Microsoft :: Windows',
64 | 'Operating System :: POSIX :: Linux',
65 |
66 | 'Programming Language :: Python :: 3.5',
67 | 'Programming Language :: Python :: 3.6',
68 | 'Programming Language :: Python :: 3.7'
69 | ],
70 | test_suite='tests',
71 | tests_require=requirements
72 | )
73 |
--------------------------------------------------------------------------------
/tests/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/.DS_Store
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/tests/data_for_tests/clean_target.csv:
--------------------------------------------------------------------------------
1 | Survived
2 | 0
3 | 1
4 | 1
5 | 1
6 | 0
7 | 0
8 | 0
9 | 0
10 | 1
11 | 1
12 | 1
13 | 1
14 | 0
15 | 0
16 | 0
17 | 1
18 | 0
19 | 1
20 | 0
21 | 1
22 | 0
23 | 1
24 | 1
25 | 1
26 | 0
27 | 1
28 | 0
29 | 0
30 | 1
31 | 0
32 | 0
33 | 1
34 | 1
35 | 0
36 | 0
37 | 0
38 | 1
39 | 0
40 | 0
41 | 1
42 | 0
43 | 0
44 | 0
45 | 1
46 | 1
47 | 0
48 | 0
49 | 1
50 | 0
51 | 0
52 | 0
53 | 0
54 | 1
55 | 1
56 | 0
57 | 1
58 | 1
59 | 0
60 | 1
61 | 0
62 | 0
63 | 1
64 | 0
65 | 0
66 | 0
67 | 1
68 | 1
69 | 0
70 | 1
71 | 0
72 | 0
73 | 0
74 | 0
75 | 0
76 | 1
77 | 0
78 | 0
79 | 0
80 | 1
81 | 1
82 | 0
83 | 1
84 | 1
85 | 0
86 | 1
87 | 1
88 | 0
89 | 0
90 | 1
91 | 0
92 | 0
93 | 0
94 | 0
95 | 0
96 | 0
97 | 0
98 | 0
99 | 1
100 | 1
101 | 0
102 | 0
103 | 0
104 | 0
105 | 0
106 | 0
107 | 0
108 | 1
109 | 1
110 | 0
111 | 1
112 | 0
113 | 0
114 | 0
115 | 0
116 | 0
117 | 0
118 | 0
119 | 0
120 | 0
121 | 0
122 | 0
123 | 0
124 | 0
125 | 1
126 | 0
127 | 1
128 | 0
129 | 1
130 | 1
131 | 0
132 | 0
133 | 0
134 | 0
135 | 1
136 | 0
137 | 0
138 | 1
139 | 0
140 | 0
141 | 0
142 | 0
143 | 1
144 | 1
145 | 0
146 | 0
147 | 0
148 | 1
149 | 0
150 | 0
151 | 0
152 | 0
153 | 1
154 | 0
155 | 0
156 | 0
157 | 0
158 | 1
159 | 0
160 | 0
161 | 0
162 | 0
163 | 1
164 | 0
165 | 0
166 | 0
167 | 1
168 | 1
169 | 0
170 | 0
171 | 0
172 | 0
173 | 0
174 | 1
175 | 0
176 | 0
177 | 0
178 | 0
179 | 0
180 | 0
181 | 0
182 | 0
183 | 0
184 | 0
185 | 1
186 | 1
187 | 0
188 | 1
189 | 1
190 | 0
191 | 0
192 | 1
193 | 0
194 | 1
195 | 1
196 | 1
197 | 1
198 | 0
199 | 0
200 | 1
201 | 0
202 | 0
203 | 0
204 | 0
205 | 0
206 | 1
207 | 0
208 | 0
209 | 1
210 | 1
211 | 1
212 | 0
213 | 1
214 | 0
215 | 0
216 | 0
217 | 1
218 | 1
219 | 0
220 | 1
221 | 0
222 | 1
223 | 0
224 | 0
225 | 0
226 | 1
227 | 0
228 | 1
229 | 0
230 | 0
231 | 0
232 | 1
233 | 0
234 | 0
235 | 1
236 | 0
237 | 0
238 | 0
239 | 1
240 | 0
241 | 0
242 | 0
243 | 1
244 | 0
245 | 0
246 | 0
247 | 0
248 | 0
249 | 1
250 | 1
251 | 0
252 | 0
253 | 0
254 | 0
255 | 0
256 | 0
257 | 1
258 | 1
259 | 1
260 | 1
261 | 1
262 | 0
263 | 1
264 | 0
265 | 0
266 | 0
267 | 0
268 | 0
269 | 1
270 | 1
271 | 1
272 | 0
273 | 1
274 | 1
275 | 0
276 | 1
277 | 1
278 | 0
279 | 0
280 | 0
281 | 1
282 | 0
283 | 0
284 | 0
285 | 1
286 | 0
287 | 0
288 | 1
289 | 0
290 | 1
291 | 1
292 | 1
293 | 1
294 | 0
295 | 0
296 | 0
297 | 0
298 | 0
299 | 0
300 | 1
301 | 1
302 | 1
303 | 1
304 | 0
305 | 1
306 | 0
307 | 1
308 | 1
309 | 1
310 | 0
311 | 1
312 | 1
313 | 1
314 | 0
315 | 0
316 | 0
317 | 1
318 | 1
319 | 0
320 | 1
321 | 1
322 | 0
323 | 0
324 | 1
325 | 1
326 | 0
327 | 1
328 | 0
329 | 1
330 | 1
331 | 1
332 | 1
333 | 0
334 | 0
335 | 0
336 | 1
337 | 0
338 | 0
339 | 1
340 | 1
341 | 0
342 | 1
343 | 1
344 | 0
345 | 0
346 | 0
347 | 1
348 | 1
349 | 1
350 | 1
351 | 0
352 | 0
353 | 0
354 | 0
355 | 0
356 | 0
357 | 0
358 | 1
359 | 0
360 | 1
361 | 1
362 | 0
363 | 0
364 | 0
365 | 0
366 | 0
367 | 0
368 | 1
369 | 1
370 | 1
371 | 1
372 | 1
373 | 0
374 | 0
375 | 0
376 | 0
377 | 1
378 | 1
379 | 0
380 | 0
381 | 0
382 | 1
383 | 1
384 | 0
385 | 1
386 | 0
387 | 0
388 | 0
389 | 1
390 | 0
391 | 1
392 | 1
393 | 1
394 | 0
395 | 1
396 | 1
397 | 0
398 | 0
399 | 0
400 | 0
401 | 1
402 | 1
403 | 0
404 | 0
405 | 0
406 | 0
407 | 0
408 | 0
409 | 1
410 | 0
411 | 0
412 | 0
413 | 0
414 | 1
415 | 0
416 | 1
417 | 0
418 | 1
419 | 1
420 | 0
421 | 0
422 | 0
423 | 0
424 | 0
425 | 0
426 | 0
427 | 0
428 | 1
429 | 1
430 | 0
431 | 1
432 | 1
433 | 1
434 | 1
435 | 0
436 | 0
437 | 1
438 | 0
439 | 1
440 | 0
441 | 0
442 | 1
443 | 0
444 | 0
445 | 1
446 | 1
447 | 1
448 | 1
449 | 1
450 | 1
451 | 1
452 | 0
453 | 0
454 | 0
455 | 1
456 | 0
457 | 1
458 | 0
459 | 1
460 | 1
461 | 0
462 | 1
463 | 0
464 | 0
465 | 0
466 | 0
467 | 0
468 | 0
469 | 0
470 | 0
471 | 1
472 | 0
473 | 0
474 | 1
475 | 1
476 | 0
477 | 0
478 | 0
479 | 0
480 | 0
481 | 1
482 | 0
483 | 0
484 | 0
485 | 1
486 | 1
487 | 0
488 | 1
489 | 0
490 | 0
491 | 1
492 | 0
493 | 0
494 | 0
495 | 0
496 | 0
497 | 0
498 | 1
499 | 0
500 | 0
501 | 0
502 | 0
503 | 0
504 | 0
505 | 0
506 | 1
507 | 0
508 | 1
509 | 1
510 | 0
511 | 1
512 | 1
513 | 0
514 | 1
515 | 1
516 | 0
517 | 0
518 | 1
519 | 0
520 | 1
521 | 0
522 | 1
523 | 0
524 | 0
525 | 1
526 | 0
527 | 0
528 | 1
529 | 0
530 | 0
531 | 0
532 | 1
533 | 0
534 | 0
535 | 1
536 | 0
537 | 1
538 | 0
539 | 1
540 | 0
541 | 1
542 | 1
543 | 0
544 | 0
545 | 1
546 | 0
547 | 0
548 | 1
549 | 1
550 | 0
551 | 1
552 | 1
553 | 0
554 | 0
555 | 1
556 | 1
557 | 0
558 | 1
559 | 0
560 | 1
561 | 1
562 | 0
563 | 0
564 | 0
565 | 0
566 | 0
567 | 0
568 | 0
569 | 0
570 | 0
571 | 1
572 | 1
573 | 1
574 | 1
575 | 1
576 | 0
577 | 0
578 | 1
579 | 1
580 | 0
581 | 1
582 | 1
583 | 1
584 | 0
585 | 0
586 | 0
587 | 1
588 | 0
589 | 1
590 | 0
591 | 0
592 | 0
593 | 1
594 | 0
595 | 0
596 | 0
597 | 0
598 | 1
599 | 0
600 | 0
601 | 1
602 | 1
603 | 0
604 | 0
605 | 0
606 | 1
607 | 0
608 | 0
609 | 1
610 | 1
611 | 1
612 | 0
613 | 0
614 | 1
615 | 0
616 | 0
617 | 1
618 | 0
619 | 0
620 | 1
621 | 0
622 | 0
623 | 1
624 | 1
625 | 0
626 | 0
627 | 0
628 | 0
629 | 1
630 | 0
631 | 0
632 | 1
633 | 0
634 | 1
635 | 0
636 | 0
637 | 1
638 | 0
639 | 0
640 | 0
641 | 0
642 | 0
643 | 1
644 | 0
645 | 1
646 | 1
647 | 1
648 | 0
649 | 1
650 | 0
651 | 1
652 | 0
653 | 1
654 | 0
655 | 1
656 | 0
657 | 0
658 | 0
659 | 0
660 | 0
661 | 0
662 | 1
663 | 0
664 | 0
665 | 0
666 | 1
667 | 0
668 | 0
669 | 0
670 | 0
671 | 1
672 | 1
673 | 0
674 | 0
675 | 1
676 | 0
677 | 0
678 | 0
679 | 1
680 | 0
681 | 1
682 | 0
683 | 1
684 | 0
685 | 0
686 | 0
687 | 0
688 | 0
689 | 0
690 | 0
691 | 1
692 | 1
693 | 1
694 | 1
695 | 0
696 | 0
697 | 0
698 | 0
699 | 1
700 | 0
701 | 0
702 | 1
703 | 1
704 | 0
705 | 0
706 | 0
707 | 0
708 | 1
709 | 1
710 | 1
711 | 1
712 | 1
713 | 0
714 | 1
715 | 0
716 | 0
717 | 0
718 | 1
719 | 1
720 | 0
721 | 0
722 | 1
723 | 0
724 | 0
725 | 0
726 | 1
727 | 0
728 | 1
729 | 1
730 | 0
731 | 0
732 | 1
733 | 0
734 | 0
735 | 0
736 | 0
737 | 0
738 | 0
739 | 1
740 | 0
741 | 0
742 | 1
743 | 0
744 | 1
745 | 0
746 | 1
747 | 0
748 | 0
749 | 1
750 | 0
751 | 0
752 | 1
753 | 1
754 | 0
755 | 0
756 | 1
757 | 1
758 | 0
759 | 0
760 | 0
761 | 1
762 | 0
763 | 0
764 | 1
765 | 1
766 | 0
767 | 1
768 | 0
769 | 0
770 | 0
771 | 0
772 | 0
773 | 0
774 | 0
775 | 0
776 | 1
777 | 0
778 | 0
779 | 1
780 | 0
781 | 1
782 | 1
783 | 1
784 | 0
785 | 0
786 | 0
787 | 0
788 | 1
789 | 0
790 | 1
791 | 0
792 | 0
793 | 0
794 | 0
795 | 0
796 | 0
797 | 0
798 | 1
799 | 1
800 | 0
801 | 0
802 | 0
803 | 1
804 | 1
805 | 1
806 | 1
807 | 0
808 | 0
809 | 0
810 | 0
811 | 1
812 | 0
813 | 0
814 | 0
815 | 0
816 | 0
817 | 0
818 | 0
819 | 0
820 | 0
821 | 0
822 | 1
823 | 1
824 | 0
825 | 1
826 | 0
827 | 0
828 | 0
829 | 1
830 | 1
831 | 1
832 | 1
833 | 1
834 | 0
835 | 0
836 | 0
837 | 1
838 | 0
839 | 0
840 | 1
841 | 1
842 | 0
843 | 0
844 | 1
845 | 0
846 | 0
847 | 0
848 | 0
849 | 0
850 | 0
851 | 1
852 | 0
853 | 0
854 | 0
855 | 1
856 | 0
857 | 1
858 | 1
859 | 1
860 | 1
861 | 0
862 | 0
863 | 0
864 | 1
865 | 0
866 | 0
867 | 1
868 | 1
869 | 0
870 | 0
871 | 1
872 | 0
873 | 1
874 | 0
875 | 0
876 | 1
877 | 1
878 | 0
879 | 0
880 | 0
881 | 1
882 | 1
883 | 0
884 | 0
885 | 0
886 | 0
887 | 0
888 | 0
889 | 1
890 | 0
891 | 1
892 | 0
893 |
--------------------------------------------------------------------------------
/tests/data_for_tests/clean_test.csv:
--------------------------------------------------------------------------------
1 | ,Age,Fare,Parch,Pclass,SibSp
2 | 0,34.5,7.8292,0,3,0
3 | 1,47,7,0,3,1
4 | 2,62,9.6875,0,2,0
5 | 3,27,8.6625,0,3,0
6 | 4,22,12.2875,1,3,1
7 | 5,14,9.225,0,3,0
8 | 6,30,7.6292,0,3,0
9 | 7,26,29,1,2,1
10 | 8,18,7.2292,0,3,0
11 | 9,21,24.15,0,3,2
12 | 10,24,7.8958,0,3,0
13 | 11,46,26,0,1,0
14 | 12,23,82.2667,0,1,1
15 | 13,63,26,0,2,1
16 | 14,47,61.175,0,1,1
17 | 15,24,27.7208,0,2,1
18 | 16,35,12.35,0,2,0
19 | 17,21,7.225,0,3,0
20 | 18,27,7.925,0,3,1
21 | 19,45,7.225,0,3,0
22 | 20,55,59.4,0,1,1
23 | 21,9,3.1708,1,3,0
24 | 22,24,31.6833,0,1,0
25 | 23,21,61.3792,1,1,0
26 | 24,48,262.375,3,1,1
27 | 25,50,14.5,0,3,1
28 | 26,22,61.9792,1,1,0
29 | 27,22.5,7.225,0,3,0
30 | 28,41,30.5,0,1,0
31 | 29,24,21.6792,0,3,2
32 | 30,50,26,0,2,1
33 | 31,24,31.5,0,2,2
34 | 32,33,20.575,2,3,1
35 | 33,24,23.45,2,3,1
36 | 34,30,57.75,0,1,1
37 | 35,18.5,7.2292,0,3,0
38 | 36,24,8.05,0,3,0
39 | 37,21,8.6625,0,3,0
40 | 38,25,9.5,0,3,0
41 | 39,24,56.4958,0,3,0
42 | 40,39,13.4167,1,3,0
43 | 41,24,26.55,0,1,0
44 | 42,41,7.85,0,3,0
45 | 43,30,13,0,2,0
46 | 44,45,52.5542,0,1,1
47 | 45,25,7.925,0,3,0
48 | 46,45,29.7,0,1,0
49 | 47,24,7.75,0,3,0
50 | 48,60,76.2917,0,1,0
51 | 49,36,15.9,2,3,0
52 | 50,24,60,0,1,1
53 | 51,27,15.0333,0,2,0
54 | 52,20,23,1,2,2
55 | 53,28,263,2,1,3
56 | 54,24,15.5792,0,2,0
57 | 55,10,29.125,1,3,4
58 | 56,35,7.8958,0,3,0
59 | 57,25,7.65,0,3,0
60 | 58,24,16.1,0,3,1
61 | 59,36,262.375,0,1,0
62 | 60,17,7.8958,0,3,0
63 | 61,32,13.5,0,2,0
64 | 62,18,7.75,0,3,0
65 | 63,22,7.725,0,3,0
66 | 64,13,262.375,2,1,2
67 | 65,24,21,0,2,0
68 | 66,18,7.8792,0,3,0
69 | 67,47,42.4,0,1,0
70 | 68,31,28.5375,0,1,0
71 | 69,60,263,4,1,1
72 | 70,24,7.75,0,3,0
73 | 71,21,7.8958,0,3,0
74 | 72,29,7.925,0,3,0
75 | 73,28.5,27.7208,0,1,0
76 | 74,35,211.5,0,1,0
77 | 75,32.5,211.5,0,1,0
78 | 76,24,8.05,0,3,0
79 | 77,55,25.7,0,1,2
80 | 78,30,13,0,2,0
81 | 79,24,7.75,0,3,0
82 | 80,6,15.2458,1,3,1
83 | 81,67,221.7792,0,1,1
84 | 82,49,26,0,1,0
85 | 83,24,7.8958,0,3,0
86 | 84,24,10.7083,0,2,0
87 | 85,24,14.4542,0,3,1
88 | 86,27,7.8792,0,3,0
89 | 87,18,8.05,0,3,0
90 | 88,24,7.75,0,3,0
91 | 89,2,23,1,2,1
92 | 90,22,13.9,0,3,1
93 | 91,24,7.775,0,3,0
94 | 92,27,52,2,1,1
95 | 93,24,8.05,0,3,0
96 | 94,25,26,0,1,0
97 | 95,25,7.7958,0,3,0
98 | 96,76,78.85,0,1,1
99 | 97,29,7.925,0,3,0
100 | 98,20,7.8542,0,3,0
101 | 99,33,8.05,0,3,0
102 | 100,43,55.4417,0,1,1
103 | 101,27,26,0,2,1
104 | 102,24,7.75,0,3,0
105 | 103,26,7.775,0,3,0
106 | 104,16,8.5167,1,3,1
107 | 105,28,22.525,0,3,0
108 | 106,21,7.8208,0,3,0
109 | 107,24,7.75,0,3,0
110 | 108,24,8.7125,0,3,0
111 | 109,18.5,13,0,2,0
112 | 110,41,15.0458,0,2,0
113 | 111,24,7.7792,0,3,0
114 | 112,36,31.6792,0,1,0
115 | 113,18.5,7.2833,0,3,0
116 | 114,63,221.7792,0,1,1
117 | 115,18,14.4542,0,3,1
118 | 116,24,6.4375,0,3,0
119 | 117,1,16.7,1,3,1
120 | 118,36,75.2417,0,1,0
121 | 119,29,26,0,2,1
122 | 120,12,15.75,0,2,0
123 | 121,24,7.75,0,3,1
124 | 122,35,57.75,0,1,1
125 | 123,28,7.25,0,3,0
126 | 124,24,7.75,0,3,0
127 | 125,17,16.1,1,3,0
128 | 126,22,7.7958,0,3,0
129 | 127,24,23.25,0,3,2
130 | 128,42,13,0,2,0
131 | 129,24,8.05,0,3,0
132 | 130,32,8.05,0,3,0
133 | 131,53,28.5,0,1,0
134 | 132,24,25.4667,4,3,0
135 | 133,24,6.4375,0,3,1
136 | 134,43,7.8958,0,3,0
137 | 135,24,7.8542,0,3,0
138 | 136,26.5,7.225,0,3,0
139 | 137,26,13,0,2,0
140 | 138,23,8.05,0,3,0
141 | 139,40,46.9,6,3,1
142 | 140,10,46.9,2,3,5
143 | 141,33,151.55,0,1,0
144 | 142,61,262.375,3,1,1
145 | 143,28,26,0,2,0
146 | 144,42,26.55,0,1,0
147 | 145,31,18,0,3,3
148 | 146,24,51.8625,0,1,0
149 | 147,22,8.05,0,3,0
150 | 148,24,26.55,0,1,0
151 | 149,30,26,1,2,1
152 | 150,23,83.1583,1,1,0
153 | 151,24,7.8958,0,3,0
154 | 152,60.5,24,0,3,0
155 | 153,36,12.1833,2,3,0
156 | 154,13,31.3875,2,3,4
157 | 155,24,7.55,0,3,0
158 | 156,29,221.7792,0,1,0
159 | 157,23,7.8542,0,3,0
160 | 158,42,26.55,0,1,0
161 | 159,26,13.775,2,3,0
162 | 160,24,7.7333,0,3,0
163 | 161,7,15.2458,1,3,1
164 | 162,26,13.5,0,2,0
165 | 163,24,7,0,3,0
166 | 164,41,13,0,2,0
167 | 165,26,22.025,1,3,1
168 | 166,48,50.4958,0,1,0
169 | 167,18,34.375,2,3,2
170 | 168,24,27.7208,0,1,0
171 | 169,22,8.9625,0,3,0
172 | 170,24,7.55,0,3,0
173 | 171,27,7.225,0,3,0
174 | 172,23,13.9,0,3,1
175 | 173,24,7.2292,0,3,0
176 | 174,40,31.3875,5,3,1
177 | 175,15,39,2,2,0
178 | 176,20,36.75,0,2,0
179 | 177,54,55.4417,0,1,1
180 | 178,36,39,3,2,0
181 | 179,64,83.1583,2,1,0
182 | 180,30,13,0,2,0
183 | 181,37,83.1583,1,1,1
184 | 182,18,53.1,0,1,1
185 | 183,24,7.75,0,3,0
186 | 184,27,247.5208,1,1,1
187 | 185,40,16,0,2,0
188 | 186,21,21,1,2,0
189 | 187,17,8.05,0,3,2
190 | 188,24,69.55,2,3,8
191 | 189,40,13,0,2,0
192 | 190,34,26,0,2,1
193 | 191,24,26,0,1,0
194 | 192,11.5,14.5,1,3,1
195 | 193,61,12.35,0,2,0
196 | 194,8,32.5,2,2,0
197 | 195,33,7.8542,0,3,0
198 | 196,6,134.5,2,1,0
199 | 197,18,7.775,0,3,0
200 | 198,23,10.5,0,2,0
201 | 199,24,8.1125,0,3,0
202 | 200,24,15.5,0,3,0
203 | 201,0.33,14.4,2,3,0
204 | 202,47,227.525,0,1,1
205 | 203,8,26,1,2,1
206 | 204,25,10.5,0,2,0
207 | 205,24,25.7417,0,1,0
208 | 206,35,7.75,0,3,0
209 | 207,24,10.5,0,2,0
210 | 208,33,27.7208,0,1,0
211 | 209,25,7.8958,0,3,0
212 | 210,32,22.525,0,3,0
213 | 211,24,7.05,0,3,0
214 | 212,17,73.5,0,2,0
215 | 213,60,26,0,2,1
216 | 214,38,7.775,2,3,4
217 | 215,42,42.5,0,1,0
218 | 216,24,7.8792,0,3,0
219 | 217,57,164.8667,1,1,1
220 | 218,50,211.5,1,1,1
221 | 219,24,8.05,0,3,0
222 | 220,30,13.8583,0,2,1
223 | 221,21,8.05,0,3,0
224 | 222,22,10.5,0,2,0
225 | 223,21,7.7958,0,3,0
226 | 224,53,27.4458,0,1,0
227 | 225,24,15.2458,2,3,0
228 | 226,23,7.7958,0,3,0
229 | 227,24,7.75,0,3,0
230 | 228,40.5,15.1,0,3,0
231 | 229,36,13,0,2,0
232 | 230,14,65,0,2,0
233 | 231,21,26.55,0,1,0
234 | 232,21,6.4958,0,3,1
235 | 233,24,7.8792,0,3,0
236 | 234,39,71.2833,0,1,1
237 | 235,20,7.8542,0,3,0
238 | 236,64,75.25,0,1,1
239 | 237,20,7.225,0,3,0
240 | 238,18,13,1,2,1
241 | 239,48,106.425,0,1,1
242 | 240,55,27.7208,0,1,0
243 | 241,45,30,2,2,0
244 | 242,45,134.5,1,1,1
245 | 243,24,7.8875,0,3,0
246 | 244,24,23.45,2,3,1
247 | 245,41,51.8625,0,1,1
248 | 246,22,21,0,2,0
249 | 247,42,32.5,1,2,1
250 | 248,29,26,0,2,1
251 | 249,24,14.4542,0,3,1
252 | 250,0.92,27.75,2,2,1
253 | 251,20,7.925,0,3,0
254 | 252,27,136.7792,0,1,1
255 | 253,24,9.325,0,3,0
256 | 254,32.5,9.5,0,3,0
257 | 255,24,7.55,0,3,0
258 | 256,24,7.75,0,3,0
259 | 257,28,8.05,0,3,0
260 | 258,19,13,0,2,0
261 | 259,21,7.775,0,3,0
262 | 260,36.5,17.4,0,3,1
263 | 261,21,7.8542,0,3,0
264 | 262,29,23,2,2,0
265 | 263,1,12.1833,1,3,1
266 | 264,30,12.7375,0,2,0
267 | 265,24,7.8958,0,3,0
268 | 266,24,0,0,1,0
269 | 267,24,7.55,0,3,0
270 | 268,24,8.05,0,3,0
271 | 269,17,8.6625,0,3,0
272 | 270,46,75.2417,0,1,0
273 | 271,24,7.75,0,3,0
274 | 272,26,136.7792,0,1,1
275 | 273,24,15.5,0,3,1
276 | 274,24,7.225,0,3,0
277 | 275,20,26,0,2,1
278 | 276,28,10.5,0,2,0
279 | 277,40,26,0,2,1
280 | 278,30,21,0,2,1
281 | 279,22,10.5,0,2,0
282 | 280,23,8.6625,0,3,0
283 | 281,0.75,13.775,1,3,1
284 | 282,24,7.75,0,3,0
285 | 283,9,15.2458,1,3,1
286 | 284,2,20.2125,1,3,1
287 | 285,36,7.25,0,3,0
288 | 286,24,7.25,0,3,0
289 | 287,24,82.2667,0,1,1
290 | 288,24,7.2292,0,3,0
291 | 289,24,8.05,0,3,0
292 | 290,24,39.6,0,1,0
293 | 291,30,6.95,0,3,0
294 | 292,24,7.2292,0,3,0
295 | 293,53,81.8583,1,1,1
296 | 294,36,9.5,0,3,0
297 | 295,26,7.8958,0,3,0
298 | 296,1,41.5792,2,2,1
299 | 297,24,21.6792,0,3,2
300 | 298,30,45.5,0,1,0
301 | 299,29,7.8542,0,3,0
302 | 300,32,7.775,0,3,0
303 | 301,24,15.0458,0,2,0
304 | 302,43,21,1,2,0
305 | 303,24,8.6625,0,3,0
306 | 304,24,7.75,0,3,0
307 | 305,64,26.55,1,1,1
308 | 306,30,151.55,2,1,1
309 | 307,0.83,9.35,1,3,0
310 | 308,55,93.5,1,1,1
311 | 309,45,14.1083,0,3,1
312 | 310,18,8.6625,0,3,0
313 | 311,22,7.225,0,3,0
314 | 312,24,7.575,0,3,0
315 | 313,37,7.75,0,3,0
316 | 314,55,135.6333,0,1,0
317 | 315,17,7.7333,0,3,0
318 | 316,57,146.5208,0,1,1
319 | 317,19,10.5,0,2,0
320 | 318,27,7.8542,0,3,0
321 | 319,22,31.5,0,2,2
322 | 320,26,7.775,0,3,0
323 | 321,25,7.2292,0,3,0
324 | 322,26,13,0,2,0
325 | 323,33,26.55,0,1,0
326 | 324,39,211.3375,0,1,0
327 | 325,23,7.05,0,3,0
328 | 326,12,39,1,2,2
329 | 327,46,79.2,0,1,0
330 | 328,29,26,0,2,1
331 | 329,21,13,0,2,0
332 | 330,48,36.75,2,2,0
333 | 331,39,29.7,0,1,0
334 | 332,24,7.225,0,3,0
335 | 333,19,15.7417,1,3,1
336 | 334,27,7.8958,0,3,0
337 | 335,30,26,0,1,0
338 | 336,32,13,0,2,0
339 | 337,39,7.2292,2,3,0
340 | 338,25,31.5,0,2,0
341 | 339,24,7.2292,0,3,0
342 | 340,18,10.5,0,2,0
343 | 341,32,7.5792,0,3,0
344 | 342,24,69.55,9,3,1
345 | 343,58,512.3292,1,1,0
346 | 344,24,14.5,1,3,1
347 | 345,16,7.65,0,3,0
348 | 346,26,13,0,2,0
349 | 347,38,7.2292,0,3,0
350 | 348,24,13.5,0,2,0
351 | 349,31,21,0,2,0
352 | 350,45,63.3583,1,1,0
353 | 351,25,10.5,0,2,0
354 | 352,18,73.5,0,2,0
355 | 353,49,65,2,2,1
356 | 354,0.17,20.575,2,3,1
357 | 355,50,26,0,1,0
358 | 356,59,51.4792,0,1,2
359 | 357,24,7.8792,0,3,0
360 | 358,24,7.75,0,3,0
361 | 359,30,15.55,0,3,1
362 | 360,14.5,69.55,2,3,8
363 | 361,24,37.0042,1,2,1
364 | 362,31,21,0,2,0
365 | 363,27,8.6625,0,3,0
366 | 364,25,55.4417,0,1,1
367 | 365,24,69.55,9,3,1
368 | 366,24,14.4583,0,3,1
369 | 367,22,39.6875,0,3,0
370 | 368,45,59.4,1,1,0
371 | 369,29,13.8583,0,2,0
372 | 370,21,11.5,0,2,1
373 | 371,31,134.5,0,1,0
374 | 372,49,0,0,1,0
375 | 373,44,13,0,2,0
376 | 374,54,81.8583,1,1,1
377 | 375,45,262.375,0,1,0
378 | 376,22,8.6625,0,3,2
379 | 377,21,11.5,0,2,0
380 | 378,55,50,0,1,0
381 | 379,5,31.3875,2,3,4
382 | 380,24,7.75,0,3,0
383 | 381,26,7.8792,0,3,0
384 | 382,24,14.5,0,3,0
385 | 383,19,16.1,0,3,1
386 | 384,24,12.875,0,2,0
387 | 385,24,65,2,2,1
388 | 386,24,7.775,0,3,0
389 | 387,57,13,0,2,0
390 | 388,21,7.75,0,3,0
391 | 389,6,21.075,1,3,3
392 | 390,23,93.5,0,1,0
393 | 391,51,39.4,1,1,0
394 | 392,13,20.25,2,3,0
395 | 393,47,10.5,0,2,0
396 | 394,29,22.025,1,3,3
397 | 395,18,60,0,1,1
398 | 396,24,7.25,0,3,0
399 | 397,48,79.2,1,1,1
400 | 398,22,7.775,0,3,0
401 | 399,31,7.7333,0,3,0
402 | 400,30,164.8667,0,1,0
403 | 401,38,21,0,2,1
404 | 402,22,59.4,1,1,0
405 | 403,17,47.1,0,1,0
406 | 404,43,27.7208,0,1,1
407 | 405,20,13.8625,0,2,0
408 | 406,23,10.5,0,2,1
409 | 407,50,211.5,1,1,1
410 | 408,24,7.7208,0,3,0
411 | 409,3,13.775,1,3,1
412 | 410,24,7.75,0,3,0
413 | 411,37,90,0,1,1
414 | 412,28,7.775,0,3,0
415 | 413,24,8.05,0,3,0
416 | 414,39,108.9,0,1,0
417 | 415,38.5,7.25,0,3,0
418 | 416,24,8.05,0,3,0
419 | 417,24,22.3583,1,3,1
420 |
--------------------------------------------------------------------------------
/tests/data_for_tests/train.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/data_for_tests/train.h5
--------------------------------------------------------------------------------
/tests/data_for_tests/train.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/data_for_tests/train.xls
--------------------------------------------------------------------------------
/tests/test_categorical_encoder.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.encoding.categorical_encoder module."""
7 | import pytest
8 | import pandas as pd
9 |
10 | from mlbox.encoding.categorical_encoder import Categorical_encoder
11 |
12 |
13 | def test_init_encoder():
14 | """Test init method of Categorical_encoder class."""
15 | encoder = Categorical_encoder()
16 | assert encoder.strategy == "label_encoding"
17 | assert not (encoder.verbose)
18 | assert encoder._Categorical_encoder__Lcat == []
19 | assert encoder._Categorical_encoder__Lnum == []
20 | assert encoder._Categorical_encoder__Enc == dict()
21 | assert encoder._Categorical_encoder__K == dict()
22 | assert not encoder._Categorical_encoder__weights
23 | assert not encoder._Categorical_encoder__fitOK
24 |
25 |
26 | def test_get_params_encoder():
27 | """Test get_params method of Categorical_encoder class."""
28 | encoder = Categorical_encoder()
29 | dict = {'strategy': "label_encoding",
30 | 'verbose': False}
31 | assert encoder.get_params() == dict
32 |
33 |
34 | def test_set_params_encoder():
35 | """Test set_params method of Categorical_encoder class."""
36 | encoder = Categorical_encoder()
37 | encoder.set_params(strategy="label_encoding")
38 | assert encoder.strategy == "label_encoding"
39 | encoder.set_params(strategy="dummification")
40 | assert encoder.strategy == "dummification"
41 | encoder.set_params(strategy="random_projection")
42 | assert encoder.strategy == "random_projection"
43 | encoder.set_params(strategy="entity_embedding")
44 | assert encoder.strategy == "entity_embedding"
45 | encoder.set_params(verbose=True)
46 | assert encoder.verbose
47 | encoder.set_params(verbose=False)
48 | assert not encoder.verbose
49 | with pytest.warns(UserWarning) as record:
50 | encoder.set_params(_Categorical_encoder__Lcat=[])
51 | assert len(record) == 1
52 |
53 |
54 | def test_fit_encoder():
55 | """Test method fit of Categorical_encoder class."""
56 | df = pd.read_csv("data_for_tests/train.csv")
57 | encoder = Categorical_encoder(strategy="wrong_strategy")
58 | with pytest.raises(ValueError):
59 | encoder.fit(df, df["Survived"])
60 | encoder.set_params(strategy="label_encoding")
61 | encoder.fit(df, df["Survived"])
62 | assert encoder._Categorical_encoder__fitOK
63 | encoder.set_params(strategy="dummification")
64 | encoder.fit(df, df["Survived"])
65 | assert encoder._Categorical_encoder__fitOK
66 | encoder.set_params(strategy="random_projection")
67 | encoder.fit(df, df["Survived"])
68 | assert encoder._Categorical_encoder__fitOK
69 | encoder.set_params(strategy="entity_embedding")
70 | encoder.fit(df, df["Survived"])
71 | assert encoder._Categorical_encoder__fitOK
72 |
73 |
74 | def test_transform_encoder():
75 | """Test transform method of Categorical_encoder class."""
76 | df = pd.read_csv("data_for_tests/train.csv")
77 | encoder = Categorical_encoder()
78 | with pytest.raises(ValueError):
79 | encoder.transform(df)
80 | encoder.fit(df, df["Survived"])
81 | df_encoded = encoder.transform(df)
82 | assert (df.columns == df_encoded.columns).all()
83 | encoder.set_params(strategy="dummification")
84 | encoder.fit(df, df["Survived"])
85 | df_encoded = encoder.transform(df)
86 | assert (type(df_encoded) == pd.SparseDataFrame) | (type(df_encoded) == pd.DataFrame)
87 | encoder.set_params(strategy="random_projection")
88 | encoder.fit(df, df["Survived"])
89 | df_encoded = encoder.transform(df)
90 | assert type(df_encoded) == pd.DataFrame
91 | encoder.set_params(strategy="entity_embedding")
92 | encoder.fit(df, df["Survived"])
93 | df_encoded = encoder.transform(df)
94 | assert type(df_encoded) == pd.DataFrame
95 |
96 |
--------------------------------------------------------------------------------
/tests/test_classification_feature_selector.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.model.classification.feature_selector module."""
7 | import pytest
8 | import pandas as pd
9 |
10 | from mlbox.model.classification.feature_selector import Clf_feature_selector
11 |
12 |
13 | def test_init_Clf_feature_selector():
14 | """Test init method of Clf_feature_selector class."""
15 | feature_selector = Clf_feature_selector()
16 | assert feature_selector.strategy == "l1"
17 | assert feature_selector.threshold == 0.3
18 | assert not feature_selector._Clf_feature_selector__fitOK
19 | assert feature_selector._Clf_feature_selector__to_discard == []
20 |
21 |
22 | def test_get_params_Clf_feature_selector():
23 | """Test get_params method of Clf_feature_selector class."""
24 | feature_selector = Clf_feature_selector()
25 | dict = {'strategy': "l1",
26 | 'threshold': 0.3}
27 | assert feature_selector.get_params() == dict
28 |
29 |
30 | def test_set_params_Clf_feature_selector():
31 | """Test set_params method of Clf_feature_selector class."""
32 | feature_selector = Clf_feature_selector()
33 | feature_selector.set_params(strategy="variance")
34 | assert feature_selector.strategy == "variance"
35 | feature_selector.set_params(threshold=0.2)
36 | assert feature_selector.threshold == 0.2
37 | with pytest.warns(UserWarning) as record:
38 | feature_selector.set_params(wrong_strategy="wrong_strategy")
39 | assert len(record) == 1
40 |
41 |
42 | def test_fit_Clf_feature_selector():
43 | """Test fit method of Clf_feature_selector class."""
44 | feature_selector = Clf_feature_selector()
45 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
46 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
47 | with pytest.raises(ValueError):
48 | feature_selector.fit(None, y_train)
49 | with pytest.raises(ValueError):
50 | feature_selector.fit(df_train, None)
51 | feature_selector.fit(df_train, y_train)
52 | assert feature_selector._Clf_feature_selector__fitOK
53 | feature_selector.set_params(strategy="variance")
54 | feature_selector.fit(df_train, y_train)
55 | assert feature_selector._Clf_feature_selector__fitOK
56 | feature_selector.set_params(strategy="rf_feature_importance")
57 | feature_selector.fit(df_train, y_train)
58 | assert feature_selector._Clf_feature_selector__fitOK
59 | feature_selector.set_params(strategy="wrond_strategy")
60 | with pytest.raises(ValueError):
61 | feature_selector.fit(df_train, y_train)
62 |
63 |
64 | def test_transform_Clf_feature_selector():
65 | """Test transform method of Clf_feature_selector class."""
66 | feature_selector = Clf_feature_selector(threshold=0)
67 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
68 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
69 | with pytest.raises(ValueError):
70 | feature_selector.transform(df_train)
71 | feature_selector.fit(df_train, y_train)
72 | with pytest.raises(ValueError):
73 | feature_selector.transform(None)
74 | df_transformed = feature_selector.transform(df_train)
75 | assert (df_transformed.columns == df_train.columns).all()
76 |
77 |
78 | def test_fit_transform_Clf_feature_selector():
79 | """Test fit_transform method of Clf_feature_selector class."""
80 | feature_selector = Clf_feature_selector(threshold=0)
81 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
82 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
83 | df_transformed = feature_selector.fit_transform(df_train, y_train)
84 | assert (df_transformed.columns == df_train.columns).all()
85 |
--------------------------------------------------------------------------------
/tests/test_classifier.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.model.classification.classifier module."""
7 | import pytest
8 | import pandas as pd
9 | import numpy as np
10 |
11 | from mlbox.model.classification.classifier import Classifier
12 | from lightgbm import LGBMClassifier
13 |
14 |
15 | def test_init_classifier():
16 | """Test init method of Classifier class."""
17 | classifier = Classifier()
18 | assert classifier._Classifier__strategy == "LightGBM"
19 | assert classifier._Classifier__classif_params == {}
20 | assert classifier._Classifier__classifier
21 | assert not classifier._Classifier__col
22 | assert not classifier._Classifier__fitOK
23 |
24 |
25 | def test_get_params_classifier():
26 | """Test get_params method of Classifier class."""
27 | classifier = Classifier()
28 | params = classifier.get_params()
29 | assert params == {'strategy': "LightGBM"}
30 | assert not classifier._Classifier__classif_params
31 |
32 |
33 | def test_set_params_classifier():
34 | """Test set_params method of Classifier class."""
35 | classifier = Classifier()
36 | classifier.set_params(strategy="LightGBM")
37 | assert classifier._Classifier__strategy == "LightGBM"
38 | classifier.set_params(strategy="RandomForest")
39 | assert classifier._Classifier__strategy == "RandomForest"
40 | classifier.set_params(strategy="ExtraTrees")
41 | assert classifier._Classifier__strategy == "ExtraTrees"
42 | classifier.set_params(strategy="RandomForest")
43 | assert classifier._Classifier__strategy == "RandomForest"
44 | classifier.set_params(strategy="Tree")
45 | assert classifier._Classifier__strategy == "Tree"
46 | classifier.set_params(strategy="AdaBoost")
47 | assert classifier._Classifier__strategy == "AdaBoost"
48 | classifier.set_params(strategy="Linear")
49 | assert classifier._Classifier__strategy == "Linear"
50 | with pytest.warns(UserWarning) as record:
51 | classifier.set_params(wrong_strategy="wrong_strategy")
52 | assert len(record) == 1
53 |
54 |
55 | def test_set_classifier():
56 | """Test set method of Classifier class."""
57 | classifier = Classifier()
58 | with pytest.raises(ValueError):
59 | classifier._Classifier__set_classifier("wrong_strategy")
60 |
61 |
62 | def test_fit_classifier():
63 | """Test fit method of Classifier class."""
64 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
65 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
66 | classifier = Classifier()
67 | classifier.fit(df_train, y_train)
68 | assert np.all(classifier._Classifier__col == df_train.columns)
69 | assert classifier._Classifier__fitOK
70 |
71 |
72 | def test_feature_importances_classifier():
73 | """Test feature_importances method of Classifier class."""
74 | classifier = Classifier()
75 | with pytest.raises(ValueError):
76 | classifier.feature_importances()
77 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
78 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
79 | classifier.set_params(strategy="LightGBM")
80 | classifier.fit(df_train, y_train)
81 | importance = classifier.feature_importances()
82 | assert importance != {}
83 | classifier.set_params(strategy="Linear")
84 | classifier.fit(df_train, y_train)
85 | importance = classifier.feature_importances()
86 | assert importance != {}
87 | classifier.set_params(strategy="RandomForest")
88 | classifier.fit(df_train, y_train)
89 | importance = classifier.feature_importances()
90 | assert importance != {}
91 | classifier.set_params(strategy="AdaBoost")
92 | classifier.fit(df_train, y_train)
93 | importance = classifier.feature_importances()
94 | assert importance != {}
95 | classifier.set_params(strategy="Bagging")
96 | classifier.fit(df_train, y_train)
97 | importance = classifier.feature_importances()
98 | assert importance != {}
99 |
100 |
101 | def test_predict_classifier():
102 | """Test predict method of Classifier class."""
103 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
104 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
105 | classifier = Classifier()
106 | with pytest.raises(ValueError):
107 | classifier.predict(df_train)
108 | classifier.fit(df_train, y_train)
109 | with pytest.raises(ValueError):
110 | classifier.predict(None)
111 | assert len(classifier.predict(df_train)) > 0
112 |
113 |
114 | def test_predict_log_proba_classifier():
115 | """Test predict_log_proba method of Classifier class."""
116 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
117 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
118 | classifier = Classifier(strategy="Linear")
119 | with pytest.raises(ValueError):
120 | classifier.predict_log_proba(df_train)
121 | classifier.fit(df_train, y_train)
122 | with pytest.raises(ValueError):
123 | classifier.predict_log_proba(None)
124 | assert len(classifier.predict_log_proba(df_train)) > 0
125 |
126 |
127 | def test_predict_proba_classifier():
128 | """Test predict_proba method of Classifier class."""
129 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
130 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
131 | classifier = Classifier()
132 | with pytest.raises(ValueError):
133 | classifier.predict_proba(df_train)
134 | classifier.fit(df_train, y_train)
135 | with pytest.raises(ValueError):
136 | classifier.predict_proba(None)
137 | assert len(classifier.predict_proba(df_train)) > 0
138 |
139 |
140 | def test_score_classifier():
141 | """Test score method of Classifier class."""
142 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
143 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
144 | classifier = Classifier()
145 | with pytest.raises(ValueError):
146 | classifier.score(df_train, y_train)
147 | classifier.fit(df_train, y_train)
148 | with pytest.raises(ValueError):
149 | classifier.score(None, y_train)
150 | with pytest.raises(ValueError):
151 | classifier.score(df_train, None)
152 | assert classifier.score(df_train, y_train) > 0
153 |
154 |
155 | def test_get_estimator_classifier():
156 | """Test get_estimator method of Classifier class."""
157 | classifier = Classifier()
158 | estimator = classifier.get_estimator()
159 | assert isinstance(estimator, type(LGBMClassifier()))
160 |
--------------------------------------------------------------------------------
/tests/test_drift_estimator.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.preprocessing.drift.drift_estimator module."""
7 | import pytest
8 | import pandas as pd
9 |
10 | from mlbox.preprocessing.drift.drift_estimator import DriftEstimator
11 |
12 |
13 | def test_init_drift_estimator():
14 | """Test init method of DriftEstimator class."""
15 | drift_estimator = DriftEstimator()
16 | assert drift_estimator.n_folds == 2
17 | assert drift_estimator.stratify
18 | assert drift_estimator.random_state == 1
19 | assert not drift_estimator._DriftEstimator__cv
20 | assert not drift_estimator._DriftEstimator__pred
21 | assert not drift_estimator._DriftEstimator__target
22 | assert not drift_estimator._DriftEstimator__fitOK
23 |
24 |
25 | def test_get_params_drift_estimator():
26 | """Test get_params method of DriftEstimator class."""
27 | drift_estimator = DriftEstimator()
28 | dict = {'estimator': drift_estimator.estimator,
29 | 'n_folds': 2,
30 | 'stratify': True,
31 | 'random_state': 1}
32 | assert drift_estimator.get_params() == dict
33 |
34 |
35 | def test_set_params_drift_estimator():
36 | """Test set_params method of DriftEstimator class."""
37 | drift_estimator = DriftEstimator()
38 | dict = {'estimator': drift_estimator.estimator,
39 | 'n_folds': 3,
40 | 'stratify': False,
41 | 'random_state': 2}
42 | drift_estimator.set_params(**dict)
43 | assert drift_estimator.get_params() == dict
44 |
45 |
46 | def test_fit_drift_estimator():
47 | """Test fit method of DriftEstimator class."""
48 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
49 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
50 | drift_estimator = DriftEstimator()
51 | drift_estimator.fit(df_train, df_test)
52 | assert drift_estimator._DriftEstimator__fitOK
53 |
54 |
55 | def test_score_drift_estimator():
56 | """Test score method of DriftEstimator class."""
57 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
58 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
59 | drift_estimator = DriftEstimator()
60 | with pytest.raises(ValueError):
61 | drift_estimator.score()
62 | drift_estimator.fit(df_train, df_test)
63 | assert drift_estimator.score() > 0
64 |
65 |
66 | def test_predict_drift_estimator():
67 | """Test predict method of DriftEstimator class."""
68 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
69 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
70 | drift_estimator = DriftEstimator()
71 | with pytest.raises(ValueError):
72 | drift_estimator.predict()
73 | drift_estimator.fit(df_train, df_test)
74 | results = drift_estimator.predict()
75 | assert len(results) == 1309
76 |
--------------------------------------------------------------------------------
/tests/test_drift_threshold.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.preprocessing.drift module."""
7 | import pytest
8 | import pandas as pd
9 |
10 | from mlbox.preprocessing.drift import DriftThreshold
11 | from mlbox.preprocessing.drift import sync_fit
12 | from sklearn.tree import DecisionTreeClassifier
13 | from sklearn.ensemble import RandomForestClassifier
14 |
15 |
16 | def test_init_drift_threshold():
17 | """Test init method of DriftThreshold class."""
18 | drift_threshold = DriftThreshold()
19 | assert drift_threshold.threshold == 0.6
20 | assert drift_threshold.subsample == 1.
21 | assert isinstance(drift_threshold.estimator,
22 | type(DecisionTreeClassifier()))
23 | assert drift_threshold.n_folds == 2
24 | assert drift_threshold.stratify
25 | assert drift_threshold.random_state == 1
26 | assert drift_threshold.n_jobs == -1
27 | assert not drift_threshold._DriftThreshold__fitOK
28 |
29 |
30 | def test_get_params_drift_threshold():
31 | """Test get_params method of DriftThreshold class."""
32 | drift_threshold = DriftThreshold()
33 | dict = {'threshold': 0.6,
34 | 'subsample': 1.,
35 | 'n_folds': 2,
36 | 'stratify': True,
37 | 'random_state': 1,
38 | 'n_jobs': -1}
39 | dict_get_params = drift_threshold.get_params()
40 | assert dict_get_params["threshold"] == dict["threshold"]
41 | assert dict_get_params["subsample"] == dict["subsample"]
42 | assert dict_get_params["n_folds"] == dict["n_folds"]
43 | assert dict_get_params["stratify"] == dict["stratify"]
44 | assert dict_get_params["random_state"] == dict["random_state"]
45 | assert dict_get_params["n_jobs"] == dict["n_jobs"]
46 |
47 |
48 | def test_set_params_drift_threshold():
49 | """Test set_params method of DriftThreshold class."""
50 | drift_threshold = DriftThreshold()
51 | dict = {'threshold': 0.6,
52 | 'subsample': 1.,
53 | 'estimator': DecisionTreeClassifier(max_depth=6),
54 | 'n_folds': 2,
55 | 'stratify': True,
56 | 'random_state': 1,
57 | 'n_jobs': -1}
58 | drift_threshold.set_params(**dict)
59 | dict_get_params = drift_threshold.get_params()
60 | assert dict_get_params["threshold"] == dict["threshold"]
61 | assert dict_get_params["subsample"] == dict["subsample"]
62 | assert dict_get_params["n_folds"] == dict["n_folds"]
63 | assert dict_get_params["stratify"] == dict["stratify"]
64 | assert dict_get_params["random_state"] == dict["random_state"]
65 | assert dict_get_params["n_jobs"] == dict["n_jobs"]
66 |
67 |
68 | def test_fit_drift_threshold():
69 | """Test fit method of DriftThreshold class."""
70 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
71 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
72 | drift_threshold = DriftThreshold()
73 | drift_threshold.fit(df_train, df_test)
74 | assert drift_threshold._DriftThreshold__fitOK
75 |
76 |
77 | def test_transform_drift_threshold():
78 | """Test transform method of DriftThreshold class."""
79 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
80 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
81 | drift_threshold = DriftThreshold()
82 | with pytest.raises(ValueError):
83 | drift_threshold.transform(df_train)
84 | drift_threshold.fit(df_train, df_test)
85 | df_transformed = drift_threshold.transform(df_train)
86 | assert (df_train.columns == df_transformed.columns).all()
87 |
88 |
89 | def test_get_support_drift_threshold():
90 | """Test get_support method of DriftThreshold class."""
91 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
92 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
93 | drift_threshold = DriftThreshold()
94 | with pytest.raises(ValueError):
95 | drift_threshold.get_support()
96 | drift_threshold.fit(df_train, df_test)
97 | keep_list = drift_threshold.get_support()
98 | drop_list = drift_threshold.get_support(complement=True)
99 | for name in ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']:
100 | assert (name in keep_list)
101 | assert not drop_list
102 |
103 |
104 | def test_drifts_drift_threshold():
105 | """Test drifts method of DriftThreshold class."""
106 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
107 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
108 | drift_threshold = DriftThreshold()
109 | with pytest.raises(ValueError):
110 | drift_threshold.drifts()
111 | drift_threshold.fit(df_train, df_test)
112 | drifts = drift_threshold.drifts()
113 | for name in ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']:
114 | assert (name in list(drifts.keys()))
115 |
116 |
117 | def test_sync_fit_drift_threshold():
118 | """Test method sync_fit of drift_threshold module."""
119 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
120 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
121 | estimator = RandomForestClassifier(n_estimators=50,
122 | n_jobs=-1,
123 | max_features=1.,
124 | min_samples_leaf=5,
125 | max_depth=5)
126 |
127 | score = sync_fit(df_train, df_test, estimator)
128 | assert 0 <= score
129 |
--------------------------------------------------------------------------------
/tests/test_drift_thresholder.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.preprocessing.drift_thresholder module."""
7 | import pytest
8 |
9 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder
10 | from mlbox.preprocessing.reader import Reader
11 |
12 |
13 | def test_init_drift_thresholder():
14 | """Test init method of Drift_thresholder class."""
15 | drift_thresholder = Drift_thresholder()
16 | assert drift_thresholder.threshold == 0.6
17 | assert not drift_thresholder.inplace
18 | assert drift_thresholder.verbose
19 | assert drift_thresholder.to_path == "save"
20 | assert drift_thresholder._Drift_thresholder__Ddrifts == {}
21 | assert not drift_thresholder._Drift_thresholder__fitOK
22 |
23 |
24 | def test_fit_transform():
25 | """Test fit transform method of Drift_thresholder class."""
26 | drift_thresholder = Drift_thresholder()
27 | reader = Reader(sep=",")
28 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"],
29 | target_name="Survived")
30 | drift_thresholder.fit_transform(dict)
31 | assert not drift_thresholder._Drift_thresholder__fitOK
32 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv",
33 | "data_for_tests/test.csv"],
34 | target_name="Survived")
35 | drift_thresholder.fit_transform(dict)
36 | assert drift_thresholder._Drift_thresholder__fitOK
37 | dict = reader.train_test_split(Lpath=["data_for_tests/inplace_train.csv",
38 | "data_for_tests/inplace_test.csv"],
39 | target_name="Survived")
40 | drift_thresholder.inplace = True
41 | drift_thresholder.fit_transform(dict)
42 | assert drift_thresholder._Drift_thresholder__fitOK
43 |
44 |
45 | def test_drifts():
46 | """Test drifts method of Drift_thresholder class."""
47 | drift_thresholder = Drift_thresholder()
48 | with pytest.raises(ValueError):
49 | drift_thresholder.drifts()
50 | reader = Reader(sep=",")
51 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv",
52 | "data_for_tests/test.csv"],
53 | target_name="Survived")
54 | drift_thresholder.fit_transform(dict)
55 | drifts = drift_thresholder.drifts()
56 | assert drifts != {}
57 |
--------------------------------------------------------------------------------
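
As a rough end-to-end sketch of the API exercised above, using the same fixture files and the default threshold of 0.6 asserted in test_init_drift_thresholder:

    from mlbox.preprocessing.reader import Reader
    from mlbox.preprocessing.drift_thresholder import Drift_thresholder

    reader = Reader(sep=",")
    data = reader.train_test_split(Lpath=["data_for_tests/train.csv",
                                          "data_for_tests/test.csv"],
                                   target_name="Survived")

    thresholder = Drift_thresholder()        # threshold=0.6 by default
    data = thresholder.fit_transform(data)   # drops high-drift features
    print(thresholder.drifts())              # per-feature drift scores
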
/tests/test_na_encoder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.encoding.na_encoder module."""
7 | import pytest
8 | import pandas as pd
9 |
10 | from mlbox.encoding.na_encoder import NA_encoder
11 |
12 |
13 | def test_init_NA_encoder():
14 | """Test init method of NA_encoder class."""
15 | encoder = NA_encoder()
16 | assert encoder.numerical_strategy == "mean"
17 | assert encoder.categorical_strategy == ""
18 | assert encoder._NA_encoder__Lcat == []
19 | assert encoder._NA_encoder__Lnum == []
20 | assert not encoder._NA_encoder__imp
21 | assert encoder._NA_encoder__mode == dict()
22 | assert not encoder._NA_encoder__fitOK
23 |
24 |
25 | def test_get_params_NA_encoder():
26 | """Test get_params method of NA_encoder class."""
27 | encoder = NA_encoder()
28 | dict = {'numerical_strategy': "mean",
29 | 'categorical_strategy': ""}
30 | assert encoder.get_params() == dict
31 |
32 |
33 | def test_set_params_NA_encoder():
34 | """Test set_params method of NA_encoder class."""
35 | encoder = NA_encoder()
36 |
37 | encoder.set_params(numerical_strategy="mean")
38 | assert encoder.numerical_strategy == "mean"
39 | encoder.set_params(numerical_strategy="median")
40 | assert encoder.numerical_strategy == "median"
41 | encoder.set_params(numerical_strategy="most_frequent")
42 | assert encoder.numerical_strategy == "most_frequent"
43 | encoder.set_params(numerical_strategy=3.0)
44 | assert encoder.numerical_strategy == 3.0
45 |
46 | encoder.set_params(categorical_strategy="")
47 | assert encoder.categorical_strategy == ""
48 | encoder.set_params(categorical_strategy="most_frequent")
49 | assert encoder.categorical_strategy == "most_frequent"
50 | encoder.set_params(categorical_strategy="string_test")
51 | assert encoder.categorical_strategy == "string_test"
52 |
53 | with pytest.warns(UserWarning) as record:
54 |         encoder.set_params(_NA_encoder__Lcat=[])
55 | assert len(record) == 1
56 |
57 |
58 | def test_fit_NA_encoder():
59 | """Test fit method of NA_encoder class."""
60 | df = pd.read_csv("data_for_tests/train.csv")
61 |
62 | encoder = NA_encoder(numerical_strategy="wrong_strategy")
63 | with pytest.raises(ValueError):
64 | encoder.fit(df, df["Survived"])
65 | encoder.set_params(numerical_strategy="mean")
66 | encoder.fit(df, df["Survived"])
67 | assert encoder._NA_encoder__fitOK
68 | encoder.set_params(numerical_strategy="median")
69 | encoder.fit(df, df["Survived"])
70 | assert encoder._NA_encoder__fitOK
71 | encoder.set_params(numerical_strategy="most_frequent")
72 | encoder.fit(df, df["Survived"])
73 | assert encoder._NA_encoder__fitOK
74 | encoder.set_params(numerical_strategy=3.0)
75 | encoder.fit(df, df["Survived"])
76 | assert encoder._NA_encoder__fitOK
77 |
78 | encoder = NA_encoder(categorical_strategy=2)
79 | with pytest.raises(ValueError):
80 | encoder.fit(df, df["Survived"])
81 | encoder.set_params(categorical_strategy="")
82 | encoder.fit(df, df["Survived"])
83 | assert encoder._NA_encoder__fitOK
84 | encoder.set_params(categorical_strategy="most_frequent")
85 | encoder.fit(df, df["Survived"])
86 |
87 |
88 | def test_transform_NA_encoder():
89 | """Test transform method of NA_encoder class."""
90 | df = pd.read_csv("data_for_tests/train.csv")
91 | encoder = NA_encoder()
92 | with pytest.raises(ValueError):
93 | encoder.transform(df)
94 | encoder.fit(df, df["Survived"])
95 | df_encoded = encoder.transform(df)
96 | assert (df.columns == df_encoded.columns).all()
97 |
--------------------------------------------------------------------------------
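
A minimal imputation sketch using only strategies the tests above exercise; the fixture path and target column are the same assumptions the tests make:

    import pandas as pd
    from mlbox.encoding.na_encoder import NA_encoder

    df = pd.read_csv("data_for_tests/train.csv")

    # Median for numerical NAs, most frequent level for categorical NAs.
    encoder = NA_encoder(numerical_strategy="median",
                         categorical_strategy="most_frequent")
    encoder.fit(df, df["Survived"])
    df_filled = encoder.transform(df)   # same columns, missing values imputed
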
/tests/test_optimiser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.optimisation.optimiser module."""
7 | import pytest
8 | import numpy as np
9 |
10 | from mlbox.optimisation.optimiser import Optimiser
11 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder
12 | from mlbox.preprocessing.reader import Reader
13 | from mlbox.optimisation import make_scorer
14 |
15 |
16 | def test_init_optimiser():
17 | """Test init method of Optimiser class."""
18 | with pytest.warns(UserWarning) as record:
19 | optimiser = Optimiser()
20 | assert len(record) == 1
21 | assert not optimiser.scoring
22 | assert optimiser.n_folds == 2
23 | assert optimiser.random_state == 1
24 | assert optimiser.to_path == "save"
25 | assert optimiser.verbose
26 |
27 |
28 | def test_get_params_optimiser():
29 | """Test get_params method of optimiser class."""
30 | with pytest.warns(UserWarning) as record:
31 | optimiser = Optimiser()
32 | assert len(record) == 1
33 | dict = {'scoring': None,
34 | 'n_folds': 2,
35 | 'random_state': 1,
36 | 'to_path': "save",
37 | 'verbose': True}
38 | assert optimiser.get_params() == dict
39 |
40 |
41 | def test_set_params_optimiser():
42 | """Test set_params method of Optimiser class."""
43 | with pytest.warns(UserWarning) as record:
44 | optimiser = Optimiser()
45 | assert len(record) == 1
46 | optimiser.set_params(scoring='accuracy')
47 | assert optimiser.scoring == 'accuracy'
48 | optimiser.set_params(n_folds=3)
49 | assert optimiser.n_folds == 3
50 | optimiser.set_params(random_state=2)
51 | assert optimiser.random_state == 2
52 | optimiser.set_params(to_path="name")
53 | assert optimiser.to_path == "name"
54 | optimiser.set_params(verbose=False)
55 | assert not optimiser.verbose
56 | with pytest.warns(UserWarning) as record:
57 | optimiser.set_params(wrong_key=3)
58 | assert len(record) == 1
59 |
60 |
61 | def test_evaluate_classification_optimiser():
62 | """Test evaluate method of Optimiser class for classication."""
63 | reader = Reader(sep=",")
64 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv",
65 | "data_for_tests/test.csv"],
66 | target_name="Survived")
67 | drift_thresholder = Drift_thresholder()
68 |     drift_thresholder.fit_transform(dict)
69 |
70 | with pytest.warns(UserWarning) as record:
71 | opt = Optimiser(scoring=None, n_folds=3)
72 | assert len(record) == 1
73 | score = opt.evaluate(None, dict)
74 | assert -np.Inf <= score
75 |
76 | with pytest.warns(UserWarning) as record:
77 | opt = Optimiser(scoring="roc_auc", n_folds=3)
78 | assert len(record) == 1
79 | score = opt.evaluate(None, dict)
80 | assert 0. <= score <= 1.
81 |
82 | with pytest.warns(UserWarning) as record:
83 | opt = Optimiser(scoring="wrong_scoring", n_folds=3)
84 | assert len(record) == 1
85 | with pytest.warns(UserWarning) as record:
86 | score = opt.evaluate(None, dict)
87 | assert opt.scoring == "neg_log_loss"
88 |
89 |
90 | def test_evaluate_regression_optimiser():
91 | """Test evaluate method of Optimiser class for regression."""
92 | reader = Reader(sep=",")
93 | dict = reader.train_test_split(Lpath=["data_for_tests/train_regression.csv",
94 | "data_for_tests/test_regression.csv"],
95 | target_name="SalePrice")
96 | drift_thresholder = Drift_thresholder()
97 |     drift_thresholder.fit_transform(dict)
98 |
99 |     mape = make_scorer(
100 |         lambda y_true, y_pred:
101 |         100 * np.sum(np.abs(y_true - y_pred) / y_true) / len(y_true),
102 |         greater_is_better=False,
103 |         needs_proba=False)
104 | 
105 | with pytest.warns(UserWarning) as record:
106 | opt = Optimiser(scoring=mape, n_folds=3)
107 | assert len(record) == 1
108 | score = opt.evaluate(None, dict)
109 | assert -np.Inf <= score
110 |
111 | with pytest.warns(UserWarning) as record:
112 | opt = Optimiser(scoring=None, n_folds=3)
113 | assert len(record) == 1
114 | score = opt.evaluate(None, dict)
115 | assert -np.Inf <= score
116 |
117 | with pytest.warns(UserWarning) as record:
118 | opt = Optimiser(scoring="wrong_scoring", n_folds=3)
119 | assert len(record) == 1
120 | with pytest.warns(UserWarning) as record:
121 | score = opt.evaluate(None, dict)
122 | assert -np.Inf <= score
123 |
124 |
125 | def test_evaluate_and_optimise_classification():
126 | """Test evaluate_and_optimise method of Optimiser class."""
127 | reader = Reader(sep=",")
128 |
129 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv",
130 | "data_for_tests/test.csv"],
131 | target_name="Survived")
132 | drift_thresholder = Drift_thresholder()
133 |     drift_thresholder.fit_transform(dict)
134 |
135 | with pytest.warns(UserWarning) as record:
136 | opt = Optimiser(scoring='accuracy', n_folds=3)
137 | assert len(record) == 1
138 | dict_error = dict.copy()
139 | dict_error["target"] = dict_error["target"].astype(str)
140 | with pytest.raises(ValueError):
141 | score = opt.evaluate(None, dict_error)
142 |
143 | with pytest.warns(UserWarning) as record:
144 | opt = Optimiser(scoring='accuracy', n_folds=3)
145 | assert len(record) == 1
146 | score = opt.evaluate(None, dict)
147 | assert 0. <= score <= 1.
148 |
149 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
150 | 'ce__strategy': {"search": "choice",
151 | "space": ["label_encoding"]},
152 | 'fs__threshold': {"search": "uniform",
153 | "space": [0.01, 0.3]},
154 | 'est__max_depth': {"search": "choice",
155 | "space": [3, 4, 5, 6, 7]}
156 |
157 | }
158 |
159 | best = opt.optimise(space, dict, 1)
160 | assert type(best) == type(dict)
161 |
--------------------------------------------------------------------------------
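
Pulling these tests together, a hedged sketch of a typical optimisation run. The mapping of the ne/ce/fs/est prefixes to the NA encoder, categorical encoder, feature selector and final estimator steps is inferred from the package layout, and the budget of 10 evaluations is arbitrary:

    from mlbox.preprocessing.reader import Reader
    from mlbox.preprocessing.drift_thresholder import Drift_thresholder
    from mlbox.optimisation.optimiser import Optimiser

    data = Reader(sep=",").train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    data = Drift_thresholder().fit_transform(data)

    opt = Optimiser(scoring="accuracy", n_folds=3)
    opt.evaluate(None, data)   # CV score of the default pipeline

    space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
             'ce__strategy': {"search": "choice", "space": ["label_encoding"]},
             'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
             'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]}}

    best = opt.optimise(space, data, 10)   # returns the best parameter dict
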
/tests/test_predictor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.prediction0.predictor module."""
7 | import sys
8 | import pytest
9 | import numpy as np
10 | import pandas as pd
11 |
12 | from mlbox.prediction.predictor import Predictor
13 | from mlbox.optimisation.optimiser import Optimiser
14 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder
15 | from mlbox.preprocessing.reader import Reader
16 | from mlbox.optimisation import make_scorer
17 |
18 | if sys.version_info[0] >= 3:
19 | from unittest.mock import patch
20 |
21 |
22 | set_backend = "import matplotlib\nmatplotlib.use('Agg')\n"
23 |
24 |
25 | def test_init_predictor():
26 | """Test init method of Predictor class."""
27 | predictor = Predictor()
28 | assert predictor.to_path == "save"
29 | assert predictor.verbose
30 |
31 |
32 | def test_get_params_predictor():
33 | """Test get_params method of Predictor class."""
34 | predictor = Predictor()
35 | dict = {'to_path': "save",
36 | 'verbose': True}
37 | assert predictor.get_params() == dict
38 |
39 |
40 | def test_set_params_predictor():
41 | """Test set_params method of Predictor class."""
42 | predictor = Predictor()
43 | predictor.set_params(to_path="name")
44 | assert predictor.to_path == "name"
45 | predictor.set_params(verbose=False)
46 | assert not predictor.verbose
47 | with pytest.warns(UserWarning) as record:
48 | predictor.set_params(wrong_key=3)
49 | assert len(record) == 1
50 |
51 |
52 | def test_fit_predict_predictor_classification():
53 | """Test fit_predict method of Predictor class for classification."""
54 | reader = Reader(sep=",")
55 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv",
56 | "data_for_tests/test.csv"],
57 | target_name="Survived")
58 | drift_thresholder = Drift_thresholder()
59 |     drift_thresholder.fit_transform(dict)
60 |
61 | with pytest.warns(UserWarning) as record:
62 | opt = Optimiser(scoring='accuracy', n_folds=3)
63 | assert len(record) == 1
64 |
65 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
66 | 'ce__strategy': {"search": "choice",
67 | "space": ["entity_embedding"]},
68 | 'fs__threshold': {"search": "uniform",
69 | "space": [0.01, 0.3]},
70 | 'est__max_depth': {"search": "choice",
71 | "space": [3, 4, 5, 6, 7]}
72 |
73 | }
74 |
75 | optimal_hyper_parameters = opt.optimise(space, dict, 1)
76 |
77 | predictor = Predictor(verbose=False)
78 | predictor.fit_predict(optimal_hyper_parameters, dict)
79 | pred_df = pd.read_csv("save/Survived_predictions.csv")
80 | assert np.all(list(pred_df.columns) == ['Unnamed: 0',
81 | '0.0',
82 | '1.0',
83 | 'Survived_predicted'])
84 | assert np.shape(pred_df) == (418, 4)
85 |
86 |
87 | if sys.version_info[0] >= 3:
88 | @patch('matplotlib.pyplot.show')
89 | def test_fit_predict_predictor_regression(mock_show):
90 | """Test fit_predict method of Predictor class for regression."""
91 | rd = Reader(sep=',')
92 | dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv",
93 | "data_for_tests/test_regression.csv"],
94 | target_name="SalePrice")
95 |
96 | drift_thresholder = Drift_thresholder()
97 | df = drift_thresholder.fit_transform(dt)
98 |
99 |         mape = make_scorer(
100 |             lambda y_true, y_pred:
101 |             100 * np.sum(np.abs(y_true - y_pred) / y_true) / len(y_true),
102 |             greater_is_better=False,
103 |             needs_proba=False)
104 | 
105 | opt = Optimiser(scoring=mape, n_folds=3)
106 |
107 | opt.evaluate(None, df)
108 |
109 | space = {
110 | 'ne__numerical_strategy': {"search": "choice",
111 | "space": [0]},
112 | 'ce__strategy': {"search": "choice",
113 | "space": ["random_projection"]},
114 | 'fs__threshold': {"search": "uniform",
115 | "space": [0.01, 0.3]},
116 | 'est__max_depth': {"search": "choice",
117 | "space": [3, 4, 5, 6, 7]}
118 |
119 | }
120 |
121 | best = opt.optimise(space, df, 1)
122 |
123 | prd = Predictor(verbose=True)
124 | prd.fit_predict(best, df)
125 | pred_df = pd.read_csv("save/SalePrice_predictions.csv")
126 | assert np.all(list(pred_df.columns) == ['Unnamed: 0',
127 | 'SalePrice_predicted'])
128 | assert np.shape(pred_df) == (1459, 2)
129 |
130 | else:
131 | def test_fit_predict_predictor_regression():
132 | """Test fit_predict method of Predictor class for regression."""
133 | rd = Reader(sep=',')
134 | dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv",
135 | "data_for_tests/test_regression.csv"],
136 | target_name="SalePrice")
137 |
138 | drift_thresholder = Drift_thresholder()
139 | df = drift_thresholder.fit_transform(dt)
140 |
141 |         mape = make_scorer(
142 |             lambda y_true, y_pred:
143 |             100 * np.sum(np.abs(y_true - y_pred) / y_true) / len(y_true),
144 |             greater_is_better=False,
145 |             needs_proba=False)
146 | 
147 | opt = Optimiser(scoring=mape, n_folds=3)
148 |
149 | opt.evaluate(None, df)
150 |
151 | space = {
152 | 'ne__numerical_strategy': {"search": "choice",
153 | "space": [0]},
154 | 'ce__strategy': {"search": "choice",
155 | "space": ["label_encoding",
156 | "random_projection",
157 | "entity_embedding"]},
158 | 'fs__threshold': {"search": "uniform",
159 | "space": [0.01, 0.3]},
160 | 'est__max_depth': {"search": "choice",
161 | "space": [3, 4, 5, 6, 7]}
162 |
163 | }
164 |
165 | best = opt.optimise(space, df, 1)
166 |
167 | prd = Predictor(verbose=False)
168 | prd.fit_predict(best, df)
169 | pred_df = pd.read_csv("save/SalePrice_predictions.csv")
170 | assert np.all(list(pred_df.columns) == ['Unnamed: 0',
171 | 'SalePrice_predicted'])
172 | assert np.shape(pred_df) == (1459, 2)
173 |
--------------------------------------------------------------------------------
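
A condensed classification pipeline mirroring test_fit_predict_predictor_classification; the single trial and the save/Survived_predictions.csv output location follow the defaults asserted above:

    from mlbox.preprocessing.reader import Reader
    from mlbox.preprocessing.drift_thresholder import Drift_thresholder
    from mlbox.optimisation.optimiser import Optimiser
    from mlbox.prediction.predictor import Predictor

    data = Reader(sep=",").train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    data = Drift_thresholder().fit_transform(data)

    opt = Optimiser(scoring="accuracy", n_folds=3)
    space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
             'ce__strategy': {"search": "choice", "space": ["label_encoding"]},
             'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
             'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]}}
    best = opt.optimise(space, data, 1)

    # Fits on the train part, then writes save/Survived_predictions.csv.
    Predictor(verbose=False).fit_predict(best, data)
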
/tests/test_reader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.preprocessing.reader module."""
7 | import sys
8 |
9 | import pytest
10 | import pandas as pd
11 | import numpy as np
12 |
13 |
14 | from mlbox.preprocessing.reader import convert_list
15 | from mlbox.preprocessing.reader import convert_float_and_dates
16 | from mlbox.preprocessing.reader import Reader
17 |
18 |
19 | def test_init_reader():
20 | """Test init method of Reader class."""
21 | reader = Reader()
22 | assert not reader.sep
23 | assert reader.header == 0
24 | assert not reader.to_hdf5
25 | assert reader.to_path == "save"
26 | assert reader.verbose
27 |
28 |
29 | def test_clean_reader():
30 | """Test clean method of Reader class."""
31 | reader = Reader()
32 | with pytest.raises(ValueError):
33 | reader.clean(path=None, drop_duplicate=False)
34 | with pytest.raises(ValueError):
35 | reader.clean(path="data_for_tests/train.csv")
36 | reader = Reader(sep=",")
37 | df = reader.clean(path="data_for_tests/train.csv")
38 | assert np.shape(df) == (891, 12)
39 | with pytest.raises(ValueError):
40 | reader.clean(path="data_for_tests/train.wrong_extension")
41 | df_drop = reader.clean(path="data_for_tests/train.csv",
42 | drop_duplicate=True)
43 | assert np.shape(df_drop) == (891, 12)
44 | assert np.all(df["Name"] == df_drop["Name"])
45 | reader = Reader()
46 | df_excel = reader.clean(path="data_for_tests/train.xls")
47 | assert np.shape(df_excel) == (891, 12)
48 | assert np.all(df["Name"] == df_excel["Name"])
49 |     if (sys.platform == "win32" and sys.version_info[0] <= 3 and sys.version_info[1] <= 5):
50 | pass
51 | else:
52 | if sys.version_info[0] >= 3:
53 | df_hdf = reader.clean(path="data_for_tests/train.h5")
54 | assert np.shape(df_hdf) == (891, 12)
55 | assert np.all(df["Name"] == df_hdf["Name"])
56 | df_json = reader.clean(path="data_for_tests/train.json")
57 | assert np.shape(df_json) == (891, 12)
58 |
59 |
60 | def test_train_test_split_reader():
61 | """Test train_test_split method of Reader class."""
62 | reader = Reader(sep=",")
63 | with pytest.raises(ValueError):
64 | reader.train_test_split(Lpath=None, target_name="target")
65 | with pytest.raises(ValueError):
66 | reader.train_test_split(Lpath=["data_for_tests/train.csv"],
67 | target_name=None)
68 | with pytest.raises(ValueError):
69 | reader = Reader(to_path=None)
70 | reader.train_test_split(Lpath=["data_for_tests/train.csv"],
71 | target_name="Survived")
72 | reader = Reader(sep=",")
73 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"],
74 | target_name="Survived")
75 | assert len(dict) == 3
76 | assert "train" in list(dict.keys())
77 | assert "test" in list(dict.keys())
78 | assert "target" in list(dict.keys())
79 | assert np.all(dict["train"].columns == dict["train"].columns)
80 | if (sys.version_info[0] >= 3 and sys.platform != "win32"):
81 | reader = Reader(to_hdf5=True)
82 | dict = reader.train_test_split(Lpath=["data_for_tests/train.h5"],
83 | target_name="Survived")
84 | assert len(dict) == 3
85 | assert "train" in list(dict.keys())
86 | assert "test" in list(dict.keys())
87 | assert "target" in list(dict.keys())
88 | assert np.all(dict["train"].columns == dict["train"].columns)
89 |
90 |
91 | def test_convert_list_reader():
92 | """Test convert_list function of reader module."""
93 | data_list = list()
94 | data_list.append([1, 2])
95 | data_list.append([3, 4])
96 | index = ['a', 'b']
97 | serie = pd.Series(data=data_list, index=index, name="test")
98 | df = convert_list(serie)
99 | assert np.all(df.index == serie.index)
100 | assert np.all(df.columns.values == ['test_item1', 'test_item2'])
101 |
102 |
103 | def test_convert_float_and_dates_reader():
104 | """Test convert_float_and_dates function of reader module."""
105 | index = ['a', 'b', 'c']
106 | values = [1, 2, 3]
107 | serie = pd.Series(data=values, index=index)
108 | serie = convert_float_and_dates(serie)
109 | assert serie.dtype == 'float64'
110 |
111 | index = ['a', 'b', 'c']
112 | values = np.array(['2007-07-13', '2006-01-13', '2010-08-13'],
113 | dtype='datetime64')
114 | serie = pd.Series(data=values,
115 | index=index,
116 | dtype='datetime64[ns]',
117 | name="test")
118 | df = convert_float_and_dates(serie)
119 | assert np.all(df.index == serie.index)
120 | assert np.all(df.columns.values == ['test_TIMESTAMP',
121 | 'test_YEAR',
122 | 'test_MONTH',
123 | 'test_DAY',
124 | 'test_DAYOFWEEK',
125 | 'test_HOUR'])
126 |
127 | index = ['a', 'b', 'c']
128 | values = np.array(['2007-07-13', '2006-01-13', '2010-08-13'])
129 | serie = pd.Series(data=values, index=index, name="test")
130 | df = convert_float_and_dates(serie)
131 | assert np.all(df.index == serie.index)
132 | assert np.all(df.columns.values == ['test_TIMESTAMP',
133 | 'test_YEAR',
134 | 'test_MONTH',
135 | 'test_DAY',
136 | 'test_DAYOFWEEK',
137 | 'test_HOUR'])
138 |
--------------------------------------------------------------------------------
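
A short sketch of the two Reader entry points covered above: clean for a single file, train_test_split for the dataset dict the rest of the pipeline consumes (paths are the test fixtures):

    from mlbox.preprocessing.reader import Reader

    reader = Reader(sep=",")

    # One file: parses, converts dtypes, optionally drops duplicates.
    df = reader.clean(path="data_for_tests/train.csv", drop_duplicate=True)

    # Whole dataset: returns {'train': ..., 'test': ..., 'target': ...}.
    data = reader.train_test_split(Lpath=["data_for_tests/train.csv",
                                          "data_for_tests/test.csv"],
                                   target_name="Survived")
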
/tests/test_regression_feature_selector.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.model.regression.feature_selector module."""
7 | 
8 | import pytest
9 | import pandas as pd
10 |
11 | from mlbox.model.regression.feature_selector import Reg_feature_selector
12 |
13 |
14 | def test_init_Reg_feature_selector():
15 | """Test init method of Reg_feature_selector class."""
16 | feature_selector = Reg_feature_selector()
17 | assert feature_selector.strategy == "l1"
18 | assert feature_selector.threshold == 0.3
19 | assert not feature_selector._Reg_feature_selector__fitOK
20 | assert feature_selector._Reg_feature_selector__to_discard == []
21 |
22 |
23 | def test_get_params_Reg_feature_selector():
24 | """Test get_params method of Reg_feature_selector class."""
25 | feature_selector = Reg_feature_selector()
26 | dict = {'strategy': "l1",
27 | 'threshold': 0.3}
28 | assert feature_selector.get_params() == dict
29 |
30 |
31 | def test_set_params_Reg_feature_selector():
32 | """Test set_params of method Reg_feature_selector class."""
33 | feature_selector = Reg_feature_selector()
34 | feature_selector.set_params(strategy="variance")
35 | assert feature_selector.strategy == "variance"
36 | feature_selector.set_params(threshold=0.2)
37 | assert feature_selector.threshold == 0.2
38 | with pytest.warns(UserWarning) as record:
39 | feature_selector.set_params(wrong_strategy="wrong_strategy")
40 | assert len(record) == 1
41 |
42 |
43 | def test_fit_Reg_feature_selector():
44 | """Test fit method of Reg_feature_selector class."""
45 | feature_selector = Reg_feature_selector()
46 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
47 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
48 | with pytest.raises(ValueError):
49 | feature_selector.fit(None, y_train)
50 | with pytest.raises(ValueError):
51 | feature_selector.fit(df_train, None)
52 | feature_selector.fit(df_train, y_train)
53 | assert feature_selector._Reg_feature_selector__fitOK
54 | feature_selector.set_params(strategy="variance")
55 | feature_selector.fit(df_train, y_train)
56 | assert feature_selector._Reg_feature_selector__fitOK
57 | feature_selector.set_params(strategy="rf_feature_importance")
58 | feature_selector.fit(df_train, y_train)
59 | assert feature_selector._Reg_feature_selector__fitOK
60 | feature_selector.set_params(strategy="wrond_strategy")
61 | with pytest.raises(ValueError):
62 | feature_selector.fit(df_train, y_train)
63 |
64 |
65 | def test_transform_Reg_feature_selector():
66 | """Test transform method of Reg_feature_selector class."""
67 | feature_selector = Reg_feature_selector(threshold=0)
68 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
69 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
70 | with pytest.raises(ValueError):
71 | feature_selector.transform(df_train)
72 | feature_selector.fit(df_train, y_train)
73 | with pytest.raises(ValueError):
74 | feature_selector.transform(None)
75 | df_transformed = feature_selector.transform(df_train)
76 | assert (df_transformed.columns == df_train.columns).all()
77 |
78 |
79 | def test_fit_transform_Reg_feature_selector():
80 | """Test fit_transform method of Reg_feature_selector class."""
81 | feature_selector = Reg_feature_selector(threshold=0)
82 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
83 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
84 | df_transformed = feature_selector.fit_transform(df_train, y_train)
85 | assert (df_transformed.columns == df_train.columns).all()
86 |
--------------------------------------------------------------------------------
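
A brief sketch of the selector as the tests use it. The tests pin down the threshold semantics only indirectly (threshold=0 keeps every column), so the comment on threshold below is an inference:

    import pandas as pd
    from mlbox.model.regression.feature_selector import Reg_feature_selector

    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)

    # Strategies seen in the tests: "l1" (default), "variance",
    # "rf_feature_importance". threshold appears to be the share of
    # lowest-ranked features to discard (0 keeps everything).
    selector = Reg_feature_selector(strategy="rf_feature_importance",
                                    threshold=0.2)
    df_selected = selector.fit_transform(df_train, y_train)
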
/tests/test_regressor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.model.regression.regressor module."""
7 | 
8 | import pytest
9 | import pandas as pd
10 | import numpy as np
11 |
12 | from mlbox.model.regression.regressor import Regressor
13 | from lightgbm import LGBMRegressor
14 |
15 |
16 | def test_init_regressor():
17 | """Test init method of Regressor class."""
18 | regressor = Regressor()
19 | assert regressor._Regressor__strategy == "LightGBM"
20 | assert regressor._Regressor__regress_params == {}
21 | assert regressor._Regressor__regressor
22 | assert not regressor._Regressor__col
23 | assert not regressor._Regressor__fitOK
24 |
25 |
26 | def test_get_params_regressor():
27 | """Test get_params method of Regressor class."""
28 | regressor = Regressor()
29 | params = regressor.get_params()
30 | assert params == {'strategy': "LightGBM"}
31 | assert not regressor._Regressor__regress_params
32 |
33 |
34 | def test_set_params_regressor():
35 | """Test set_params method of Regressor class."""
36 | regressor = Regressor()
37 | regressor.set_params(strategy="LightGBM")
38 | assert regressor._Regressor__strategy == "LightGBM"
39 | regressor.set_params(strategy="RandomForest")
40 | assert regressor._Regressor__strategy == "RandomForest"
41 | regressor.set_params(strategy="ExtraTrees")
42 | assert regressor._Regressor__strategy == "ExtraTrees"
43 | regressor.set_params(strategy="RandomForest")
44 | assert regressor._Regressor__strategy == "RandomForest"
45 | regressor.set_params(strategy="Tree")
46 | assert regressor._Regressor__strategy == "Tree"
47 | regressor.set_params(strategy="AdaBoost")
48 | assert regressor._Regressor__strategy == "AdaBoost"
49 | regressor.set_params(strategy="Linear")
50 | assert regressor._Regressor__strategy == "Linear"
51 | regressor.set_params(strategy="Bagging")
52 | assert regressor._Regressor__strategy == "Bagging"
53 | with pytest.warns(UserWarning) as record:
54 | regressor.set_params(wrong_strategy="wrong_strategy")
55 | assert len(record) == 1
56 |
57 |
58 | def test_set_regressor():
59 | """Test set method of Regressor class."""
60 | regressor = Regressor()
61 | with pytest.raises(ValueError):
62 | regressor._Regressor__set_regressor("wrong_strategy")
63 |
64 |
65 | def test_fit_regressor():
66 | """Test fit method of Regressor class."""
67 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
68 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
69 | regressor = Regressor()
70 | regressor.fit(df_train, y_train)
71 | assert np.all(regressor._Regressor__col == df_train.columns)
72 | assert regressor._Regressor__fitOK
73 |
74 |
75 | def test_feature_importances_regressor():
76 | """Test feature_importances of Regressor class."""
77 | regressor = Regressor()
78 | with pytest.raises(ValueError):
79 | regressor.feature_importances()
80 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
81 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
82 | regressor.set_params(strategy="LightGBM")
83 | regressor.fit(df_train, y_train)
84 | importance = regressor.feature_importances()
85 | assert importance != {}
86 | regressor.set_params(strategy="Linear")
87 | regressor.fit(df_train, y_train)
88 | importance = regressor.feature_importances()
89 | assert importance != {}
90 | regressor.set_params(strategy="RandomForest")
91 | regressor.fit(df_train, y_train)
92 | importance = regressor.feature_importances()
93 | assert importance != {}
94 | regressor.set_params(strategy="AdaBoost")
95 | regressor.fit(df_train, y_train)
96 | importance = regressor.feature_importances()
97 | assert importance != {}
98 | regressor.set_params(strategy="Bagging")
99 | regressor.fit(df_train, y_train)
100 | importance = regressor.feature_importances()
101 | assert importance != {}
102 |
103 |
104 | def test_predict_regressor():
105 | """Test predict method of Regressor class."""
106 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
107 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
108 | regressor = Regressor()
109 | with pytest.raises(ValueError):
110 | regressor.predict(df_train)
111 | regressor.fit(df_train, y_train)
112 | with pytest.raises(ValueError):
113 | regressor.predict(None)
114 | assert len(regressor.predict(df_train)) > 0
115 |
116 |
117 | def test_score_regressor():
118 | """Test_score method of Regressor class."""
119 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
120 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
121 | regressor = Regressor(strategy="Linear")
122 | with pytest.raises(ValueError):
123 | regressor.score(df_train, y_train)
124 | regressor.fit(df_train, y_train)
125 | with pytest.raises(ValueError):
126 | regressor.score(None, y_train)
127 | with pytest.raises(ValueError):
128 | regressor.score(df_train, None)
129 | assert regressor.score(df_train, y_train) > 0
130 |
131 |
132 | def test_get_estimator_regressor():
133 | """Test get_estimator of Regressor class."""
134 | regressor = Regressor()
135 | estimator = regressor.get_estimator()
136 | assert isinstance(estimator, type(LGBMRegressor()))
137 |
--------------------------------------------------------------------------------
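
A compact sketch of the Regressor wrapper; the strategy names all come from test_set_params_regressor, and the score assertion above only guarantees a positive value on training data:

    import pandas as pd
    from mlbox.model.regression.regressor import Regressor

    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)

    reg = Regressor(strategy="LightGBM")   # or RandomForest, ExtraTrees, Tree,
                                           # AdaBoost, Linear, Bagging
    reg.fit(df_train, y_train)
    print(reg.feature_importances())       # {feature: importance, ...}
    predictions = reg.predict(df_train)
    print(reg.score(df_train, y_train))
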
/tests/test_stacking_classifer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.model.classification.stacking_classifier module."""
7 | import pytest
8 | import pandas as pd
9 | import numpy as np
10 |
11 | from sklearn.linear_model import LogisticRegression
12 | from mlbox.model.classification.stacking_classifier import StackingClassifier
13 |
14 |
15 | def test_init_stacking_classifier():
16 | """Test init method of StackingClassifier class."""
17 | with pytest.raises(ValueError):
18 | stacking_classifier = StackingClassifier(base_estimators=dict())
19 | with pytest.raises(ValueError):
20 | stacking_classifier = StackingClassifier(n_folds=dict())
21 | with pytest.raises(ValueError):
22 | stacking_classifier = StackingClassifier(copy="True")
23 | with pytest.raises(ValueError):
24 | stacking_classifier = StackingClassifier(drop_first="True")
25 | with pytest.raises(ValueError):
26 | stacking_classifier = StackingClassifier(random_state="1")
27 | with pytest.raises(ValueError):
28 | stacking_classifier = StackingClassifier(verbose="True")
29 | stacking_classifier = StackingClassifier()
30 | assert len(stacking_classifier.base_estimators) == 3
31 | assert isinstance(stacking_classifier.level_estimator,
32 | type(LogisticRegression()))
33 | assert stacking_classifier.n_folds == 5
34 | assert not stacking_classifier.copy
35 | assert stacking_classifier.drop_first
36 | assert stacking_classifier.random_state == 1
37 | assert stacking_classifier.verbose
38 | assert not stacking_classifier._StackingClassifier__fitOK
39 | assert not stacking_classifier._StackingClassifier__fittransformOK
40 |
41 |
42 | def test_get_params_stacking_classifier():
43 | """Test get_params method StackingClassifier class."""
44 | stacking_classifier = StackingClassifier()
45 | dict = stacking_classifier.get_params()
46 | assert len(dict["base_estimators"]) == 3
47 | assert isinstance(dict["level_estimator"],
48 | type(LogisticRegression()))
49 | assert dict["n_folds"] == 5
50 | assert not dict["copy"]
51 | assert dict["drop_first"]
52 | assert dict["random_state"] == 1
53 | assert dict["verbose"]
54 |
55 |
56 | def test_set_params_stacking_classifier():
57 | """Test set_params method of StackingClassifier class."""
58 | stacking_classifier = StackingClassifier()
59 | stacking_classifier.set_params(n_folds=6)
60 | assert stacking_classifier.n_folds == 6
61 | stacking_classifier.set_params(copy=True)
62 | assert stacking_classifier.copy
63 | stacking_classifier.set_params(drop_first=False)
64 | assert not stacking_classifier.drop_first
65 | stacking_classifier.set_params(random_state=2)
66 | assert stacking_classifier.random_state == 2
67 | stacking_classifier.set_params(verbose=False)
68 | assert not stacking_classifier.verbose
69 | with pytest.warns(UserWarning) as record:
70 | stacking_classifier.set_params(wrong_parameters=None)
71 | assert len(record) == 1
72 |
73 |
74 | def test_fit_transform_stacking_classifier():
75 | """Test fit_transform method of StackingClassifier class."""
76 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
77 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
78 | stacking_classifier = StackingClassifier()
79 | with pytest.raises(ValueError):
80 | stacking_classifier.fit_transform(None, y_train)
81 | with pytest.raises(ValueError):
82 | stacking_classifier.fit_transform(df_train, None)
83 | stacking_classifier.fit_transform(df_train, y_train)
84 | assert stacking_classifier._StackingClassifier__fittransformOK
85 |
86 |
87 | def test_transform_stacking_classifier():
88 | """Test transform method of StackingClassifier class."""
89 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
90 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
91 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
92 | stacking_classifier = StackingClassifier()
93 | with pytest.raises(ValueError):
94 | stacking_classifier.transform(None)
95 | with pytest.raises(ValueError):
96 | stacking_classifier.transform(df_test)
97 | stacking_classifier.fit_transform(df_train, y_train)
98 | results = stacking_classifier.transform(df_test)
99 |     assert len(results.columns) == 3
100 |
101 |
102 | def test_fit_stacking_classifier():
103 | """Test fit method of StackingClassifier class."""
104 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
105 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
106 | stacking_classifier = StackingClassifier(verbose=True)
107 | stacking_classifier.fit(df_train, y_train)
108 | assert stacking_classifier._StackingClassifier__fitOK
109 |
110 |
111 | def test_predict_proba_stacking_classifier():
112 | """Test predict_proba method of StackingClassifier class."""
113 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
114 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
115 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
116 | stacking_classifier = StackingClassifier()
117 | with pytest.raises(ValueError):
118 | stacking_classifier.predict_proba(df_test)
119 | stacking_classifier.fit(df_train, y_train)
120 | results = stacking_classifier.predict_proba(df_test)
121 | assert np.shape(results) == (418, 2)
122 |
123 |
124 | def test_predict_stacking_classifier():
125 | """Test predict method of StackingClassifier class."""
126 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
127 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
128 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
129 | stacking_classifier = StackingClassifier()
130 | with pytest.raises(ValueError):
131 | stacking_classifier.predict(df_test)
132 | stacking_classifier.fit(df_train, y_train)
133 | results = stacking_classifier.predict(df_test)
134 | assert np.shape(results) == (418,)
135 |
--------------------------------------------------------------------------------
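
A minimal sketch of the stacking classifier as fitted in the tests (three base estimators and a LogisticRegression meta-model by default):

    import pandas as pd
    from mlbox.model.classification.stacking_classifier import StackingClassifier

    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
    df_test = pd.read_csv("data_for_tests/clean_test.csv")

    stack = StackingClassifier(n_folds=5, verbose=False)
    stack.fit(df_train, y_train)            # base estimators + meta-model
    probas = stack.predict_proba(df_test)   # shape (n_samples, n_classes)
    labels = stack.predict(df_test)         # shape (n_samples,)
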
/tests/test_stacking_regressor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Axel ARONIO DE ROMBLAY
4 | # Author: Henri GERARD
5 | # License: BSD 3 clause
6 | """Test mlbox.model.regression.stacking_regressor module."""
7 | import pytest
8 | import pandas as pd
9 | import numpy as np
10 |
11 | from sklearn.linear_model import LinearRegression
12 | from mlbox.model.regression.stacking_regressor import StackingRegressor
13 |
14 |
15 | def test_init_stacking_regressor():
16 | """Test init method of StackingRegressor class."""
17 | with pytest.raises(ValueError):
18 | stacking_regressor = StackingRegressor(base_estimators=dict())
19 | with pytest.raises(ValueError):
20 | stacking_regressor = StackingRegressor(n_folds=dict())
21 | with pytest.raises(ValueError):
22 | stacking_regressor = StackingRegressor(copy="True")
23 | with pytest.raises(ValueError):
24 | stacking_regressor = StackingRegressor(random_state="1")
25 | with pytest.raises(ValueError):
26 | stacking_regressor = StackingRegressor(verbose="True")
27 | stacking_regressor = StackingRegressor()
28 | assert len(stacking_regressor.base_estimators) == 3
29 | assert isinstance(stacking_regressor.level_estimator,
30 | type(LinearRegression()))
31 | assert stacking_regressor.n_folds == 5
32 | assert not stacking_regressor.copy
33 | assert stacking_regressor.random_state == 1
34 | assert stacking_regressor.verbose
35 | assert not stacking_regressor._StackingRegressor__fitOK
36 | assert not stacking_regressor._StackingRegressor__fittransformOK
37 |
38 |
39 | def test_get_params_stacking_regressor():
40 | """Test get_params method of StackingRegressor class."""
41 | stacking_regressor = StackingRegressor()
42 | dict = stacking_regressor.get_params()
43 | assert len(dict["base_estimators"]) == 3
44 | assert isinstance(dict["level_estimator"],
45 | type(LinearRegression()))
46 | assert dict["n_folds"] == 5
47 | assert not dict["copy"]
48 | assert dict["random_state"] == 1
49 | assert dict["verbose"]
50 |
51 |
52 | def test_set_params_stacking_regressor():
53 | """Test set_params method of StackingRegressor class."""
54 | stacking_regressor = StackingRegressor()
55 | stacking_regressor.set_params(n_folds=6)
56 | assert stacking_regressor.n_folds == 6
57 | stacking_regressor.set_params(copy=True)
58 | assert stacking_regressor.copy
59 | stacking_regressor.set_params(random_state=2)
60 | assert stacking_regressor.random_state == 2
61 | stacking_regressor.set_params(verbose=False)
62 | assert not stacking_regressor.verbose
63 | with pytest.warns(UserWarning) as record:
64 | stacking_regressor.set_params(wrong_parameters=None)
65 | assert len(record) == 1
66 |
67 |
68 | def test_fit_transform_stacking_regressor():
69 | """Test fit_transform method of Stacking regressor class."""
70 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
71 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
72 | stacking_regressor = StackingRegressor()
73 | with pytest.raises(ValueError):
74 | stacking_regressor.fit_transform(None, y_train)
75 | with pytest.raises(ValueError):
76 | stacking_regressor.fit_transform(df_train, None)
77 | stacking_regressor.fit_transform(df_train, y_train)
78 | assert stacking_regressor._StackingRegressor__fittransformOK
79 |
80 |
81 | def test_transform_stacking_regressor():
82 | """Test transform method of StackingRegressor class."""
83 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
84 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
85 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
86 | stacking_regressor = StackingRegressor()
87 | with pytest.raises(ValueError):
88 | stacking_regressor.transform(None)
89 | with pytest.raises(ValueError):
90 | stacking_regressor.transform(df_test)
91 | stacking_regressor.fit_transform(df_train, y_train)
92 | results = stacking_regressor.transform(df_test)
93 |     assert len(results.columns) == 3
94 |
95 |
96 | def test_fit_stacking_regressor():
97 | """Test fit method of StackingRegressor class."""
98 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
99 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
100 | stacking_regressor = StackingRegressor(verbose=True)
101 | stacking_regressor.fit(df_train, y_train)
102 | assert stacking_regressor._StackingRegressor__fitOK
103 |
104 |
105 | def test_predict_stacking_regressor():
106 | """Test predict method of StackingRegressor class."""
107 | df_train = pd.read_csv("data_for_tests/clean_train.csv")
108 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
109 | df_test = pd.read_csv("data_for_tests/clean_test.csv")
110 | stacking_regressor = StackingRegressor()
111 | with pytest.raises(ValueError):
112 | stacking_regressor.predict(df_test)
113 | stacking_regressor.fit(df_train, y_train)
114 | results = stacking_regressor.predict(df_test)
115 | assert np.shape(results) == (418,)
116 |
--------------------------------------------------------------------------------
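
And the regression counterpart. The tests show fit_transform/transform producing one column per base estimator (three by default), which reads like out-of-fold meta-features, though that interpretation is an inference:

    import pandas as pd
    from mlbox.model.regression.stacking_regressor import StackingRegressor

    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
    df_test = pd.read_csv("data_for_tests/clean_test.csv")

    stack = StackingRegressor(n_folds=5, verbose=False)

    # Meta-features: one column per base estimator.
    meta_train = stack.fit_transform(df_train, y_train)
    meta_test = stack.transform(df_test)

    # Or fit the whole stack and predict directly.
    stack.fit(df_train, y_train)
    predictions = stack.predict(df_test)
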