├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── VERSION.txt
├── codecov.yml
├── docs
│   ├── Makefile
│   ├── authors.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── features.rst
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── introduction.rst
│   ├── logos
│   │   ├── logo.png
│   │   ├── small_logo.ico
│   │   └── small_logo.png
│   ├── make.bat
│   └── webinars
│       ├── auto-ML.pdf
│       └── features.pdf
├── examples
│   ├── classification
│   │   ├── classification.py
│   │   ├── example.ipynb
│   │   ├── test_classification.csv
│   │   └── train_classification.csv
│   └── regression
│       ├── example.ipynb
│       ├── regression.py
│       ├── test_regression.csv
│       └── train_regression.csv
├── mlbox
│   ├── __init__.py
│   ├── encoding
│   │   ├── __init__.py
│   │   ├── categorical_encoder.py
│   │   └── na_encoder.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   ├── classifier.py
│   │   │   ├── feature_selector.py
│   │   │   └── stacking_classifier.py
│   │   └── regression
│   │       ├── __init__.py
│   │       ├── feature_selector.py
│   │       ├── regressor.py
│   │       └── stacking_regressor.py
│   ├── optimisation
│   │   ├── __init__.py
│   │   └── optimiser.py
│   ├── prediction
│   │   ├── __init__.py
│   │   └── predictor.py
│   └── preprocessing
│       ├── __init__.py
│       ├── drift
│       │   ├── __init__.py
│       │   ├── drift_estimator.py
│       │   └── drift_threshold.py
│       ├── drift_thresholder.py
│       └── reader.py
├── requirements.txt
├── setup.py
└── tests
    ├── .DS_Store
    ├── __init__.py
    ├── data_for_tests
    │   ├── clean_target.csv
    │   ├── clean_test.csv
    │   ├── clean_train.csv
    │   ├── inplace_test.csv
    │   ├── inplace_train.csv
    │   ├── test.csv
    │   ├── test_regression.csv
    │   ├── train.csv
    │   ├── train.h5
    │   ├── train.json
    │   ├── train.xls
    │   └── train_regression.csv
    ├── test_categorical_encoder.py
    ├── test_classification_feature_selector.py
    ├── test_classifier.py
    ├── test_drift_estimator.py
    ├── test_drift_threshold.py
    ├── test_drift_thresholder.py
    ├── test_na_encoder.py
    ├── test_optimiser.py
    ├── test_predictor.py
    ├── test_reader.py
    ├── test_regression_feature_selector.py
    ├── test_regressor.py
    ├── test_stacking_classifer.py
    └── test_stacking_regressor.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.pytest_cache/
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pycharm
.idea
.DS_Store

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# save folders
*save/

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
matrix:
  include:
    - os: linux
      python: '3.5'
    - os: linux
      python: '3.6'
    - os: linux
      python: '3.7'
    - os: osx
      language: generic
      python: '3.5'
      before_install:
        - brew install libomp
        - brew upgrade pyenv
        - brew install pyenv-virtualenv
        - pyenv install 3.5.6
        - eval "$(pyenv init -)"
        - pyenv virtualenv 3.5.6 venv
        - pyenv activate venv
    - os: osx
      language: generic
      python: '3.6'
      before_install:
        - brew install libomp
        - brew upgrade pyenv
        - brew install pyenv-virtualenv
        - pyenv install 3.6.7
        - eval "$(pyenv init -)"
        - pyenv virtualenv 3.6.7 venv
        - pyenv activate venv
    - os: osx
      language: generic
      python: '3.7'
      before_install:
        - brew install libomp
        - brew upgrade pyenv
        - brew install pyenv-virtualenv
        - pyenv install 3.7.2
        - eval "$(pyenv init -)"
        - pyenv virtualenv 3.7.2 venv
        - pyenv activate venv
    - os: windows
      language: sh
      python: '3.5'
      before_install:
        - choco install python --version 3.5.4
        - export PATH="/c/Python35:/c/Python35/Scripts:$PATH"
    - os: windows
      language: sh
      python: '3.6'
      before_install:
        - choco install python --version 3.6.7
        - export PATH="/c/Python36:/c/Python36/Scripts:$PATH"
    - os: windows
      language: sh
      python: '3.7'
      before_install:
        - choco install python --version 3.7.2
        - export PATH="/c/Python37:/c/Python37/Scripts:$PATH"
install:
  - pip install coverage
  - pip install codecov
  - pip install -U pytest
  - pip install --upgrade setuptools wheel
script:
  - python setup.py install
  - cd tests
  - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.7" ] ; then
    coverage run -m --source=../mlbox/ pytest; fi
  - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.7" ] ; then
    pytest; fi
  - if [ "$TRAVIS_OS_NAME" = "osx" ] ; then pytest; fi
  - if [ "$TRAVIS_OS_NAME" = "windows" ] ; then pytest; fi
  - cd ..
after_success:
  - codecov
deploy:
  provider: pypi
  user: AxeldeRomblay
  password:
    secure: l4S5cjkkjhj82j3Tq51/zkBEkjOfSl9xaISu9rmcQNQUbsqp1qrLiKmcMVm0mirNezhTnNdeeCWRyeuvXBNpbRq37KKM6NGScmbAPdCKZeDw6/wDOwjzaMpsnzynq7EiowrgrawwffTa1kP6dgzkG4U/ftjd1jNdNMmOz5MyMnkS2cVv2Uy0o/g7MPQ1hIVAGpoLtnjJ+iGZrQrCWGOr9zp6k003T0xGlS9oEPLM1yid1s1Aeeq8p8Jaee2gGbhpOZ8fySHPcBX2e7TThgoqwfN/wvDzBwko5VPHTaWiVa9FW4zirwyE9EK8LmjAuodF63QOBujO5YTCf1ja5iC5czxZrjNsZCznXmsVqZlyetF2aMofDk++0T0zCmXpMRjivmLV0O/ZSl/HDkMua1TdPuink+FKdGrwCH/IzyeAfT95yVisiRpmgNAhn8/IW/U8v87voquy+YoVL6egSjoB5EyEnzSoojK7qyRPCPmFmKcJHK3aoT3yocwgOSgClqX1gbrYrXAKkXR8lPp7VlZdNKIbKQLu6TILAOVILsAU2MFJbomMAREL/kM9tB3jOj34gKl0qghMOM10BUnWZ3L+MrNamm/0nrnFhlsI8OIVB47ahOnhVZsLk1H2LGZDwBvJTv2gzEG0mUaQaA45/dxJWvR9IZpObEu6T/U/e+uKI+g=
  skip_existing: true
  skip_cleanup: true
  on:
    condition: $TRAVIS_OS_NAME != "windows"
    repo: AxeldeRomblay/MLBox
    branch: master

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD-3 License
Copyright (c) 2017, Axel ARONIO DE ROMBLAY
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
* Neither the name of MLBox nor the names of its contributors may be used
  to endorse or promote products derived from this software without specific
  prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL AXEL ARONIO DE ROMBLAY BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.md
include *.rst
include *.txt

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts.

clean-build: ## remove build artifacts
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

clean-test: ## remove test and coverage artifacts
	cd tests/; \
	rm -fr .tox/; \
	rm -f .coverage; \
	rm -fr htmlcov/

test: ## run tests quickly with the default Python
	cd tests/; \
	pytest

coverage: ## check code coverage quickly with the default Python
	cd tests/; \
	coverage run -m --source=../mlbox/ pytest;\
	coverage html;\
	$(BROWSER) htmlcov/index.html

docs: ## generate Sphinx HTML documentation, including API docs
	rm -f docs/mlbox.rst
	rm -f docs/modules.rst
	sphinx-apidoc -o docs/ mlbox
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	$(BROWSER) docs/_build/html/index.html

release: ## package and upload a release
	python setup.py sdist upload
	python setup.py bdist_wheel upload

dist: ## build source and wheel packages
	python setup.py sdist
	python setup.py bdist_wheel
	ls -l dist

install: ## install the package to the active Python's site-packages
	python setup.py install

develop: ## install the package to the active Python's site-packages in developer mode
	python setup.py develop

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
.. image:: docs/logos/logo.png

|Documentation Status| |PyPI version| |Build Status| |GitHub Issues| |codecov| |License| |Downloads| |Python Versions|

-----------------------

**MLBox is a powerful Automated Machine Learning Python library.** It provides the following features:


* Fast reading and distributed data preprocessing/cleaning/formatting
* Highly robust feature selection and leak detection
* Accurate hyper-parameter optimization in high-dimensional space
* State-of-the-art predictive models for classification and regression (Deep Learning, Stacking, LightGBM, ...)
* Prediction with model interpretation


**For more details**, please refer to the `official documentation `__
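**Quick start:** here is a minimal end-to-end sketch, adapted from the snippets in the official documentation. The file paths and target name below are placeholders to replace with your own; ``None`` selects the default pipeline configuration.

.. code-block:: python

    from mlbox.preprocessing import *
    from mlbox.optimisation import *
    from mlbox.prediction import *

    paths = ["train.csv", "test.csv"]  # placeholder paths to your train/test files
    target_name = "target"             # placeholder name of the column to predict

    data = Reader(sep=",").train_test_split(paths, target_name)  # reading & preprocessing
    data = Drift_thresholder().fit_transform(data)               # removing non-stable features
    Optimiser().evaluate(None, data)                             # evaluating the default pipeline
    Predictor().fit_predict(None, data)                          # fitting & predicting with the default configuration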
--------------------------

How to Contribute
=================

MLBox has been developed and used by many active community members. Your help is very valuable to make it better for everyone.

- Check out `call for contributions `__ to see what can be improved, or open an issue if you want to request something.
- Contribute to the `tests `__ to make it more reliable.
- Contribute to the `documentation `__ to make it clearer for everyone.
- Contribute to the `examples `__ to share your experience with other users.
- Open an `issue `__ if you encounter problems during development.

For more details, please refer to `CONTRIBUTING `__.

.. |Documentation Status| image:: https://readthedocs.org/projects/mlbox/badge/?version=latest
   :target: https://mlbox.readthedocs.io/en/latest/
.. |PyPI version| image:: https://badge.fury.io/py/mlbox.svg
   :target: https://pypi.python.org/pypi/mlbox
.. |Build Status| image:: https://travis-ci.org/AxeldeRomblay/MLBox.svg?branch=master
   :target: https://travis-ci.org/AxeldeRomblay/MLBox
.. |GitHub Issues| image:: https://img.shields.io/github/issues/AxeldeRomblay/MLBox.svg
   :target: https://github.com/AxeldeRomblay/MLBox/issues
.. |codecov| image:: https://codecov.io/gh/AxeldeRomblay/MLBox/branch/master/graph/badge.svg
   :target: https://codecov.io/gh/AxeldeRomblay/MLBox
.. |License| image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg
   :target: https://github.com/AxeldeRomblay/MLBox/blob/master/LICENSE
.. |Downloads| image:: https://pepy.tech/badge/mlbox
   :target: https://pepy.tech/project/mlbox
.. |Python Versions| image:: https://img.shields.io/pypi/pyversions/mlbox.svg
   :target: https://pypi.org/project/mlbox

--------------------------------------------------------------------------------
/VERSION.txt:
--------------------------------------------------------------------------------
0.8.5

--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
codecov:
  token: 989a47e4-aa64-4cbd-8516-52d00e1eb129
  notify:
    require_ci_to_pass: yes
coverage:
  precision: 2
  round: up
  range: "50...100"
  status:
    project:
      default:
        # Commits pushed to master should not make the overall
        # project coverage decrease by more than 1%
        target: auto
        threshold: 1%
    patch:
      default:
        # Be tolerant on slight code coverage diff on PRs to limit
        # noisy red coverage status on github PRs.
        target: auto
        threshold: 1%
    changes: no
parsers:
  gcov:
    branch_detection:
      conditional: yes
      loop: yes
      method: no
      macro: no

comment:
  layout: "header, diff"
  behavior: default
  require_changes: no

--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mlbox.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mlbox.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/mlbox"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mlbox"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
=======
Authors
=======

Development Lead
----------------

* Axel ARONIO DE ROMBLAY

  * email:
  * linkedin:

Contributors
------------

* Nicolas CHEREL
* Mohamed MASKANI
* Henri GERARD

--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# mlbox documentation build configuration file, created by
# sphinx-quickstart on Tue Jul 9 22:26:36 2013.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import sys
import os
from mock import Mock as MagicMock


class Mock(MagicMock):
    @classmethod
    def __getattr__(cls, name):
        return MagicMock()


MOCK_MODULES = ['numpy',
                'matplotlib',
                'matplotlib.pyplot',
                'hyperopt',
                'joblib',
                'pandas',
                'sklearn',
                'sklearn.ensemble',
                'sklearn.metrics',
                'sklearn.impute',
                'sklearn.linear_model',
                'sklearn.model_selection',
                'sklearn.tree',
                'sklearn.pipeline',
                'sklearn.preprocessing',
                'tensorflow',
                'tensorflow.keras.layers',
                'tensorflow.keras.models',
                'lightgbm'
                ]

sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)


# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory is
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# Get the project root dir, which is the parent dir of this
cwd = os.getcwd()
project_root = os.path.dirname(cwd)

# Insert the project root dir as the first element in the PYTHONPATH.
# This lets us ensure that the source package is imported, and that its
# version is used.

sys.path.insert(0, project_root)

#import mlbox

# -- General configuration ---------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon']
napoleon_numpy_docstring = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'MLBox'
copyright = u"2017, Axel ARONIO DE ROMBLAY"

# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
# the built documents.
#
# The short X.Y version.
#version = mlbox.__version__
# The full version, including alpha/beta/rc tags.
#release = mlbox.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to
# some non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']

# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built
# documents.
#keep_warnings = False


# -- Options for HTML output -------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
html_title = "MLBox Documentation"

# A shorter title for the navigation bar. Default is the same as
# html_title.
html_short_title = "MLBox Documentation"

# The name of an image file (relative to this directory) to place at the
# top of the sidebar.
html_logo = "logos/small_logo.png"

# The name of an image file (within the static path) to use as favicon
# of the docs. This file should be a Windows icon file (.ico) being
# 16x16 or 32x32 pixels large.
html_favicon = "logos/small_logo.ico"

# Add any paths that contain custom static files (such as style sheets)
# here, relative to this directory. They are copied after the builtin
# static files, so a file named "default.css" will overwrite the builtin
# "default.css".
html_static_path = ['_static']

# If not '', a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names
# to template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer.
# Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer.
# Default is True.
html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages
# will contain a <link> tag referring to it. The value of this option
# must be the base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'mlboxdoc'


# -- Options for LaTeX output ------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass
# [howto/manual]).
latex_documents = [
    ('index', 'mlbox.tex',
     u'MLBox Documentation',
     u'Axel ARONIO DE ROMBLAY', 'manual'),
]

# The name of an image file (relative to this directory) to place at
# the top of the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings
# are parts, not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output ------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'mlbox',
     u'MLBox Documentation',
     [u'Axel ARONIO DE ROMBLAY'], 1)
]

# If true, show URL addresses after external links.
#man_show_urls = False


# -- Options for Texinfo output ----------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    ('index', 'mlbox',
     u'MLBox Documentation',
     u'Axel ARONIO DE ROMBLAY',
     'mlbox',
     'MLBox is a powerful Automated Machine Learning python library.',
     'Miscellaneous'),
]

# Documents to append as an appendix to all manuals.
#texinfo_appendices = []

# If false, no module index is generated.
#texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False

--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
============
Contributing
============

Contributions are welcome, and they are greatly appreciated! Every
little bit helps, and credit will always be given.

You can contribute in many ways:

Types of Contributions
----------------------

Report Bugs
~~~~~~~~~~~

Report bugs at https://github.com/AxeldeRomblay/mlbox/issues.

If you are reporting a bug, please include:

* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* The smallest possible example to reproduce the bug.

Fix Bugs
~~~~~~~~

Look through the GitHub issues for bugs. Anything tagged with "bug"
and "help wanted" is open to whoever wants to implement it.

Implement Features
~~~~~~~~~~~~~~~~~~

Look through the GitHub issues for features. Anything tagged with "enhancement"
and "help wanted" is open to whoever wants to implement it.

Write Documentation
~~~~~~~~~~~~~~~~~~~

MLBox could always use more documentation, whether as part of the
official MLBox docs, in docstrings, or even on the web in blog posts,
articles, and such.

Submit Feedback
~~~~~~~~~~~~~~~

The best way to send feedback is to file an issue at https://github.com/AxeldeRomblay/mlbox/issues.

If you are proposing a feature:

* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
  are welcome :)

Get Started!
------------

Ready to contribute? Here's how to set up `mlbox` for local development.

1. Fork the `mlbox` repo on GitHub.

2. Clone your fork::

    $ git clone git@github.com:your_name_here/mlbox.git

3. If you already have virtualenv installed, skip this step. Otherwise, run the following::

    $ pip install virtualenv

4. Install your local copy into a virtualenv by following these commands to set up your fork for local development::

    $ cd MLBox
    $ virtualenv env
    $ source env/bin/activate
    $ python setup.py develop

   If you have any trouble with the setup, please refer to the `installation guide `__
5. Create a branch for local development::

    $ git checkout -b name-of-your-bugfix-or-feature

   **Now you're set, you can make your changes locally.**

   NOTE: each time you work on your branch, you will need to activate the virtualenv: ``$ source env/bin/activate``. To deactivate it, simply run: ``$ deactivate``.

6. When you're done making changes, check that your changes pass the tests.

   NOTE: you need to install **pytest** before running the tests::

    $ cd tests
    $ pytest

7. Commit your changes and push your branch to GitHub::

    $ git add .
    $ git commit -m "Your detailed description of your changes."
    $ git push origin name-of-your-bugfix-or-feature

8. Submit a pull request through the GitHub website.

Pull Request Guidelines
-----------------------

Before you submit a pull request, check that it meets these guidelines:

1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
   your new functionality into a function with a docstring.
3. The pull request should work for all supported Python versions and for PyPy. Check
   https://travis-ci.org/AxeldeRomblay/MLBox/pull_requests
   and make sure that the tests pass for all supported Python versions.

--------------------------------------------------------------------------------
/docs/features.rst:
--------------------------------------------------------------------------------
Preprocessing
=============

Reading
-------

.. autoclass:: mlbox.preprocessing.Reader
   :members:

Drift thresholding
------------------

.. autoclass:: mlbox.preprocessing.Drift_thresholder
   :members:

Encoding
========

Missing values
--------------

.. autoclass:: mlbox.encoding.NA_encoder
   :members:

Categorical features
--------------------

.. autoclass:: mlbox.encoding.Categorical_encoder
   :members:

Model
=====

Classification
--------------

Feature selection
~~~~~~~~~~~~~~~~~

.. autoclass:: mlbox.model.classification.Clf_feature_selector
   :members:

Classification
~~~~~~~~~~~~~~

.. autoclass:: mlbox.model.classification.Classifier
   :members:

Stacking
~~~~~~~~

.. autoclass:: mlbox.model.classification.StackingClassifier
   :members:

Regression
----------

Feature selection
~~~~~~~~~~~~~~~~~

.. autoclass:: mlbox.model.regression.Reg_feature_selector
   :members:

Regression
~~~~~~~~~~

.. autoclass:: mlbox.model.regression.Regressor
   :members:

Stacking
~~~~~~~~

.. autoclass:: mlbox.model.regression.StackingRegressor
   :members:


Optimisation
============

.. autoclass:: mlbox.optimisation.Optimiser
   :members:

Prediction
==========

.. autoclass:: mlbox.prediction.Predictor
   :members:

--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
History
=======

0.1.0 (2017-02-09)
------------------
* First non-official release.
0.1.1 (2017-02-23)
------------------
* added several estimators: Random Forest, Extra Trees, Logistic Regression, ...
* improved verbose mode for reader.

0.1.2 (2017-03-02)
------------------
* added dropout for entity embeddings.
* improved optimiser.

0.2.0 (2017-03-22)
------------------
* added feature importances for base learners.
* added leak detection.
* added stacking meta-model.
* improved verbose mode for optimiser (folds variance).

0.2.1 (2017-04-26)
------------------
* added feature importances for bagging and boosting meta-models.

0.2.2 (first official release: 2017-06-13)
-------------------------------------------
* updated dependencies (Keras 2.0, ...).
* added LightGBM model.

0.3.0 (2017-07-11)
------------------
* Python 2.7 & Python 3.4-3.6 compatibilities

0.3.1 (2017-07-12)
------------------
* availability on PyPI.

0.4.0 (2017-07-18)
------------------
* added pipeline memory.

0.4.1 (2017-07-21)
------------------
* improved verbose mode for reader (display missing values)

0.4.2 (2017-07-25)
------------------
* updated dependencies

0.4.3 (2017-07-26)
------------------
* improved verbose mode for predictor (display feature importances)
* wait until modules and engines are imported

0.4.4 (2017-08-04)
------------------
* pep8 style
* normalization of drift coefficients
* warning on the size of the 'save' folder

0.5.0 (2017-08-24)
------------------
* improved verbose mode
* added new date features
* added a new strategy for missing categorical values
* new parallel computing

0.5.1 (2017-08-25)
------------------
* improved verbose mode for reader (display target quantiles for regression)

0.6.0 (2019-04-26)
------------------
* removed xgboost installation

0.7.0 (2019-06-26)
------------------
* added support for Mac OS & Windows
* updated supported python versions
* improved setup
* added tests
* improved documentation & examples
* minor changes in the package architecture

0.8.0 (2019-07-29)
------------------
* removed support for python 2.7

0.8.1 (2019-08-29)
------------------
* added python 3.7 support
* updated package dependencies

0.8.4 (2020-04-13)
------------------
* updated package dependencies

0.8.5 (2020-08-25)
------------------
* minor fix (package dependencies)

--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
Home - Welcome to MLBox's official documentation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

------------------

.. image:: logos/logo.png


**MLBox is a powerful Automated Machine Learning Python library.**
It provides the following features:

* Fast reading and distributed data preprocessing/cleaning/formatting.
* Highly robust feature selection and leak detection.
* Accurate hyper-parameter optimization in high-dimensional space.
* State-of-the-art predictive models for classification and regression (Deep Learning, Stacking, LightGBM, ...).
* Prediction with model interpretation.

-------------------

Links
~~~~~

* **Performance experiments:**

  * `Kaggle competition "Two Sigma Connect: Rental Listing Inquiries" `__ (rank: **85/2488**)
  * `Kaggle competition "Sberbank Russian Housing Market" `__ (rank: **190/3274**)

* **Examples & demos:**

  * `Kaggle kernel on "Titanic" dataset `__ (classification)
  * `Kaggle kernel on "House Prices" dataset `__ (regression)

* **Articles, books & tutorials from users:**

  * `Tutorial on Automated Machine Learning using MLBox `__ (Analytics Vidhya article)
  * `MLBox: a short regression tutorial `__ (user blog)
  * `Implementing Auto-ML Systems with Open Source Tools `__ (KDnuggets article)
  * `Hands-On Automated Machine Learning `__ (O'Reilly book)
  * `Automatic Machine Learning `__ (Youtube tutorial)
  * `Automated Machine Learning with MLBox `__ (user blog)
  * `Introduction to AutoML with MLBox `__ (user blog)

* **Webinars & conferences:**

  * `Paris ML Hors Série #13: Automated Machine Learning `__
  * `Analytics Vidhya: Automated Machine Learning using MLBox python package `__
  * `DataHack Summit 2017 by Analytics Vidhya `__

* **References:**

  * `AutoML.org `__
  * `Skymind AI Wiki `__
  * `TPOT github `__
  * `Towards Data Science `__


.. toctree::
   :maxdepth: 1
   :caption: Tutorials
   :hidden:

   installation
   introduction

.. toctree::
   :maxdepth: 3
   :caption: Features
   :hidden:

   features

.. toctree::
   :maxdepth: 1
   :caption: Contribution
   :hidden:

   authors
   history
   contributing

--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
Installation guide
==================

|Documentation Status| |PyPI version| |Build Status| |GitHub Issues| |codecov| |License| |Downloads| |Python Versions|


Compatibilities
---------------

* *Operating systems:* **Linux**, **MacOS** & **Windows**.
* *Python versions:* **3.5** to **3.7**, **64-bit** only (32-bit Python is not supported; see the quick check below).
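Since only 64-bit builds are supported, here is a quick, optional way to check your interpreter before installing (a generic Python one-liner, not an MLBox command):

.. code-block:: console

    $ python -c "import struct; print(struct.calcsize('P') * 8, 'bit')"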
Basic requirements
------------------

We assume that `pip `__ is already installed.

Also, please make sure you have `setuptools `__ and `wheel `__ installed, which is usually the case if pip is installed.
If not, you can install both by running the following commands respectively: ``pip install setuptools`` and ``pip install wheel``.


Preparation (MacOS only)
------------------------

For **MacOS** users only, **OpenMP** is required. You can install it with the following command: ``brew install libomp``.


Installation
------------

You can choose to install MLBox either from pip or from GitHub.


Install from pip
~~~~~~~~~~~~~~~~

Official releases of MLBox are available on **PyPI**, so you only need to run the following command:

.. code-block:: console

    $ pip install mlbox


Install from GitHub
~~~~~~~~~~~~~~~~~~~

If you want to get the latest features, you can also install MLBox from GitHub.

* **The sources for MLBox can be downloaded** from the `Github repo`_.

* You can either clone the public repository:

.. code-block:: console

    $ git clone git://github.com/AxeldeRomblay/mlbox

* Or download the `tarball`_:

.. code-block:: console

    $ curl -OL https://github.com/AxeldeRomblay/mlbox/tarball/master


* Once you have a copy of the source, **you can install it**:

.. code-block:: console

    $ cd MLBox
    $ python setup.py install


Issues
------

If you run into any trouble during installation, you can refer to the `issues `__.

**Please first check that a similar issue has not already been opened before opening a new one.**


.. _Github repo: https://github.com/AxeldeRomblay/mlbox

.. _tarball: https://github.com/AxeldeRomblay/mlbox/tarball/master

.. |Documentation Status| image:: https://readthedocs.org/projects/mlbox/badge/?version=latest
   :target: http://mlbox.readthedocs.io/en/latest/?badge=latest
.. |PyPI version| image:: https://badge.fury.io/py/mlbox.svg
   :target: https://pypi.python.org/pypi/mlbox
.. |Build Status| image:: https://travis-ci.org/AxeldeRomblay/MLBox.svg?branch=master
   :target: https://travis-ci.org/AxeldeRomblay/MLBox
.. |GitHub Issues| image:: https://img.shields.io/github/issues/AxeldeRomblay/MLBox.svg
   :target: https://github.com/AxeldeRomblay/MLBox/issues
.. |codecov| image:: https://codecov.io/gh/AxeldeRomblay/MLBox/branch/master/graph/badge.svg
   :target: https://codecov.io/gh/AxeldeRomblay/MLBox
.. |License| image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg
   :target: https://github.com/AxeldeRomblay/MLBox/blob/master/LICENSE
.. |Downloads| image:: https://pepy.tech/badge/mlbox
   :target: https://pepy.tech/project/mlbox
.. |Python Versions| image:: https://img.shields.io/pypi/pyversions/mlbox.svg
   :target: https://pypi.org/project/mlbox

--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
Getting started: 30 seconds to MLBox
====================================

MLBox's main package contains 3 sub-packages: **preprocessing**, **optimisation** and **prediction**. They are aimed, respectively, at reading and preprocessing data, testing or optimising a wide range of learners, and predicting the target on a test dataset.

**Here are a few lines to import MLBox:**

.. code-block:: python

    from mlbox.preprocessing import *
    from mlbox.optimisation import *
    from mlbox.prediction import *


**Then, all you need to provide is:**

* the list of paths to your train datasets and test datasets
* the name of the target you are trying to predict (classification or regression)

.. code-block:: python

    paths = ["<file_1>.csv", "<file_2>.csv", ..., "<file_n>.csv"]  # to modify
    target_name = "<my_target>"  # to modify


**Now, let MLBox do the job!**

... to read and preprocess your files:

.. code-block:: python

    data = Reader(sep=",").train_test_split(paths, target_name)  # reading
    data = Drift_thresholder().fit_transform(data)  # deleting non-stable variables

... to evaluate models (here with the default configuration):
.. code-block:: python

    opt = Optimiser()
    opt.evaluate(None, data)


... or to test and optimise the whole pipeline [**OPTIONAL**]:

* missing data encoder, aka 'ne'
* categorical variables encoder, aka 'ce'
* feature selector, aka 'fs'
* meta-features stacker, aka 'stck'
* final estimator, aka 'est'

**NB**: please have a look at all the possibilities you have to configure the pipeline (steps, parameters and values...)

.. code-block:: python

    space = {

        'ne__numerical_strategy': {"space": [0, 'mean']},

        'ce__strategy': {"space": ["label_encoding", "random_projection", "entity_embedding"]},

        'fs__strategy': {"space": ["variance", "rf_feature_importance"]},
        'fs__threshold': {"search": "choice", "space": [0.1, 0.2, 0.3]},

        'est__strategy': {"space": ["LightGBM"]},
        'est__max_depth': {"search": "choice", "space": [5, 6]},
        'est__subsample': {"search": "uniform", "space": [0.6, 0.9]}

    }

    best = opt.optimise(space, data, max_evals=5)

... finally, to predict on the test set with the best parameters (or None for the default configuration):

.. code-block:: python

    Predictor().fit_predict(best, data)


**That's all!** You can have a look at the "save" folder, where you can find:

* your predictions
* feature importances
* drift coefficients of your variables (0.5 = very stable, 1. = not stable at all)
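For instance, here is a minimal sketch to inspect the saved predictions with pandas. The file name below is an assumption for illustration (predictions are typically saved as ``<target_name>_predictions.csv``); check the "save" folder for the exact names:

.. code-block:: python

    import pandas as pd

    # Assumed output path; adjust to the actual file in your "save" folder.
    preds = pd.read_csv("save/" + target_name + "_predictions.csv")
    print(preds.head())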
--------------------------------------------------------------------------------
/docs/logos/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/logo.png

--------------------------------------------------------------------------------
/docs/logos/small_logo.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/small_logo.ico

--------------------------------------------------------------------------------
/docs/logos/small_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/small_logo.png

--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)

if "%1" == "" goto help

if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and a HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  texinfo    to make Texinfo files
	echo.  gettext    to make PO message catalogs
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  xml        to make Docutils-native XML files
	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	goto end
)

if "%1" == "clean" (
	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
	del /q /s %BUILDDIR%\*
	goto end
)


%SPHINXBUILD% 2> nul
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

if "%1" == "html" (
	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
	goto end
)

if "%1" == "dirhtml" (
	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
	goto end
)

if "%1" == "singlehtml" (
	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
	goto end
)

if "%1" == "pickle" (
	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the pickle files.
	goto end
)

if "%1" == "json" (
	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the JSON files.
	goto end
)

if "%1" == "htmlhelp" (
	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
	goto end
)

if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\mlbox.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\mlbox.qhc
	goto end
)

if "%1" == "devhelp" (
	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished.
129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/webinars/auto-ML.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/webinars/auto-ML.pdf -------------------------------------------------------------------------------- /docs/webinars/features.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/webinars/features.pdf -------------------------------------------------------------------------------- /examples/classification/classification.py: -------------------------------------------------------------------------------- 1 | """A classification example using mlbox.""" 2 | from mlbox.preprocessing import Reader 3 | from mlbox.preprocessing import Drift_thresholder 4 | from mlbox.optimisation import Optimiser 5 | from mlbox.prediction import Predictor 6 | 7 | # Paths to the train set and the test set. 8 | paths = ["train_classification.csv", "test_classification.csv"] 9 | # Name of the feature to predict. 10 | # This columns should only be present in the train set. 11 | target_name = "Survived" 12 | 13 | # Reading and cleaning all files 14 | # Declare a reader for csv files 15 | rd = Reader(sep=',') 16 | # Return a dictionnary containing three entries 17 | # dict["train"] contains training samples withtout target columns 18 | # dict["test"] contains testing elements withtout target columns 19 | # dict["target"] contains target columns for training samples. 20 | data = rd.train_test_split(paths, target_name) 21 | 22 | dft = Drift_thresholder() 23 | data = dft.fit_transform(data) 24 | 25 | # Tuning 26 | # Declare an optimiser. Scoring possibilities for classification lie in : 27 | # {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"} 28 | opt = Optimiser(scoring='accuracy', n_folds=3) 29 | opt.evaluate(None, data) 30 | 31 | # Space of hyperparameters 32 | # The keys must respect the following syntax : "enc__param". 33 | # "enc" = "ne" for na encoder 34 | # "enc" = "ce" for categorical encoder 35 | # "enc" = "fs" for feature selector [OPTIONAL] 36 | # "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL] 37 | # "enc" = "est" for the final estimator 38 | # "param" : a correct associated parameter for each step. 39 | # Ex: "max_depth" for "enc"="est", ... 40 | # The values must respect the syntax: {"search":strategy,"space":list} 41 | # "strategy" = "choice" or "uniform". Default = "choice" 42 | # list : a list of values to be tested if strategy="choice". 43 | # Else, list = [value_min, value_max]. 44 | # Available strategies for ne_numerical_strategy are either an integer, a float 45 | # or in {'mean', 'median', "most_frequent"} 46 | # Available strategies for ce_strategy are: 47 | # {"label_encoding", "dummification", "random_projection", entity_embedding"} 48 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 49 | 'ce__strategy': {"search": "choice", 50 | "space": ["label_encoding", 51 | "random_projection", 52 | "entity_embedding"]}, 53 | 'fs__threshold': {"search": "uniform", 54 | "space": [0.01, 0.3]}, 55 | 'est__max_depth': {"search": "choice", 56 | "space": [3, 4, 5, 6, 7]} 57 | 58 | } 59 | 60 | # Optimises hyper-parameters of the whole Pipeline with a given scoring 61 | # function. 
Algorithm used to optimize : Tree Parzen Estimator. 62 | # 63 | # IMPORTANT : Try to avoid dependent parameters and to set one feature 64 | # selection strategy and one estimator strategy at a time. 65 | best = opt.optimise(space, data, 15) 66 | 67 | # Make prediction and save the results in save folder. 68 | prd = Predictor() 69 | prd.fit_predict(best, data) 70 | -------------------------------------------------------------------------------- /examples/regression/regression.py: -------------------------------------------------------------------------------- 1 | """A regression example using mlbox.""" 2 | import numpy as np 3 | 4 | from mlbox.preprocessing import Reader 5 | from mlbox.preprocessing import Drift_thresholder 6 | from mlbox.optimisation import make_scorer 7 | from mlbox.optimisation import Optimiser 8 | from mlbox.prediction import Predictor 9 | 10 | # Paths to the train set and the test set. 11 | paths = ["train_regression.csv", "test_regression.csv"] 12 | # Name of the feature to predict. 13 | # This columns should only be present in the train set. 14 | target_name = "SalePrice" 15 | 16 | # Reading and cleaning all files 17 | # Declare a reader for csv files 18 | rd = Reader(sep=',') 19 | # Return a dictionnary containing three entries 20 | # dict["train"] contains training samples withtout target columns 21 | # dict["test"] contains testing elements withtout target columns 22 | # dict["target"] contains target columns for training samples. 23 | data = rd.train_test_split(paths, target_name) 24 | 25 | dft = Drift_thresholder() 26 | data = dft.fit_transform(data) 27 | 28 | # Tuning 29 | mape = make_scorer(lambda y_true, 30 | y_pred: 100*np.sum( 31 | np.abs(y_true-y_pred)/y_true 32 | )/len(y_true), 33 | greater_is_better=False, 34 | needs_proba=False) 35 | # Declare an optimiser. You can declare your own score 36 | # as presented here or use one in 37 | # {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"} 38 | opt = Optimiser(scoring=mape, n_folds=3) 39 | opt.evaluate(None, data) 40 | 41 | # Space of hyperparameters 42 | # The keys must respect the following syntax : "enc__param". 43 | # "enc" = "ne" for na encoder 44 | # "enc" = "ce" for categorical encoder 45 | # "enc" = "fs" for feature selector [OPTIONAL] 46 | # "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL] 47 | # "enc" = "est" for the final estimator 48 | # "param" : a correct associated parameter for each step. 49 | # Ex: "max_depth" for "enc"="est", ... 50 | # The values must respect the syntax: {"search":strategy,"space":list} 51 | # "strategy" = "choice" or "uniform". Default = "choice" 52 | # list : a list of values to be tested if strategy="choice". 53 | # Else, list = [value_min, value_max]. 54 | # Available strategies for ne_numerical_strategy are either an integer, a float 55 | # or in {'mean', 'median', "most_frequent"} 56 | # Available strategies for ce_strategy are: 57 | # {"label_encoding", "dummification", "random_projection", entity_embedding"} 58 | space = { 59 | 'ne__numerical_strategy': {"search": "choice", 60 | "space": [0]}, 61 | 'ce__strategy': {"search": "choice", 62 | "space": ["label_encoding", 63 | "random_projection", 64 | "entity_embedding"]}, 65 | 'fs__threshold': {"search": "uniform", 66 | "space": [0.01, 0.3]}, 67 | 'est__max_depth': {"search": "choice", 68 | "space": [3, 4, 5, 6, 7]} 69 | 70 | } 71 | 72 | # Optimises hyper-parameters of the whole Pipeline with a given scoring 73 | # function. 
Algorithm used to optimize : Tree Parzen Estimator. 74 | # 75 | # IMPORTANT : Try to avoid dependent parameters and to set one feature 76 | # selection strategy and one estimator strategy at a time. 77 | best = opt.optimise(space, data, 15) 78 | 79 | # Make prediction and save the results in save folder. 80 | prd = Predictor() 81 | prd.fit_predict(best, data) 82 | -------------------------------------------------------------------------------- /mlbox/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Axel ARONIO DE ROMBLAY""" 4 | __email__ = 'axelderomblay@gmail.com' 5 | 6 | from .preprocessing import * 7 | from .encoding import * 8 | from .optimisation import * 9 | from .prediction import * 10 | from .model import * 11 | -------------------------------------------------------------------------------- /mlbox/encoding/__init__.py: -------------------------------------------------------------------------------- 1 | from .na_encoder import * 2 | from .categorical_encoder import * 3 | -------------------------------------------------------------------------------- /mlbox/encoding/na_encoder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import pandas as pd 6 | import warnings 7 | 8 | from sklearn.impute import SimpleImputer 9 | 10 | 11 | class NA_encoder(): 12 | """Encodes missing values for both numerical and categorical features. 13 | 14 | Several strategies are possible in each case. 15 | 16 | Parameters 17 | ---------- 18 | numerical_strategy : str or float or int. default = "mean" 19 | The strategy to encode NA for numerical features. 20 | Available strategies = "mean", "median", 21 | "most_frequent" or a float/int value 22 | 23 | categorical_strategy : str, default = '' 24 | The strategy to encode NA for categorical features. 25 | Available strategies = a string or "most_frequent" 26 | 27 | """ 28 | 29 | def __init__(self, 30 | numerical_strategy='mean', 31 | categorical_strategy=''): 32 | """Init a NA_encoder. 33 | 34 | User can choose numerical strategy and categorical strategy. 35 | 36 | Parameters 37 | ---------- 38 | numerical_strategy : str or float or int. default = "mean" 39 | The strategy to encode NA for numerical features. 40 | 41 | categorical_strategy : str, default = '' 42 | The strategy to encode NA for categorical features. 43 | 44 | """ 45 | self.numerical_strategy = numerical_strategy 46 | self.categorical_strategy = categorical_strategy 47 | self.__Lcat = [] 48 | self.__Lnum = [] 49 | self.__imp = None 50 | self.__mode = dict() 51 | self.__fitOK = False 52 | 53 | def get_params(self, deep=True): 54 | """Get parameters of a NA_encoder object.""" 55 | return {'numerical_strategy': self.numerical_strategy, 56 | 'categorical_strategy': self.categorical_strategy} 57 | 58 | def set_params(self, **params): 59 | """Set parameters for a NA_encoder object. 60 | 61 | Set numerical strategy and categorical strategy. 62 | 63 | Parameters 64 | ---------- 65 | numerical_strategy : str or float or int. default = "mean" 66 | The strategy to encode NA for numerical features. 67 | 68 | categorical_strategy : str, default = '' 69 | The strategy to encode NA for categorical features. 70 | 71 | """ 72 | self.__fitOK = False 73 | 74 | for k, v in params.items(): 75 | if k not in self.get_params(): 76 | warnings.warn("Invalid parameter(s) for encoder NA_encoder. 
" 77 | "Parameter(s) IGNORED. " 78 | "Check the list of available parameters with " 79 | "`encoder.get_params().keys()`") 80 | else: 81 | setattr(self, k, v) 82 | 83 | def fit(self, df_train, y_train=None): 84 | """Fits NA Encoder. 85 | 86 | Parameters 87 | ---------- 88 | df_train : pandas dataframe of shape = (n_train, n_features) 89 | The train dataset with numerical and categorical features. 90 | 91 | y_train : pandas series of shape = (n_train, ), default = None 92 | The target for classification or regression tasks. 93 | 94 | Returns 95 | ------- 96 | object 97 | self 98 | 99 | """ 100 | self.__Lcat = df_train.dtypes[df_train.dtypes == 'object'].index 101 | self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index 102 | 103 | # Dealing with numerical features 104 | 105 | if (self.numerical_strategy in ['mean', 'median', "most_frequent"]): 106 | 107 | self.__imp = SimpleImputer(strategy=self.numerical_strategy) 108 | 109 | if (len(self.__Lnum) != 0): 110 | self.__imp.fit(df_train[self.__Lnum]) 111 | else: 112 | pass 113 | 114 | elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)): 115 | 116 | pass 117 | 118 | else: 119 | 120 | raise ValueError("Numerical strategy for NA encoding is not valid") 121 | 122 | # Dealing with categorical features 123 | 124 | if (type(self.categorical_strategy) == str): 125 | 126 | if (self.categorical_strategy == "most_frequent"): 127 | 128 | na_count = df_train[self.__Lcat].isnull().sum() 129 | 130 | for col in na_count[na_count>0].index: 131 | 132 | try: 133 | self.__mode[col] = df_train[col].mode()[0] 134 | except: 135 | self.__mode[col] = "" 136 | 137 | else: 138 | pass 139 | 140 | else: 141 | raise ValueError("Categorical strategy for NA encoding is not valid") 142 | 143 | self.__fitOK = True 144 | 145 | return self 146 | 147 | def fit_transform(self, df_train, y_train=None): 148 | """Fits NA Encoder and transforms the dataset. 149 | 150 | Parameters 151 | ---------- 152 | df_train : pandas.Dataframe of shape = (n_train, n_features) 153 | The train dataset with numerical and categorical features. 154 | 155 | y_train : pandas.Series of shape = (n_train, ), default = None 156 | The target for classification or regression tasks. 157 | 158 | Returns 159 | ------- 160 | pandas.Dataframe of shape = (n_train, n_features) 161 | The train dataset with no missing values. 162 | 163 | """ 164 | self.fit(df_train, y_train) 165 | 166 | return self.transform(df_train) 167 | 168 | def transform(self, df): 169 | """Transform the dataset. 170 | 171 | Parameters 172 | ---------- 173 | df : pandas.Dataframe of shape = (n, n_features) 174 | The dataset with numerical and categorical features. 175 | 176 | Returns 177 | ------- 178 | pandas.Dataframe of shape = (n, n_features) 179 | The dataset with no missing values. 
180 | 181 | """ 182 | if(self.__fitOK): 183 | 184 | if(len(self.__Lnum) == 0): 185 | 186 | if (self.categorical_strategy != "most_frequent"): 187 | return df[self.__Lcat].fillna(self.categorical_strategy) 188 | 189 | else: 190 | return df[self.__Lcat].fillna(self.__mode) 191 | 192 | else: 193 | 194 | if (self.numerical_strategy in ['mean', 195 | 'median', 196 | "most_frequent"]): 197 | 198 | if (len(self.__Lcat) != 0): 199 | 200 | if (self.categorical_strategy != "most_frequent"): 201 | 202 | return pd.concat( 203 | (pd.DataFrame(self.__imp.transform(df[self.__Lnum]), 204 | columns=self.__Lnum, 205 | index=df.index), 206 | df[self.__Lcat].fillna(self.categorical_strategy) 207 | ), 208 | axis=1)[df.columns] 209 | 210 | else: 211 | 212 | return pd.concat( 213 | (pd.DataFrame(self.__imp.transform(df[self.__Lnum]), 214 | columns=self.__Lnum, 215 | index=df.index), 216 | df[self.__Lcat].fillna(self.__mode) 217 | ), 218 | axis=1)[df.columns] 219 | 220 | else: 221 | 222 | return pd.DataFrame( 223 | self.__imp.transform(df[self.__Lnum]), 224 | columns=self.__Lnum, 225 | index=df.index 226 | ) 227 | 228 | elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)): 229 | 230 | if (len(self.__Lcat) != 0): 231 | 232 | if (self.categorical_strategy != "most_frequent"): 233 | 234 | return pd.concat( 235 | (df[self.__Lnum].fillna(self.numerical_strategy), 236 | df[self.__Lcat].fillna(self.categorical_strategy) 237 | ), 238 | axis=1)[df.columns] 239 | 240 | else: 241 | 242 | return pd.concat( 243 | (df[self.__Lnum].fillna(self.numerical_strategy), 244 | df[self.__Lcat].fillna(self.__mode) 245 | ), 246 | axis=1)[df.columns] 247 | else: 248 | 249 | return df[self.__Lnum].fillna(self.numerical_strategy) 250 | 251 | else: 252 | 253 | raise ValueError("Call fit or fit_transform function before") 254 | -------------------------------------------------------------------------------- /mlbox/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . import classification 2 | from . import regression 3 | 4 | __all__ = ['classification', 'regression'] 5 | -------------------------------------------------------------------------------- /mlbox/model/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature_selector import Clf_feature_selector 2 | from .classifier import Classifier 3 | from .stacking_classifier import StackingClassifier 4 | 5 | __all__ = ['Clf_feature_selector', 'Classifier', 'StackingClassifier'] 6 | -------------------------------------------------------------------------------- /mlbox/model/classification/feature_selector.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.ensemble import RandomForestClassifier 9 | import warnings 10 | 11 | 12 | class Clf_feature_selector(): 13 | 14 | """Selects useful features. 15 | 16 | Several strategies are possible (filter and wrapper methods). 17 | Works for classification problems only (multiclass or binary). 18 | 19 | Parameters 20 | ---------- 21 | strategy : str, defaut = "l1" 22 | The strategy to select features. 
23 | Available strategies = {"variance", "l1", "rf_feature_importance"} 24 | 25 | threshold : float, defaut = 0.3 26 | The percentage of variable to discard according to the strategy. 27 | Must be between 0. and 1. 28 | """ 29 | 30 | def __init__(self, strategy='l1', threshold=0.3): 31 | 32 | # 'variance','l1, 'rf_feature_importance' 33 | self.strategy = strategy 34 | # a float between 0. and 1. defaut : 0.3 ie we drop 0.3 of features 35 | self.threshold = threshold 36 | self.__fitOK = False 37 | self.__to_discard = [] 38 | 39 | 40 | def get_params(self, deep=True): 41 | 42 | return {'strategy': self.strategy, 43 | 'threshold': self.threshold} 44 | 45 | 46 | def set_params(self, **params): 47 | 48 | self.__fitOK = False 49 | 50 | for k, v in params.items(): 51 | if k not in self.get_params(): 52 | warnings.warn("Invalid parameter a for feature selector" 53 | "Clf_feature_selector. Parameter IGNORED. Check" 54 | "the list of available parameters with" 55 | "`feature_selector.get_params().keys()`") 56 | else: 57 | setattr(self, k, v) 58 | 59 | 60 | def fit(self, df_train, y_train): 61 | 62 | """Fits Clf_feature_selector 63 | 64 | Parameters 65 | ---------- 66 | df_train : pandas dataframe of shape = (n_train, n_features) 67 | The train dataset with numerical features and no NA 68 | 69 | y_train : pandas series of shape = (n_train, ) 70 | The target for classification task. Must be encoded. 71 | 72 | Returns 73 | ------- 74 | object 75 | self 76 | """ 77 | 78 | # sanity checks 79 | if((type(df_train) != pd.SparseDataFrame) and 80 | (type(df_train) != pd.DataFrame)): 81 | raise ValueError("df_train must be a DataFrame") 82 | 83 | if (type(y_train) != pd.core.series.Series): 84 | raise ValueError("y_train must be a Series") 85 | 86 | if(self.strategy == 'variance'): 87 | coef = df_train.std() 88 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 89 | self.__to_discard = coef[coef < abstract_threshold].index 90 | self.__fitOK = True 91 | 92 | elif(self.strategy == 'l1'): 93 | model = LogisticRegression(C=0.01, penalty='l1', solver="saga", 94 | n_jobs=-1, random_state=0) # to be tuned 95 | model.fit(df_train, y_train) 96 | coef = np.mean(np.abs(model.coef_), axis=0) 97 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 98 | self.__to_discard = df_train.columns[coef < abstract_threshold] 99 | self.__fitOK = True 100 | 101 | elif(self.strategy == 'rf_feature_importance'): 102 | model = RandomForestClassifier(n_estimators=50, n_jobs=-1, 103 | random_state=0) # to be tuned 104 | model.fit(df_train, y_train) 105 | coef = model.feature_importances_ 106 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 107 | self.__to_discard = df_train.columns[coef < abstract_threshold] 108 | self.__fitOK = True 109 | 110 | else: 111 | raise ValueError("Strategy invalid. 
Please choose between " 112 | "'variance', 'l1' or 'rf_feature_importance'") 113 | 114 | return self 115 | 116 | 117 | def transform(self, df): 118 | 119 | """Transforms the dataset 120 | 121 | Parameters 122 | ---------- 123 | df : pandas dataframe of shape = (n, n_features) 124 | The dataset with numerical features and no NA 125 | 126 | Returns 127 | ------- 128 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 129 | The train dataset with relevant features 130 | """ 131 | 132 | if(self.__fitOK): 133 | 134 | # sanity checks 135 | if((type(df) != pd.SparseDataFrame) and 136 | (type(df) != pd.DataFrame)): 137 | raise ValueError("df must be a DataFrame") 138 | 139 | return df.drop(self.__to_discard, axis=1) 140 | else: 141 | raise ValueError("call fit or fit_transform function before") 142 | 143 | 144 | def fit_transform(self, df_train, y_train): 145 | 146 | """Fits Clf_feature_selector and transforms the dataset 147 | 148 | Parameters 149 | ---------- 150 | df_train : pandas dataframe of shape = (n_train, n_features) 151 | The train dataset with numerical features and no NA 152 | 153 | y_train : pandas series of shape = (n_train, ). 154 | The target for classification task. Must be encoded. 155 | 156 | Returns 157 | ------- 158 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 159 | The train dataset with relevant features 160 | """ 161 | 162 | self.fit(df_train, y_train) 163 | 164 | return self.transform(df_train) 165 | -------------------------------------------------------------------------------- /mlbox/model/regression/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature_selector import Reg_feature_selector 2 | from .regressor import Regressor 3 | from .stacking_regressor import StackingRegressor 4 | 5 | __all__ = ['Reg_feature_selector', 'Regressor', 'StackingRegressor'] 6 | -------------------------------------------------------------------------------- /mlbox/model/regression/feature_selector.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.linear_model import Lasso 8 | from sklearn.ensemble import RandomForestRegressor 9 | import warnings 10 | 11 | 12 | class Reg_feature_selector(): 13 | 14 | """Selects useful features. 15 | 16 | Several strategies are possible (filter and wrapper methods). 17 | Works for regression problems only. 18 | 19 | Parameters 20 | ---------- 21 | strategy : str, defaut = "l1" 22 | The strategy to select features. 23 | Available strategies = {"variance", "l1", "rf_feature_importance"} 24 | 25 | threshold : float, defaut = 0.3 26 | The percentage of variable to discard according the strategy. 27 | Must be between 0. and 1. 28 | """ 29 | 30 | def __init__(self, strategy='l1', threshold=0.3): 31 | self.strategy = strategy 32 | self.threshold = threshold 33 | self.__fitOK = False 34 | self.__to_discard = [] 35 | 36 | 37 | def get_params(self, deep=True): 38 | return {'strategy': self.strategy, 39 | 'threshold': self.threshold} 40 | 41 | 42 | def set_params(self, **params): 43 | self.__fitOK = False 44 | 45 | for k, v in params.items(): 46 | if k not in self.get_params(): 47 | warnings.warn("Invalid parameter a for feature selector" 48 | "Reg_feature_selector. Parameter IGNORED. 
Check " 49 | "the list of available parameters with " 50 | "`feature_selector.get_params().keys()`") 51 | else: 52 | setattr(self, k, v) 53 | 54 | 55 | def fit(self, df_train, y_train): 56 | 57 | """Fits Reg_feature_selector. 58 | 59 | Parameters 60 | ---------- 61 | df_train : pandas dataframe of shape = (n_train, n_features) 62 | The train dataset with numerical features and no NA 63 | 64 | y_train : pandas series of shape = (n_train, ). 65 | The target for regression task. 66 | 67 | Returns 68 | ------- 69 | sobject 70 | self 71 | """ 72 | 73 | # sanity checks 74 | if((type(df_train) != pd.SparseDataFrame) and 75 | (type(df_train) != pd.DataFrame)): 76 | raise ValueError("df_train must be a DataFrame") 77 | 78 | if (type(y_train) != pd.core.series.Series): 79 | raise ValueError("y_train must be a Series") 80 | 81 | if(self.strategy == 'variance'): 82 | coef = df_train.std() 83 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 84 | self.__to_discard = coef[coef < abstract_threshold].index 85 | self.__fitOK = True 86 | 87 | elif(self.strategy == 'l1'): 88 | model = Lasso(alpha=100.0, random_state=0) # to be tuned 89 | model.fit(df_train, y_train) 90 | coef = np.abs(model.coef_) 91 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 92 | self.__to_discard = df_train.columns[coef < abstract_threshold] 93 | self.__fitOK = True 94 | 95 | elif(self.strategy == 'rf_feature_importance'): 96 | model = RandomForestRegressor(n_estimators=50, 97 | n_jobs=-1, 98 | random_state=0) # to be tuned 99 | model.fit(df_train, y_train) 100 | coef = model.feature_importances_ 101 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 102 | self.__to_discard = df_train.columns[coef < abstract_threshold] 103 | self.__fitOK = True 104 | 105 | else: 106 | raise ValueError("Strategy invalid. Please choose between " 107 | "'variance', 'l1' or 'rf_feature_importance'") 108 | 109 | return self 110 | 111 | 112 | def transform(self, df): 113 | 114 | """Transforms the dataset 115 | 116 | Parameters 117 | ---------- 118 | df : pandas dataframe of shape = (n, n_features) 119 | The dataset with numerical features and no NA 120 | 121 | Returns 122 | ------- 123 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 124 | The train dataset with relevant features 125 | """ 126 | 127 | if(self.__fitOK): 128 | 129 | # sanity checks 130 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): 131 | raise ValueError("df must be a DataFrame") 132 | 133 | return df.drop(self.__to_discard, axis=1) 134 | else: 135 | raise ValueError("call fit or fit_transform function before") 136 | 137 | 138 | def fit_transform(self, df_train, y_train): 139 | 140 | """Fits Reg_feature_selector and transforms the dataset 141 | 142 | Parameters 143 | ---------- 144 | df_train : pandas dataframe of shape = (n_train, n_features) 145 | The train dataset with numerical features and no NA 146 | 147 | y_train : pandas series of shape = (n_train, ). 148 | The target for regression task. 
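(For instance, a numerical target such as the ``SalePrice`` column used
in the regression example shipped with MLBox.)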
149 | 150 | Returns 151 | ------- 152 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 153 | The train dataset with relevant features 154 | """ 155 | 156 | self.fit(df_train, y_train) 157 | 158 | return self.transform(df_train) 159 | -------------------------------------------------------------------------------- /mlbox/model/regression/regressor.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # License: BSD 3 clause 5 | 6 | import warnings 7 | from copy import copy 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, 12 | ExtraTreesRegressor, RandomForestRegressor) 13 | from sklearn.linear_model import Ridge 14 | from sklearn.tree import DecisionTreeRegressor 15 | from lightgbm import LGBMRegressor 16 | 17 | 18 | class Regressor(): 19 | """Wrap scikitlearn regressors. 20 | 21 | Parameters 22 | ---------- 23 | strategy : str, default = "LightGBM" 24 | The choice for the regressor. 25 | Available strategies = {"LightGBM", "RandomForest", "ExtraTrees", 26 | "Tree", "Bagging", "AdaBoost" or "Linear"} 27 | 28 | **params : default = None 29 | Parameters of the corresponding regressor. 30 | Examples : n_estimators, max_depth... 31 | 32 | """ 33 | 34 | def __init__(self, **params): 35 | """Init Regressor object where user can pass a strategy.""" 36 | if ("strategy" in params): 37 | self.__strategy = params["strategy"] 38 | else: 39 | self.__strategy = "LightGBM" 40 | 41 | self.__regress_params = {} 42 | 43 | self.__regressor = None 44 | self.__set_regressor(self.__strategy) 45 | self.__col = None 46 | 47 | self.set_params(**params) 48 | self.__fitOK = False 49 | 50 | def get_params(self, deep=True): 51 | """Get parameters of Regressor object.""" 52 | params = {} 53 | params["strategy"] = self.__strategy 54 | params.update(self.__regress_params) 55 | 56 | return params 57 | 58 | def set_params(self, **params): 59 | """Set parameters of Regressor object.""" 60 | self.__fitOK = False 61 | 62 | if 'strategy' in params.keys(): 63 | self.__set_regressor(params['strategy']) 64 | 65 | for k, v in self.__regress_params.items(): 66 | if k not in self.get_params().keys(): 67 | warnings.warn("Invalid parameter for regressor " 68 | + str(self.__strategy) 69 | + ". Parameter IGNORED. Check the list of " 70 | "available parameters with " 71 | "`regressor.get_params().keys()`") 72 | else: 73 | setattr(self.__regressor, k, v) 74 | 75 | for k, v in params.items(): 76 | if(k == "strategy"): 77 | pass 78 | else: 79 | if k not in self.__regressor.get_params().keys(): 80 | warnings.warn("Invalid parameter for regressor " 81 | + str(self.__strategy) 82 | + ". Parameter IGNORED. 
Check the list of " 83 | "available parameters with " 84 | "`regressor.get_params().keys()`") 85 | else: 86 | setattr(self.__regressor, k, v) 87 | self.__regress_params[k] = v 88 | 89 | def __set_regressor(self, strategy): 90 | """Set strategy of a regressor object.""" 91 | self.__strategy = strategy 92 | 93 | if(strategy == 'RandomForest'): 94 | self.__regressor = RandomForestRegressor( 95 | n_estimators=400, max_depth=10, max_features='sqrt', 96 | bootstrap=True, n_jobs=-1, random_state=0) 97 | 98 | elif(strategy == "LightGBM"): 99 | self.__regressor = LGBMRegressor( 100 | n_estimators=500, learning_rate=0.05, 101 | colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0) 102 | 103 | elif(strategy == 'ExtraTrees'): 104 | self.__regressor = ExtraTreesRegressor( 105 | n_estimators=400, max_depth=10, max_features='sqrt', 106 | bootstrap=True, n_jobs=-1, random_state=0) 107 | 108 | elif(strategy == 'Tree'): 109 | self.__regressor = DecisionTreeRegressor( 110 | criterion='mse', splitter='best', max_depth=None, 111 | min_samples_split=2, min_samples_leaf=1, 112 | min_weight_fraction_leaf=0.0, max_features=None, 113 | random_state=0, max_leaf_nodes=None, presort=False) 114 | 115 | elif(strategy == "Bagging"): 116 | self.__regressor = BaggingRegressor( 117 | base_estimator=None, n_estimators=500, max_samples=.9, 118 | max_features=.85, bootstrap=False, bootstrap_features=False, 119 | n_jobs=-1, random_state=0) 120 | 121 | elif(strategy == "AdaBoost"): 122 | self.__regressor = AdaBoostRegressor( 123 | base_estimator=None, n_estimators=400, learning_rate=.05, 124 | random_state=0) 125 | 126 | elif(strategy == "Linear"): 127 | self.__regressor = Ridge( 128 | alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, 129 | max_iter=None, tol=0.001, solver='auto', random_state=0) 130 | 131 | else: 132 | raise ValueError( 133 | "Strategy invalid. Please choose between 'LightGBM'" 134 | ", 'RandomForest', 'ExtraTrees', " 135 | "'Tree', 'Bagging', 'AdaBoost' or 'Linear'") 136 | 137 | def fit(self, df_train, y_train): 138 | """Fits Regressor. 139 | 140 | Parameters 141 | ---------- 142 | df_train : pandas dataframe of shape = (n_train, n_features) 143 | The train dataset with numerical features. 144 | 145 | y_train : pandas series of shape = (n_train, ) 146 | The target for regression tasks. 147 | 148 | Returns 149 | ------- 150 | object 151 | self 152 | 153 | """ 154 | # sanity checks 155 | if((type(df_train) != pd.SparseDataFrame) and 156 | (type(df_train) != pd.DataFrame)): 157 | raise ValueError("df_train must be a DataFrame") 158 | 159 | if (type(y_train) != pd.core.series.Series): 160 | raise ValueError("y_train must be a Series") 161 | 162 | self.__regressor.fit(df_train.values, y_train) 163 | self.__col = df_train.columns 164 | self.__fitOK = True 165 | 166 | return self 167 | 168 | def feature_importances(self): 169 | """Computes feature importances. 170 | 171 | Regressor must be fitted before. 172 | 173 | Returns 174 | ------- 175 | dict 176 | Dictionnary containing a measure of feature importance (value) 177 | for each feature (key). 
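Examples
--------
An illustrative sketch on synthetic data, assuming the dependency
versions pinned by MLBox (the feature names are made up for the
example):

>>> import pandas as pd
>>> from mlbox.model.regression import Regressor
>>> X = pd.DataFrame({"x1": [0., 1., 2., 3.], "x2": [3., 2., 1., 0.]})
>>> y = pd.Series([0., 1., 2., 3.])
>>> reg = Regressor(strategy="Linear")
>>> _ = reg.fit(X, y)
>>> sorted(reg.feature_importances().keys())
['x1', 'x2']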
178 | 179 | """ 180 | if self.__fitOK: 181 | 182 | if (self.get_params()["strategy"] in ["Linear"]): 183 | 184 | importance = {} 185 | f = np.abs(self.get_estimator().coef_) 186 | 187 | for i, col in enumerate(self.__col): 188 | importance[col] = f[i] 189 | 190 | elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest", 191 | "ExtraTrees", "Tree"]): 192 | 193 | importance = {} 194 | f = self.get_estimator().feature_importances_ 195 | 196 | for i, col in enumerate(self.__col): 197 | importance[col] = f[i] 198 | 199 | elif (self.get_params()["strategy"] in ["AdaBoost"]): 200 | 201 | importance = {} 202 | norm = self.get_estimator().estimator_weights_.sum() 203 | 204 | try: 205 | # LGB, RF, ET, Tree and AdaBoost 206 | # TODO: Refactor this part 207 | f = sum(weight * est.feature_importances_ for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa 208 | 209 | except Exception: 210 | f = sum(weight * np.abs(est.coef_) for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa 211 | 212 | for i, col in enumerate(self.__col): 213 | importance[col] = f[i] 214 | 215 | elif (self.get_params()["strategy"] in ["Bagging"]): 216 | 217 | importance = {} 218 | importance_bag = [] 219 | 220 | for i, b in enumerate(self.get_estimator().estimators_): 221 | 222 | d = {} 223 | 224 | try: 225 | # LGB, RF, ET, Tree and AdaBoost 226 | f = b.feature_importances_ 227 | except Exception: 228 | f = np.abs(b.coef_) # Linear 229 | 230 | estimator = self.get_estimator() 231 | items = enumerate(estimator.estimators_features_[i]) 232 | for j, c in items: 233 | d[self.__col[c]] = f[j] 234 | 235 | importance_bag.append(d.copy()) 236 | 237 | for i, col in enumerate(self.__col): 238 | list_filtered = filter(lambda x: x != 0, 239 | [k[col] if col in k else 0 240 | for k in importance_bag]) 241 | importance[col] = np.mean(list(list_filtered)) 242 | 243 | else: 244 | 245 | importance = {} 246 | 247 | return importance 248 | 249 | else: 250 | 251 | raise ValueError("You must call the fit function before !") 252 | 253 | def predict(self, df): 254 | """Predicts the target. 255 | 256 | Parameters 257 | ---------- 258 | df : pandas dataframe of shape = (n, n_features) 259 | The dataset with numerical features. 260 | 261 | Returns 262 | ------- 263 | array of shape = (n, ) 264 | The target to be predicted. 265 | 266 | """ 267 | try: 268 | if not callable(getattr(self.__regressor, "predict")): 269 | raise ValueError("predict attribute is not callable") 270 | except Exception as e: 271 | raise e 272 | 273 | if self.__fitOK: 274 | 275 | # sanity checks 276 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): 277 | raise ValueError("df must be a DataFrame") 278 | 279 | return self.__regressor.predict(df.values) 280 | 281 | else: 282 | raise ValueError("You must call the fit function before !") 283 | 284 | def transform(self, df): 285 | """Transform dataframe df. 286 | 287 | Parameters 288 | ---------- 289 | df : pandas dataframe of shape = (n, n_features) 290 | The dataset with numerical features. 291 | 292 | Returns 293 | ------- 294 | pandas dataframe of shape = (n, n_selected_features) 295 | The transformed dataset with its most important features. 
296 | 297 | """ 298 | try: 299 | if not callable(getattr(self.__regressor, "transform")): 300 | raise ValueError("transform attribute is not callable") 301 | except Exception as e: 302 | raise e 303 | 304 | if self.__fitOK: 305 | 306 | # sanity checks 307 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): 308 | raise ValueError("df must be a DataFrame") 309 | 310 | return self.__regressor.transform(df.values) 311 | else: 312 | raise ValueError("You must call the fit function before !") 313 | 314 | def score(self, df, y, sample_weight=None): 315 | """Return R^2 coefficient of determination of the prediction. 316 | 317 | Parameters 318 | ---------- 319 | df : pandas dataframe of shape = (n, n_features) 320 | The dataset with numerical features. 321 | 322 | y : pandas series of shape = (n,) 323 | The numerical encoded target for classification tasks. 324 | 325 | Returns 326 | ------- 327 | float 328 | R^2 of self.predict(df) wrt. y. 329 | 330 | """ 331 | try: 332 | if not callable(getattr(self.__regressor, "score")): 333 | raise ValueError("score attribute is not callable") 334 | except Exception as e: 335 | raise e 336 | 337 | if self.__fitOK: 338 | 339 | # sanity checks 340 | if((type(df) != pd.SparseDataFrame) and 341 | (type(df) != pd.DataFrame)): 342 | raise ValueError("df must be a DataFrame") 343 | 344 | if (type(y) != pd.core.series.Series): 345 | raise ValueError("y must be a Series") 346 | 347 | return self.__regressor.score(df.values, y, sample_weight) 348 | else: 349 | raise ValueError("You must call the fit function before !") 350 | 351 | def get_estimator(self): 352 | """Return classfier.""" 353 | return copy(self.__regressor) 354 | -------------------------------------------------------------------------------- /mlbox/model/regression/stacking_regressor.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.linear_model import LinearRegression 9 | from sklearn.model_selection import KFold, cross_val_predict 10 | from copy import copy as make_copy 11 | from .regressor import Regressor 12 | import warnings 13 | 14 | 15 | class StackingRegressor(): 16 | """A Stacking regressor. 17 | 18 | A stacking regressor is a regressor that uses the predictions of 19 | several first layer estimators (generated with a cross validation method) 20 | for a second layer estimator. 21 | 22 | 23 | Parameters 24 | ---------- 25 | base_estimators : list, default = [Regressor(strategy="LightGBM"), 26 | Regressor(strategy="RandomForest"), 27 | Regressor(strategy="ExtraTrees")] 28 | List of estimators to fit in the first level using a cross validation. 29 | 30 | level_estimator : object, default = LinearRegression() 31 | The estimator used in second and last level 32 | 33 | n_folds : int, default = 5 34 | Number of folds used to generate the meta features for the training set 35 | 36 | copy : bool, default = False 37 | If true, meta features are added to the original dataset 38 | 39 | random_state : None, int or RandomState. default = 1 40 | Pseudo-random number generator state used for shuffling. 41 | If None, use default numpy RNG for shuffling. 42 | 43 | verbose : bool, default = True 44 | Verbose mode. 
45 | 46 | """ 47 | 48 | def __init__(self, base_estimators=[Regressor(strategy="LightGBM"), 49 | Regressor(strategy="RandomForest"), 50 | Regressor(strategy="ExtraTrees")], 51 | level_estimator=LinearRegression(), n_folds=5, 52 | copy=False, random_state=1, verbose=True): 53 | """Init method for StackingRegressor.""" 54 | self.base_estimators = base_estimators 55 | if(type(base_estimators) != list): 56 | raise ValueError("base_estimators must be a list") 57 | else: 58 | for i, est in enumerate(self.base_estimators): 59 | self.base_estimators[i] = make_copy(est) 60 | 61 | self.level_estimator = level_estimator 62 | 63 | self.n_folds = n_folds 64 | if(type(n_folds) != int): 65 | raise ValueError("n_folds must be an integer") 66 | 67 | self.copy = copy 68 | if(type(copy) != bool): 69 | raise ValueError("copy must be a boolean") 70 | 71 | self.random_state = random_state 72 | if((type(self.random_state) != int) 73 | and (self.random_state is not None)): 74 | raise ValueError("random_state must be either None or an integer") 75 | 76 | self.verbose = verbose 77 | if(type(self.verbose) != bool): 78 | raise ValueError("verbose must be a boolean") 79 | 80 | self.__fitOK = False 81 | self.__fittransformOK = False 82 | 83 | def get_params(self, deep=True): 84 | """Get parameters of a StackingRegressor object.""" 85 | return {'level_estimator': self.level_estimator, 86 | 'base_estimators': self.base_estimators, 87 | 'n_folds': self.n_folds, 88 | 'copy': self.copy, 89 | 'random_state': self.random_state, 90 | 'verbose': self.verbose} 91 | 92 | def set_params(self, **params): 93 | """Set parameters of a StackingRegressor object.""" 94 | self.__fitOK = False 95 | self.__fittransformOK = False 96 | 97 | for k, v in params.items(): 98 | if k not in self.get_params(): 99 | warnings.warn("Invalid parameter a for stacking_regressor " 100 | "StackingRegressor. Parameter IGNORED. Check the" 101 | " list of available parameters with " 102 | "`stacking_regressor.get_params().keys()`") 103 | else: 104 | setattr(self, k, v) 105 | 106 | def fit_transform(self, df_train, y_train): 107 | """Create meta-features for the training dataset. 108 | 109 | Parameters 110 | ---------- 111 | df_train : pandas DataFrame of shape = (n_samples, n_features) 112 | The training dataset. 113 | 114 | y_train : pandas series of shape = (n_samples, ) 115 | The target 116 | 117 | Returns 118 | ------- 119 | pandas DataFrame of shape = (n_samples, 120 | n_features*int(copy)+n_metafeatures) 121 | The transformed training dataset. 
122 | 123 | """ 124 | # sanity checks 125 | if((type(df_train) != pd.SparseDataFrame) & (type(df_train) != pd.DataFrame)): 126 | raise ValueError("df_train must be a DataFrame") 127 | 128 | if(type(y_train) != pd.core.series.Series): 129 | raise ValueError("y_train must be a Series") 130 | 131 | cv = KFold(n_splits=self.n_folds, shuffle=True, 132 | random_state=self.random_state) 133 | 134 | preds = pd.DataFrame([], index=y_train.index) 135 | 136 | if(self.verbose): 137 | print("") 138 | print("[==========================================================" 139 | "===================] LAYER [===============================" 140 | "====================================================]") 141 | print("") 142 | 143 | for c, reg in enumerate(self.base_estimators): 144 | 145 | if(self.verbose): 146 | print("> fitting estimator n°" + str(c + 1) + 147 | " : " + str(reg.get_params()) + " ...") 148 | print("") 149 | 150 | # for each base estimator, we create the meta feature on train set 151 | y_pred = cross_val_predict(estimator=reg, X=df_train, y=y_train, cv=cv) 152 | preds["est" + str(c + 1)] = y_pred 153 | 154 | # and we refit the base estimator on entire train set 155 | reg.fit(df_train, y_train) 156 | 157 | layer = 1 158 | columns = ["layer" + str(layer) + "_" + s for s in preds.columns] 159 | while(len(np.intersect1d(df_train.columns, columns)) > 0): 160 | layer = layer + 1 161 | columns = ["layer" + str(layer) + "_" + s for s in preds.columns] 162 | preds.columns = ["layer" + str(layer) + "_" + s for s in preds.columns] 163 | 164 | self.__fittransformOK = True 165 | 166 | if(self.copy): 167 | # we keep also the initial features 168 | return pd.concat([df_train, preds], axis=1) 169 | 170 | else: 171 | return preds # we keep only the meta features 172 | 173 | def transform(self, df_test): 174 | """Create meta-features for the test dataset. 175 | 176 | Parameters 177 | ---------- 178 | df_test : pandas DataFrame of shape = (n_samples_test, n_features) 179 | The test dataset. 180 | 181 | Returns 182 | ------- 183 | pandas DataFrame of shape = (n_samples_test, 184 | n_features*int(copy)+n_metafeatures) 185 | The transformed test dataset. 186 | 187 | """ 188 | # sanity checks 189 | if((type(df_test) != pd.SparseDataFrame) and 190 | (type(df_test) != pd.DataFrame)): 191 | raise ValueError("df_test must be a DataFrame") 192 | 193 | if(self.__fittransformOK): 194 | 195 | preds_test = pd.DataFrame([], index=df_test.index) 196 | 197 | for c, reg in enumerate(self.base_estimators): 198 | 199 | # we predict the meta feature on test set 200 | y_pred_test = reg.predict(df_test) 201 | preds_test["est" + str(c + 1)] = y_pred_test 202 | 203 | layer = 1 204 | columns = ["layer" + str(layer) + "_" + s 205 | for s in preds_test.columns] 206 | 207 | while(len(np.intersect1d(df_test.columns, columns)) > 0): 208 | layer = layer + 1 209 | columns = ["layer" + str(layer) + "_" + s 210 | for s in preds_test.columns] 211 | 212 | preds_test.columns = [ 213 | "layer" + str(layer) + "_" + s for s in preds_test.columns] 214 | 215 | if(self.copy): 216 | # we keep also the initial features 217 | return pd.concat([df_test, preds_test], axis=1) 218 | else: 219 | return preds_test # we keep only the meta features 220 | 221 | else: 222 | raise ValueError("Call fit_transform before !") 223 | 224 | def fit(self, df_train, y_train): 225 | """Fit the first level estimators and the second level estimator on X. 
226 | 227 | Parameters 228 | ---------- 229 | df_train : pandas DataFrame of shape (n_samples, n_features) 230 | Input data 231 | 232 | y_train : pandas series of shape = (n_samples, ) 233 | The target 234 | 235 | Returns 236 | ------- 237 | object 238 | self 239 | 240 | """ 241 | # Fit the base estimators 242 | df_train = self.fit_transform(df_train, y_train) 243 | 244 | if(self.verbose): 245 | print("") 246 | print("[==========================================================" 247 | "===============] PREDICTION LAYER [========================" 248 | "====================================================]") 249 | print("") 250 | print("> fitting estimator : " + 251 | str(self.level_estimator.get_params()) + " ...") 252 | print("") 253 | 254 | # we fit the second level estimator 255 | self.level_estimator.fit(df_train.values, y_train.values) 256 | 257 | self.__fitOK = True 258 | 259 | return self 260 | 261 | 262 | def predict(self, df_test): 263 | """Predict regression target for X_test using the meta-features. 264 | 265 | Parameters 266 | ---------- 267 | df_test : pandas DataFrame of shape = (n_samples_test, n_features) 268 | The testing samples 269 | 270 | Returns 271 | ------- 272 | array of shape = (n_samples_test, ) 273 | The predicted values. 274 | 275 | """ 276 | if(self.__fitOK): 277 | # we predict the meta features on test set 278 | df_test = self.transform(df_test) 279 | 280 | # we predict the target using the meta features 281 | return self.level_estimator.predict(df_test) 282 | 283 | else: 284 | raise ValueError("Call fit before !") 285 | -------------------------------------------------------------------------------- /mlbox/optimisation/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimiser import * 2 | -------------------------------------------------------------------------------- /mlbox/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | from .predictor import * 2 | 3 | -------------------------------------------------------------------------------- /mlbox/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .drift_thresholder import * 2 | from .reader import * 3 | 4 | -------------------------------------------------------------------------------- /mlbox/preprocessing/drift/__init__.py: -------------------------------------------------------------------------------- 1 | from .drift_estimator import * 2 | from .drift_threshold import * 3 | -------------------------------------------------------------------------------- /mlbox/preprocessing/drift/drift_estimator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Authors: Axel ARONIO DE ROMBLAY 3 | # Alexis BONDU 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict 11 | 12 | class DriftEstimator(): 13 | 14 | """Estimates the drift between two datasets 15 | 16 | 17 | Parameters 18 | ---------- 19 | estimator : classifier, defaut = RandomForestClassifier(n_estimators = 50, n_jobs=-1, max_features=1., min_samples_leaf = 5, max_depth = 5) 20 | The estimator that estimates the drift between two datasets 21 | 22 | n_folds : int, defaut = 2 23 | Number of folds used to estimate the drift 24 
| 25 | stratify : bool, defaut = True 26 | Whether the cv is stratified (same number of train and test samples within each fold) 27 | 28 | random_state : int, defaut = 1 29 | Random state for cv 30 | """ 31 | 32 | def __init__(self, 33 | estimator=RandomForestClassifier(n_estimators=50, 34 | n_jobs=-1, 35 | max_features=1., 36 | min_samples_leaf=5, 37 | max_depth=5), 38 | n_folds=2, 39 | stratify=True, 40 | random_state=1): 41 | 42 | self.estimator = estimator 43 | self.n_folds = n_folds 44 | self.stratify = stratify 45 | self.random_state = random_state 46 | self.__cv = None 47 | self.__pred = None 48 | self.__target = None 49 | self.__fitOK = False 50 | 51 | def get_params(self): 52 | 53 | return {'estimator': self.estimator, 54 | 'n_folds': self.n_folds, 55 | 'stratify': self.stratify, 56 | 'random_state': self.random_state} 57 | 58 | def set_params(self, **params): 59 | 60 | if('estimator' in params.keys()): 61 | self.estimator = params['estimator'] 62 | if('n_folds' in params.keys()): 63 | self.n_folds = params['n_folds'] 64 | if('stratify' in params.keys()): 65 | self.stratify = params['stratify'] 66 | if('random_state' in params.keys()): 67 | self.random_state = params['random_state'] 68 | 69 | def fit(self, df_train, df_test): 70 | 71 | """ 72 | Computes the drift between the two datasets 73 | 74 | Parameters 75 | ---------- 76 | df_train : pandas dataframe of shape = (n_train, p) 77 | The train set 78 | 79 | df_test : pandas dataframe of shape = (n_test, p) 80 | The test set 81 | 82 | Returns 83 | ------- 84 | self : object 85 | Returns self. 86 | """ 87 | 88 | df_train["target"] = 0 89 | df_test["target"] = 1 90 | 91 | self.__target = pd.concat((df_train.target, df_test.target), 92 | ignore_index=True) 93 | 94 | if self.stratify: 95 | self.__cv = StratifiedKFold(n_splits=self.n_folds, 96 | shuffle=True, 97 | random_state=self.random_state) 98 | else: 99 | self.__cv = KFold(n_splits=self.n_folds, 100 | shuffle=True, 101 | random_state=self.random_state) 102 | 103 | X_tmp = pd.concat((df_train, df_test), 104 | ignore_index=True).drop(['target'], axis=1) 105 | 106 | self.__pred = cross_val_predict(estimator=self.estimator, 107 | X=X_tmp, 108 | y=self.__target, 109 | cv=self.__cv, 110 | method="predict_proba")[:,1] 111 | 112 | del df_train["target"] 113 | del df_test["target"] 114 | 115 | self.__fitOK = True 116 | 117 | return self 118 | 119 | def score(self): 120 | 121 | """Returns the global drift measure between two datasets. 122 | 123 | 0. = No drift. 1. 
= Maximal Drift 124 | 125 | Returns 126 | ------- 127 | float 128 | The drift measure 129 | """ 130 | 131 | S = [] 132 | 133 | if self.__fitOK: 134 | 135 | X_zeros = np.zeros(len(self.__target)) 136 | 137 | for train_index, test_index in self.__cv.split(X=X_zeros, 138 | y=self.__target): 139 | 140 | S.append(roc_auc_score(self.__target.iloc[test_index], 141 | self.__pred[test_index])) 142 | 143 | return (max(np.mean(S), 1-np.mean(S))-0.5) * 2 144 | 145 | else: 146 | raise ValueError('Call the fit function before !') 147 | 148 | def predict(self): 149 | 150 | """Returns the probabilities that the sample belongs to the test dataset 151 | 152 | Returns 153 | ------- 154 | Array of shape = (n_train+n_test,) 155 | The probabilities 156 | """ 157 | 158 | if self.__fitOK: 159 | 160 | return self.__pred 161 | 162 | else: 163 | raise ValueError('Call the fit function before !') 164 | -------------------------------------------------------------------------------- /mlbox/preprocessing/drift/drift_threshold.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Authors: Axel ARONIO DE ROMBLAY 3 | # Alexis BONDU 4 | # License: BSD 3 clause 5 | import sys 6 | 7 | from joblib import Parallel, delayed 8 | from sklearn.tree import DecisionTreeClassifier 9 | 10 | from .drift_estimator import DriftEstimator 11 | 12 | 13 | def sync_fit(df_train, df_test, estimator, n_folds=2, stratify=True, random_state=1): 14 | """Compute the univariate drifts between df_train and df_test datasets. 15 | 16 | Multi-threaded version. 17 | 18 | Parameters 19 | ---------- 20 | df_train : pandas dataframe of shape = (n_train, p) 21 | The train set 22 | 23 | df_test : pandas dataframe of shape = (n_test, p) 24 | The test set 25 | 26 | estimator : classifier, defaut = RandomForestClassifier(n_estimators = 50, 27 | n_jobs=-1, 28 | max_features=1., 29 | min_samples_leaf = 5, 30 | max_depth = 5) 31 | The estimator that estimates the drift between two datasets 32 | 33 | n_folds : int, default = 2 34 | Number of folds used to estimate the drift 35 | 36 | stratify : bool, default = True 37 | Whether the cv is stratified (same number of train and test samples 38 | within each fold) 39 | 40 | random_state : int, default = 1 41 | Random state for cv 42 | 43 | Returns 44 | ------- 45 | float 46 | drift measure 47 | 48 | """ 49 | # We will compute the indices of the CV in each thread 50 | de = DriftEstimator(estimator, n_folds, stratify, random_state) 51 | de.fit(df_train, df_test) 52 | 53 | return de.score() 54 | 55 | 56 | class DriftThreshold(): 57 | """Estimate the univariate drift between two datasets. 58 | 59 | Estimate the univariate drift between two datasets 60 | and select features with low drifts 61 | 62 | Parameters 63 | ---------- 64 | threshold : float, defaut = 0.6 65 | The drift threshold (univariate drift below are kept) 66 | Must be between 0. and 1. 67 | 68 | subsample : float, defaut = 1. 69 | Subsampling parameter for the datasets. 70 | Must be between 0. and 1. 71 | 72 | estimator : classifier, default = DecisionTreeClassifier(max_depth=6) 73 | The estimator that estimates the drift between two datasets. 74 | 75 | n_folds : int, default = 2 76 | Number of folds used to estimate the drift. 77 | 78 | stratify : bool, default = True 79 | Whether the cv is stratified (same number of train and test samples 80 | within each fold) 81 | 82 | random_state : int, default = 1 83 | Seed for for cv and subsampling. 
56 | class DriftThreshold(): 57 | """Estimate the univariate drift between two datasets. 58 | 59 | Estimate the univariate drift between two datasets 60 | and select features with low drifts 61 | 62 | Parameters 63 | ---------- 64 | threshold : float, default = 0.6 65 | The drift threshold (features whose univariate drift is below it are kept) 66 | Must be between 0. and 1. 67 | 68 | subsample : float, default = 1. 69 | Subsampling parameter for the datasets. 70 | Must be between 0. and 1. 71 | 72 | estimator : classifier, default = DecisionTreeClassifier(max_depth=6) 73 | The estimator that estimates the drift between two datasets. 74 | 75 | n_folds : int, default = 2 76 | Number of folds used to estimate the drift. 77 | 78 | stratify : bool, default = True 79 | Whether the cv is stratified (same number of train and test samples 80 | within each fold) 81 | 82 | random_state : int, default = 1 83 | Seed for cv and subsampling. 84 | 85 | n_jobs : int, default = -1 86 | Number of cores used for processing (-1 for all cores) 87 | 88 | """ 89 | 90 | def __init__(self, 91 | threshold=0.6, 92 | subsample=1., 93 | estimator=DecisionTreeClassifier(max_depth=6), 94 | n_folds=2, 95 | stratify=True, 96 | random_state=1, 97 | n_jobs=-1): 98 | """Init a DriftThreshold object.""" 99 | self.threshold = threshold 100 | self.subsample = subsample 101 | self.estimator = estimator 102 | self.n_folds = n_folds 103 | self.stratify = stratify 104 | self.random_state = random_state 105 | self.n_jobs = n_jobs 106 | self.__Ddrifts = dict() 107 | self.__fitOK = False 108 | 109 | def get_params(self): 110 | """Get parameters of a DriftThreshold object.""" 111 | return {'threshold': self.threshold, 112 | 'subsample': self.subsample, 113 | 'estimator': self.estimator, 114 | 'n_folds': self.n_folds, 115 | 'stratify': self.stratify, 116 | 'random_state': self.random_state, 117 | 'n_jobs': self.n_jobs} 118 | 119 | def set_params(self, **params): 120 | """Set parameters of a DriftThreshold object.""" 121 | if('threshold' in params.keys()): 122 | self.threshold = params['threshold'] 123 | if('subsample' in params.keys()): 124 | self.subsample = params['subsample'] 125 | if('estimator' in params.keys()): 126 | self.estimator = params['estimator'] 127 | if('n_folds' in params.keys()): 128 | self.n_folds = params['n_folds'] 129 | if('stratify' in params.keys()): 130 | self.stratify = params['stratify'] 131 | if('random_state' in params.keys()): 132 | self.random_state = params['random_state'] 133 | if('n_jobs' in params.keys()): 134 | self.n_jobs = params['n_jobs'] 135 | 136 | def fit(self, df_train, df_test): 137 | """Compute the univariate drifts between df_train and df_test datasets. 138 | 139 | Parameters 140 | ---------- 141 | df_train : pandas dataframe of shape = (n_train, p) 142 | The train set 143 | 144 | df_test : pandas dataframe of shape = (n_test, p) 145 | The test set 146 | 147 | Returns 148 | ------- 149 | None 150 | 151 | """ 152 | self.__Ddrifts = dict() 153 | 154 | if sys.platform == 'win32': 155 | Ldrifts = [sync_fit(df_train.sample(frac=self.subsample)[[col]], 156 | df_test.sample(frac=self.subsample)[[col]], 157 | self.estimator, 158 | self.n_folds, 159 | self.stratify, 160 | self.random_state) 161 | for col in df_train.columns] 162 | else: 163 | Ldrifts = Parallel(n_jobs=self.n_jobs)(delayed(sync_fit) 164 | (df_train.sample(frac=self.subsample)[[col]], 165 | df_test.sample(frac=self.subsample)[[col]], 166 | self.estimator, 167 | self.n_folds, 168 | self.stratify, 169 | self.random_state) 170 | for col in df_train.columns) 171 | 172 | for i, col in enumerate(df_train.columns): 173 | 174 | self.__Ddrifts[col] = Ldrifts[i] 175 | 176 | del Ldrifts 177 | 178 | self.__fitOK = True 179 | 180 | def transform(self, df): 181 | """Select the features with low drift. 182 | 183 | Parameters 184 | ---------- 185 | df : pandas dataframe 186 | A dataset with the same features 187 | 188 | Returns 189 | ------- 190 | pandas DataFrame 191 | The transformed dataframe 192 | 193 | """ 194 | if self.__fitOK: 195 | 196 | selected_col = [] 197 | 198 | for i, col in enumerate(df.columns): 199 | 200 | if (self.__Ddrifts[col] < self.threshold): 201 | selected_col.append(col) 202 | 203 | return df[selected_col] 204 | 205 | else: 206 | raise ValueError('Call the fit function before !') 207 | 208 | def get_support(self, complement=False): 209 | """Return the variables kept or dropped.
210 | 211 | Parameters 212 | ---------- 213 | complement : bool, default = False 214 | If True, returns the features to drop 215 | If False, returns the features to keep 216 | 217 | Returns 218 | ------- 219 | list 220 | The list of features to keep or to drop. 221 | 222 | """ 223 | if self.__fitOK: 224 | 225 | keepList = [] 226 | dropList = [] 227 | 228 | for col in self.__Ddrifts: 229 | 230 | if (self.__Ddrifts[col] < self.threshold): 231 | keepList.append(col) 232 | else: 233 | dropList.append(col) 234 | 235 | if complement: 236 | return dropList 237 | else: 238 | return keepList 239 | else: 240 | raise ValueError('Call the fit function before !') 241 | 242 | def drifts(self): 243 | """Return the univariate drifts for all variables. 244 | 245 | Returns 246 | ------- 247 | dict 248 | The dictionary of drift measures for each feature 249 | 250 | """ 251 | if self.__fitOK: 252 | 253 | return self.__Ddrifts 254 | 255 | else: 256 | raise ValueError('Call the fit function before !') 257 |
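# Usage sketch (comment only, not executed): keeping the stable columns of a
# train/test pair with the defaults above. It assumes df_train and df_test are
# numerical pandas dataframes sharing the same columns:
#
#     dt = DriftThreshold(threshold=0.6)
#     dt.fit(df_train, df_test)
#     dt.drifts()                        # dict of per-column drift measures
#     dt.get_support(complement=True)    # columns that would be dropped
#     df_train = dt.transform(df_train)
#     df_test = dt.transform(df_test)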
-------------------------------------------------------------------------------- /mlbox/preprocessing/drift_thresholder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import os 6 | import time 7 | from sklearn.pipeline import Pipeline 8 | from .drift import DriftThreshold 9 | from ..encoding.na_encoder import NA_encoder 10 | from ..encoding.categorical_encoder import Categorical_encoder 11 | 12 | 13 | class Drift_thresholder(): 14 | 15 | """Automatically drops ids and drifting variables between train and test datasets. 16 | 17 | The drops are applied to both train and test datasets. The list of drift coefficients is available and 18 | saved as "drifts.txt". To get familiar with drift: 19 | https://github.com/AxeldeRomblay/MLBox/blob/master/docs/webinars/features.pdf 20 | 21 | Parameters 22 | ---------- 23 | threshold : float, default = 0.6 24 | Drift threshold under which features are kept. Must be between 0. and 1. 25 | The lower it is, the more stable the kept variables are: a feature with 26 | a drift measure of 0. is very stable and one with a drift measure of 1. is highly unstable. 27 | 28 | inplace : bool, default = False 29 | If True, train and test datasets are transformed in place. Returns None. 30 | Otherwise, train and test datasets are not transformed. Returns a new dictionary with 31 | cleaned datasets. 32 | 33 | verbose : bool, default = True 34 | Verbose mode 35 | 36 | to_path : str, default = "save" 37 | Name of the folder where the list of drift coefficients is saved. 38 | """ 39 | 40 | def __init__(self, 41 | threshold=0.6, 42 | inplace=False, 43 | verbose=True, 44 | to_path="save"): 45 | 46 | self.threshold = threshold 47 | self.inplace = inplace 48 | self.verbose = verbose 49 | self.to_path = to_path 50 | self.__Ddrifts = {} 51 | self.__fitOK = False 52 | 53 | 54 | def fit_transform(self, df): 55 | 56 | """Fits and transforms train and test datasets. 57 | 58 | Automatically drops ids and drifting variables between train and test datasets. 59 | The list of drift coefficients is available and saved as "drifts.txt" 60 | 61 | Parameters 62 | ---------- 63 | df : dict 64 | Dictionary containing : 65 | 66 | - 'train' : pandas dataframe for train dataset 67 | - 'test' : pandas dataframe for test dataset 68 | - 'target' : pandas Series for the target on train set 69 | 70 | Returns 71 | ------- 72 | dict 73 | Dictionary containing : 74 | 75 | - 'train' : transformed pandas dataframe for train dataset 76 | - 'test' : transformed pandas dataframe for test dataset 77 | - 'target' : pandas Series for the target on train set 78 | """ 79 | 80 | ###################################################### 81 | # Deleting IDs 82 | ###################################################### 83 | 84 | # Exception 85 | 86 | if (df["test"].shape[0] == 0): 87 | if (self.verbose): 88 | print("") 89 | print("You have no test dataset...") 90 | 91 | return df 92 | 93 | else: 94 | 95 | start_time = time.time() 96 | 97 | ds = DriftThreshold(self.threshold) 98 | na = NA_encoder(numerical_strategy=0) 99 | ca = Categorical_encoder() 100 | 101 | pp = Pipeline([("na", na), ("ca", ca)]) 102 | pp.fit(df['train'], None) 103 | 104 | # Deleting IDs with drift threshold method 105 | 106 | if (self.verbose): 107 | print("") 108 | print("computing drifts ...") 109 | 110 | ds.fit(pp.transform(df['train']), pp.transform(df['test'])) 111 | 112 | if (self.verbose): 113 | print("CPU time: %s seconds" % (time.time() - start_time)) 114 | print("") 115 | 116 | self.__fitOK = True 117 | self.__Ddrifts = ds.drifts() 118 | drifts_top = sorted(ds.drifts().items(), 119 | key=lambda x: x[1], 120 | reverse=True)[:10] 121 | 122 | if (self.verbose): 123 | print("> Top 10 drifts") 124 | print("") 125 | for d in range(len(drifts_top)): 126 | print(drifts_top[d]) 127 | 128 | if (self.verbose): 129 | print("") 130 | print("> Deleted " 131 | "variables : " + str(ds.get_support(complement=True))) 132 | 133 | ###################################################### 134 | # Dumping Encoders into directory 135 | ###################################################### 136 | 137 | if (self.to_path is not None): 138 | 139 | try: 140 | os.mkdir(self.to_path) 141 | except OSError: 142 | pass 143 | 144 | file = open(self.to_path + '/drifts.txt', "w") 145 | file.write("\n") 146 | file.write( 147 | "*******************************************" 148 | " Drifts coefficients " 149 | "*******************************************\n") 150 | file.write("\n") 151 | 152 | for var, d in sorted(ds.drifts().items(), 153 | key=lambda x: x[1], 154 | reverse=True): 155 | file.write(str(var) + " = " + str(d) + '\n') 156 | 157 | file.close() 158 | 159 | if (self.verbose): 160 | print("> Drift coefficients dumped into directory : " + self.to_path) 161 | 162 | # Returning datasets with no IDs 163 | 164 | if (self.inplace): 165 | 166 | df['train'] = ds.transform(df['train']) 167 | df['test'] = ds.transform(df['test']) 168 | 169 | else: 170 | 171 | return {'train': ds.transform(df['train']), 172 | 'test': ds.transform(df['test']), 173 | 'target': df['target']} 174 |
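    # Usage sketch (comment only, not executed): a typical call right after
    # reading the data, as in the tests; the csv paths are placeholders:
    #
    #     from mlbox.preprocessing.reader import Reader
    #     data = Reader(sep=",").train_test_split(Lpath=["train.csv", "test.csv"],
    #                                             target_name="target")
    #     data = Drift_thresholder(threshold=0.6).fit_transform(data)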
175 | def drifts(self): 176 | 177 | """Returns the univariate drifts for all variables. 178 | 179 | Returns 180 | ------- 181 | dict 182 | Dictionary containing the drifts for each feature 183 | """ 184 | 185 | if self.__fitOK: 186 | 187 | return self.__Ddrifts 188 | else: 189 | raise ValueError('Call the fit_transform function before !') 190 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.2 2 | scipy==1.4.1 3 | matplotlib==3.0.3 4 | hyperopt==0.2.3 5 | pandas==0.25.3 6 | joblib==0.14.1 7 | scikit-learn==0.22.1 8 | tensorflow==2.0.0 9 | lightgbm==2.3.1 10 | tables==3.5.2 11 | xlrd==1.2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup file for installing mlbox package.""" 2 | # !/usr/bin/env python 3 | # -*- coding: utf-8 -*- 4 | 5 | from setuptools import setup 6 | 7 | 8 | with open('requirements.txt', 'rt') as fh: 9 | requirements = fh.read().splitlines() 10 | 11 | with open('README.rst') as readme_file: 12 | readme = readme_file.read() 13 | 14 | with open('VERSION.txt') as version_file: 15 | version = version_file.read() 16 | 17 | 18 | setup( 19 | name='mlbox', 20 | version=version, 21 | description="A powerful Automated Machine Learning python library.", 22 | long_description=readme, 23 | author="Axel ARONIO DE ROMBLAY", 24 | author_email='axelderomblay@gmail.com', 25 | url='https://github.com/AxeldeRomblay/mlbox', 26 | packages=['mlbox', 'mlbox.encoding', 'mlbox.model', 27 | 'mlbox.optimisation', 'mlbox.prediction', 28 | 'mlbox.preprocessing', 29 | 'mlbox.model.classification', 30 | 'mlbox.model.regression', 31 | 'mlbox.preprocessing.drift'], 32 | package_dir={'mlbox': 'mlbox', 33 | 'mlbox.encoding': 'mlbox/encoding', 34 | 'mlbox.model': 'mlbox/model', 35 | 'mlbox.optimisation': 'mlbox/optimisation', 36 | 'mlbox.prediction': 'mlbox/prediction', 37 | 'mlbox.preprocessing': 'mlbox/preprocessing', 38 | 'mlbox.model.classification': 'mlbox/model/classification', 39 | 'mlbox.model.regression': 'mlbox/model/regression', 40 | 'mlbox.preprocessing.drift': 'mlbox/preprocessing/drift' 41 | }, 42 | include_package_data=True, 43 | install_requires=requirements, 44 | zip_safe=False, 45 | license='BSD-3', 46 | keywords='mlbox auto-ml stacking pipeline optimisation', 47 | classifiers=[ 48 | 49 | 'Development Status :: 5 - Production/Stable', 50 | 51 | 'Intended Audience :: Developers', 52 | 'Intended Audience :: Science/Research', 53 | 54 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 55 | 'Topic :: Scientific/Engineering :: Information Analysis', 56 | 'Topic :: Software Development :: Libraries :: Python Modules', 57 | 58 | 'License :: OSI Approved :: BSD License', 59 | 60 | 'Natural Language :: English', 61 | 62 | 'Operating System :: MacOS', 63 | 'Operating System :: Microsoft :: Windows', 64 | 'Operating System :: POSIX :: Linux', 65 | 66 | 'Programming Language :: Python :: 3.5', 67 | 'Programming Language :: Python :: 3.6', 68 | 'Programming Language :: Python :: 3.7' 69 | ], 70 | test_suite='tests', 71 | tests_require=requirements 72 | ) 73 | -------------------------------------------------------------------------------- /tests/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/.DS_Store
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/data_for_tests/clean_target.csv: -------------------------------------------------------------------------------- 1 | Survived 2 | 0 3 | 1 4 | 1 5 | 1 6 | 0 7 | 0 8 | 0 9 | 0 10 | 1 11 | 1 12 | 1 13 | 1 14 | 0 15 | 0 16 | 0 17 | 1 18 | 0 19 | 1 20 | 0 21 | 1 22 | 0 23 | 1 24 | 1 25 | 1 26 | 0 27 | 1 28 | 0 29 | 0 30 | 1 31 | 0 32 | 0 33 | 1 34 | 1 35 | 0 36 | 0 37 | 0 38 | 1 39 | 0 40 | 0 41 | 1 42 | 0 43 | 0 44 | 0 45 | 1 46 | 1 47 | 0 48 | 0 49 | 1 50 | 0 51 | 0 52 | 0 53 | 0 54 | 1 55 | 1 56 | 0 57 | 1 58 | 1 59 | 0 60 | 1 61 | 0 62 | 0 63 | 1 64 | 0 65 | 0 66 | 0 67 | 1 68 | 1 69 | 0 70 | 1 71 | 0 72 | 0 73 | 0 74 | 0 75 | 0 76 | 1 77 | 0 78 | 0 79 | 0 80 | 1 81 | 1 82 | 0 83 | 1 84 | 1 85 | 0 86 | 1 87 | 1 88 | 0 89 | 0 90 | 1 91 | 0 92 | 0 93 | 0 94 | 0 95 | 0 96 | 0 97 | 0 98 | 0 99 | 1 100 | 1 101 | 0 102 | 0 103 | 0 104 | 0 105 | 0 106 | 0 107 | 0 108 | 1 109 | 1 110 | 0 111 | 1 112 | 0 113 | 0 114 | 0 115 | 0 116 | 0 117 | 0 118 | 0 119 | 0 120 | 0 121 | 0 122 | 0 123 | 0 124 | 0 125 | 1 126 | 0 127 | 1 128 | 0 129 | 1 130 | 1 131 | 0 132 | 0 133 | 0 134 | 0 135 | 1 136 | 0 137 | 0 138 | 1 139 | 0 140 | 0 141 | 0 142 | 0 143 | 1 144 | 1 145 | 0 146 | 0 147 | 0 148 | 1 149 | 0 150 | 0 151 | 0 152 | 0 153 | 1 154 | 0 155 | 0 156 | 0 157 | 0 158 | 1 159 | 0 160 | 0 161 | 0 162 | 0 163 | 1 164 | 0 165 | 0 166 | 0 167 | 1 168 | 1 169 | 0 170 | 0 171 | 0 172 | 0 173 | 0 174 | 1 175 | 0 176 | 0 177 | 0 178 | 0 179 | 0 180 | 0 181 | 0 182 | 0 183 | 0 184 | 0 185 | 1 186 | 1 187 | 0 188 | 1 189 | 1 190 | 0 191 | 0 192 | 1 193 | 0 194 | 1 195 | 1 196 | 1 197 | 1 198 | 0 199 | 0 200 | 1 201 | 0 202 | 0 203 | 0 204 | 0 205 | 0 206 | 1 207 | 0 208 | 0 209 | 1 210 | 1 211 | 1 212 | 0 213 | 1 214 | 0 215 | 0 216 | 0 217 | 1 218 | 1 219 | 0 220 | 1 221 | 0 222 | 1 223 | 0 224 | 0 225 | 0 226 | 1 227 | 0 228 | 1 229 | 0 230 | 0 231 | 0 232 | 1 233 | 0 234 | 0 235 | 1 236 | 0 237 | 0 238 | 0 239 | 1 240 | 0 241 | 0 242 | 0 243 | 1 244 | 0 245 | 0 246 | 0 247 | 0 248 | 0 249 | 1 250 | 1 251 | 0 252 | 0 253 | 0 254 | 0 255 | 0 256 | 0 257 | 1 258 | 1 259 | 1 260 | 1 261 | 1 262 | 0 263 | 1 264 | 0 265 | 0 266 | 0 267 | 0 268 | 0 269 | 1 270 | 1 271 | 1 272 | 0 273 | 1 274 | 1 275 | 0 276 | 1 277 | 1 278 | 0 279 | 0 280 | 0 281 | 1 282 | 0 283 | 0 284 | 0 285 | 1 286 | 0 287 | 0 288 | 1 289 | 0 290 | 1 291 | 1 292 | 1 293 | 1 294 | 0 295 | 0 296 | 0 297 | 0 298 | 0 299 | 0 300 | 1 301 | 1 302 | 1 303 | 1 304 | 0 305 | 1 306 | 0 307 | 1 308 | 1 309 | 1 310 | 0 311 | 1 312 | 1 313 | 1 314 | 0 315 | 0 316 | 0 317 | 1 318 | 1 319 | 0 320 | 1 321 | 1 322 | 0 323 | 0 324 | 1 325 | 1 326 | 0 327 | 1 328 | 0 329 | 1 330 | 1 331 | 1 332 | 1 333 | 0 334 | 0 335 | 0 336 | 1 337 | 0 338 | 0 339 | 1 340 | 1 341 | 0 342 | 1 343 | 1 344 | 0 345 | 0 346 | 0 347 | 1 348 | 1 349 | 1 350 | 1 351 | 0 352 | 0 353 | 0 354 | 0 355 | 0 356 | 0 357 | 0 358 | 1 359 | 0 360 | 1 361 | 1 362 | 0 363 | 0 364 | 0 365 | 0 366 | 0 367 | 0 368 | 1 369 | 1 370 | 1 371 | 1 372 | 1 373 | 0 374 | 0 375 | 0 376 | 0 377 | 1 378 | 1 379 | 0 380 | 0 381 | 0 382 | 1 383 | 1 384 | 0 385 | 1 386 | 0 387 | 0 388 | 0 389 | 1 390 | 0 391 | 1 392 | 1 393 | 1 394 | 0 395 | 1 396 | 1 397 | 0 398 | 0 399 | 0 400 | 0 401 | 1 402 | 1 403 | 0 404 | 0 405 | 0 
406 | 0 407 | 0 408 | 0 409 | 1 410 | 0 411 | 0 412 | 0 413 | 0 414 | 1 415 | 0 416 | 1 417 | 0 418 | 1 419 | 1 420 | 0 421 | 0 422 | 0 423 | 0 424 | 0 425 | 0 426 | 0 427 | 0 428 | 1 429 | 1 430 | 0 431 | 1 432 | 1 433 | 1 434 | 1 435 | 0 436 | 0 437 | 1 438 | 0 439 | 1 440 | 0 441 | 0 442 | 1 443 | 0 444 | 0 445 | 1 446 | 1 447 | 1 448 | 1 449 | 1 450 | 1 451 | 1 452 | 0 453 | 0 454 | 0 455 | 1 456 | 0 457 | 1 458 | 0 459 | 1 460 | 1 461 | 0 462 | 1 463 | 0 464 | 0 465 | 0 466 | 0 467 | 0 468 | 0 469 | 0 470 | 0 471 | 1 472 | 0 473 | 0 474 | 1 475 | 1 476 | 0 477 | 0 478 | 0 479 | 0 480 | 0 481 | 1 482 | 0 483 | 0 484 | 0 485 | 1 486 | 1 487 | 0 488 | 1 489 | 0 490 | 0 491 | 1 492 | 0 493 | 0 494 | 0 495 | 0 496 | 0 497 | 0 498 | 1 499 | 0 500 | 0 501 | 0 502 | 0 503 | 0 504 | 0 505 | 0 506 | 1 507 | 0 508 | 1 509 | 1 510 | 0 511 | 1 512 | 1 513 | 0 514 | 1 515 | 1 516 | 0 517 | 0 518 | 1 519 | 0 520 | 1 521 | 0 522 | 1 523 | 0 524 | 0 525 | 1 526 | 0 527 | 0 528 | 1 529 | 0 530 | 0 531 | 0 532 | 1 533 | 0 534 | 0 535 | 1 536 | 0 537 | 1 538 | 0 539 | 1 540 | 0 541 | 1 542 | 1 543 | 0 544 | 0 545 | 1 546 | 0 547 | 0 548 | 1 549 | 1 550 | 0 551 | 1 552 | 1 553 | 0 554 | 0 555 | 1 556 | 1 557 | 0 558 | 1 559 | 0 560 | 1 561 | 1 562 | 0 563 | 0 564 | 0 565 | 0 566 | 0 567 | 0 568 | 0 569 | 0 570 | 0 571 | 1 572 | 1 573 | 1 574 | 1 575 | 1 576 | 0 577 | 0 578 | 1 579 | 1 580 | 0 581 | 1 582 | 1 583 | 1 584 | 0 585 | 0 586 | 0 587 | 1 588 | 0 589 | 1 590 | 0 591 | 0 592 | 0 593 | 1 594 | 0 595 | 0 596 | 0 597 | 0 598 | 1 599 | 0 600 | 0 601 | 1 602 | 1 603 | 0 604 | 0 605 | 0 606 | 1 607 | 0 608 | 0 609 | 1 610 | 1 611 | 1 612 | 0 613 | 0 614 | 1 615 | 0 616 | 0 617 | 1 618 | 0 619 | 0 620 | 1 621 | 0 622 | 0 623 | 1 624 | 1 625 | 0 626 | 0 627 | 0 628 | 0 629 | 1 630 | 0 631 | 0 632 | 1 633 | 0 634 | 1 635 | 0 636 | 0 637 | 1 638 | 0 639 | 0 640 | 0 641 | 0 642 | 0 643 | 1 644 | 0 645 | 1 646 | 1 647 | 1 648 | 0 649 | 1 650 | 0 651 | 1 652 | 0 653 | 1 654 | 0 655 | 1 656 | 0 657 | 0 658 | 0 659 | 0 660 | 0 661 | 0 662 | 1 663 | 0 664 | 0 665 | 0 666 | 1 667 | 0 668 | 0 669 | 0 670 | 0 671 | 1 672 | 1 673 | 0 674 | 0 675 | 1 676 | 0 677 | 0 678 | 0 679 | 1 680 | 0 681 | 1 682 | 0 683 | 1 684 | 0 685 | 0 686 | 0 687 | 0 688 | 0 689 | 0 690 | 0 691 | 1 692 | 1 693 | 1 694 | 1 695 | 0 696 | 0 697 | 0 698 | 0 699 | 1 700 | 0 701 | 0 702 | 1 703 | 1 704 | 0 705 | 0 706 | 0 707 | 0 708 | 1 709 | 1 710 | 1 711 | 1 712 | 1 713 | 0 714 | 1 715 | 0 716 | 0 717 | 0 718 | 1 719 | 1 720 | 0 721 | 0 722 | 1 723 | 0 724 | 0 725 | 0 726 | 1 727 | 0 728 | 1 729 | 1 730 | 0 731 | 0 732 | 1 733 | 0 734 | 0 735 | 0 736 | 0 737 | 0 738 | 0 739 | 1 740 | 0 741 | 0 742 | 1 743 | 0 744 | 1 745 | 0 746 | 1 747 | 0 748 | 0 749 | 1 750 | 0 751 | 0 752 | 1 753 | 1 754 | 0 755 | 0 756 | 1 757 | 1 758 | 0 759 | 0 760 | 0 761 | 1 762 | 0 763 | 0 764 | 1 765 | 1 766 | 0 767 | 1 768 | 0 769 | 0 770 | 0 771 | 0 772 | 0 773 | 0 774 | 0 775 | 0 776 | 1 777 | 0 778 | 0 779 | 1 780 | 0 781 | 1 782 | 1 783 | 1 784 | 0 785 | 0 786 | 0 787 | 0 788 | 1 789 | 0 790 | 1 791 | 0 792 | 0 793 | 0 794 | 0 795 | 0 796 | 0 797 | 0 798 | 1 799 | 1 800 | 0 801 | 0 802 | 0 803 | 1 804 | 1 805 | 1 806 | 1 807 | 0 808 | 0 809 | 0 810 | 0 811 | 1 812 | 0 813 | 0 814 | 0 815 | 0 816 | 0 817 | 0 818 | 0 819 | 0 820 | 0 821 | 0 822 | 1 823 | 1 824 | 0 825 | 1 826 | 0 827 | 0 828 | 0 829 | 1 830 | 1 831 | 1 832 | 1 833 | 1 834 | 0 835 | 0 836 | 0 837 | 1 838 | 0 839 | 0 840 | 1 841 | 1 842 | 0 843 | 0 844 | 1 845 | 0 846 | 0 847 | 0 848 | 0 849 | 0 
850 | 0 851 | 1 852 | 0 853 | 0 854 | 0 855 | 1 856 | 0 857 | 1 858 | 1 859 | 1 860 | 1 861 | 0 862 | 0 863 | 0 864 | 1 865 | 0 866 | 0 867 | 1 868 | 1 869 | 0 870 | 0 871 | 1 872 | 0 873 | 1 874 | 0 875 | 0 876 | 1 877 | 1 878 | 0 879 | 0 880 | 0 881 | 1 882 | 1 883 | 0 884 | 0 885 | 0 886 | 0 887 | 0 888 | 0 889 | 1 890 | 0 891 | 1 892 | 0 893 | -------------------------------------------------------------------------------- /tests/data_for_tests/clean_test.csv: -------------------------------------------------------------------------------- 1 | ,Age,Fare,Parch,Pclass,SibSp 2 | 0,34.5,7.8292,0,3,0 3 | 1,47,7,0,3,1 4 | 2,62,9.6875,0,2,0 5 | 3,27,8.6625,0,3,0 6 | 4,22,12.2875,1,3,1 7 | 5,14,9.225,0,3,0 8 | 6,30,7.6292,0,3,0 9 | 7,26,29,1,2,1 10 | 8,18,7.2292,0,3,0 11 | 9,21,24.15,0,3,2 12 | 10,24,7.8958,0,3,0 13 | 11,46,26,0,1,0 14 | 12,23,82.2667,0,1,1 15 | 13,63,26,0,2,1 16 | 14,47,61.175,0,1,1 17 | 15,24,27.7208,0,2,1 18 | 16,35,12.35,0,2,0 19 | 17,21,7.225,0,3,0 20 | 18,27,7.925,0,3,1 21 | 19,45,7.225,0,3,0 22 | 20,55,59.4,0,1,1 23 | 21,9,3.1708,1,3,0 24 | 22,24,31.6833,0,1,0 25 | 23,21,61.3792,1,1,0 26 | 24,48,262.375,3,1,1 27 | 25,50,14.5,0,3,1 28 | 26,22,61.9792,1,1,0 29 | 27,22.5,7.225,0,3,0 30 | 28,41,30.5,0,1,0 31 | 29,24,21.6792,0,3,2 32 | 30,50,26,0,2,1 33 | 31,24,31.5,0,2,2 34 | 32,33,20.575,2,3,1 35 | 33,24,23.45,2,3,1 36 | 34,30,57.75,0,1,1 37 | 35,18.5,7.2292,0,3,0 38 | 36,24,8.05,0,3,0 39 | 37,21,8.6625,0,3,0 40 | 38,25,9.5,0,3,0 41 | 39,24,56.4958,0,3,0 42 | 40,39,13.4167,1,3,0 43 | 41,24,26.55,0,1,0 44 | 42,41,7.85,0,3,0 45 | 43,30,13,0,2,0 46 | 44,45,52.5542,0,1,1 47 | 45,25,7.925,0,3,0 48 | 46,45,29.7,0,1,0 49 | 47,24,7.75,0,3,0 50 | 48,60,76.2917,0,1,0 51 | 49,36,15.9,2,3,0 52 | 50,24,60,0,1,1 53 | 51,27,15.0333,0,2,0 54 | 52,20,23,1,2,2 55 | 53,28,263,2,1,3 56 | 54,24,15.5792,0,2,0 57 | 55,10,29.125,1,3,4 58 | 56,35,7.8958,0,3,0 59 | 57,25,7.65,0,3,0 60 | 58,24,16.1,0,3,1 61 | 59,36,262.375,0,1,0 62 | 60,17,7.8958,0,3,0 63 | 61,32,13.5,0,2,0 64 | 62,18,7.75,0,3,0 65 | 63,22,7.725,0,3,0 66 | 64,13,262.375,2,1,2 67 | 65,24,21,0,2,0 68 | 66,18,7.8792,0,3,0 69 | 67,47,42.4,0,1,0 70 | 68,31,28.5375,0,1,0 71 | 69,60,263,4,1,1 72 | 70,24,7.75,0,3,0 73 | 71,21,7.8958,0,3,0 74 | 72,29,7.925,0,3,0 75 | 73,28.5,27.7208,0,1,0 76 | 74,35,211.5,0,1,0 77 | 75,32.5,211.5,0,1,0 78 | 76,24,8.05,0,3,0 79 | 77,55,25.7,0,1,2 80 | 78,30,13,0,2,0 81 | 79,24,7.75,0,3,0 82 | 80,6,15.2458,1,3,1 83 | 81,67,221.7792,0,1,1 84 | 82,49,26,0,1,0 85 | 83,24,7.8958,0,3,0 86 | 84,24,10.7083,0,2,0 87 | 85,24,14.4542,0,3,1 88 | 86,27,7.8792,0,3,0 89 | 87,18,8.05,0,3,0 90 | 88,24,7.75,0,3,0 91 | 89,2,23,1,2,1 92 | 90,22,13.9,0,3,1 93 | 91,24,7.775,0,3,0 94 | 92,27,52,2,1,1 95 | 93,24,8.05,0,3,0 96 | 94,25,26,0,1,0 97 | 95,25,7.7958,0,3,0 98 | 96,76,78.85,0,1,1 99 | 97,29,7.925,0,3,0 100 | 98,20,7.8542,0,3,0 101 | 99,33,8.05,0,3,0 102 | 100,43,55.4417,0,1,1 103 | 101,27,26,0,2,1 104 | 102,24,7.75,0,3,0 105 | 103,26,7.775,0,3,0 106 | 104,16,8.5167,1,3,1 107 | 105,28,22.525,0,3,0 108 | 106,21,7.8208,0,3,0 109 | 107,24,7.75,0,3,0 110 | 108,24,8.7125,0,3,0 111 | 109,18.5,13,0,2,0 112 | 110,41,15.0458,0,2,0 113 | 111,24,7.7792,0,3,0 114 | 112,36,31.6792,0,1,0 115 | 113,18.5,7.2833,0,3,0 116 | 114,63,221.7792,0,1,1 117 | 115,18,14.4542,0,3,1 118 | 116,24,6.4375,0,3,0 119 | 117,1,16.7,1,3,1 120 | 118,36,75.2417,0,1,0 121 | 119,29,26,0,2,1 122 | 120,12,15.75,0,2,0 123 | 121,24,7.75,0,3,1 124 | 122,35,57.75,0,1,1 125 | 123,28,7.25,0,3,0 126 | 124,24,7.75,0,3,0 127 | 125,17,16.1,1,3,0 128 | 126,22,7.7958,0,3,0 129 
| 127,24,23.25,0,3,2 130 | 128,42,13,0,2,0 131 | 129,24,8.05,0,3,0 132 | 130,32,8.05,0,3,0 133 | 131,53,28.5,0,1,0 134 | 132,24,25.4667,4,3,0 135 | 133,24,6.4375,0,3,1 136 | 134,43,7.8958,0,3,0 137 | 135,24,7.8542,0,3,0 138 | 136,26.5,7.225,0,3,0 139 | 137,26,13,0,2,0 140 | 138,23,8.05,0,3,0 141 | 139,40,46.9,6,3,1 142 | 140,10,46.9,2,3,5 143 | 141,33,151.55,0,1,0 144 | 142,61,262.375,3,1,1 145 | 143,28,26,0,2,0 146 | 144,42,26.55,0,1,0 147 | 145,31,18,0,3,3 148 | 146,24,51.8625,0,1,0 149 | 147,22,8.05,0,3,0 150 | 148,24,26.55,0,1,0 151 | 149,30,26,1,2,1 152 | 150,23,83.1583,1,1,0 153 | 151,24,7.8958,0,3,0 154 | 152,60.5,24,0,3,0 155 | 153,36,12.1833,2,3,0 156 | 154,13,31.3875,2,3,4 157 | 155,24,7.55,0,3,0 158 | 156,29,221.7792,0,1,0 159 | 157,23,7.8542,0,3,0 160 | 158,42,26.55,0,1,0 161 | 159,26,13.775,2,3,0 162 | 160,24,7.7333,0,3,0 163 | 161,7,15.2458,1,3,1 164 | 162,26,13.5,0,2,0 165 | 163,24,7,0,3,0 166 | 164,41,13,0,2,0 167 | 165,26,22.025,1,3,1 168 | 166,48,50.4958,0,1,0 169 | 167,18,34.375,2,3,2 170 | 168,24,27.7208,0,1,0 171 | 169,22,8.9625,0,3,0 172 | 170,24,7.55,0,3,0 173 | 171,27,7.225,0,3,0 174 | 172,23,13.9,0,3,1 175 | 173,24,7.2292,0,3,0 176 | 174,40,31.3875,5,3,1 177 | 175,15,39,2,2,0 178 | 176,20,36.75,0,2,0 179 | 177,54,55.4417,0,1,1 180 | 178,36,39,3,2,0 181 | 179,64,83.1583,2,1,0 182 | 180,30,13,0,2,0 183 | 181,37,83.1583,1,1,1 184 | 182,18,53.1,0,1,1 185 | 183,24,7.75,0,3,0 186 | 184,27,247.5208,1,1,1 187 | 185,40,16,0,2,0 188 | 186,21,21,1,2,0 189 | 187,17,8.05,0,3,2 190 | 188,24,69.55,2,3,8 191 | 189,40,13,0,2,0 192 | 190,34,26,0,2,1 193 | 191,24,26,0,1,0 194 | 192,11.5,14.5,1,3,1 195 | 193,61,12.35,0,2,0 196 | 194,8,32.5,2,2,0 197 | 195,33,7.8542,0,3,0 198 | 196,6,134.5,2,1,0 199 | 197,18,7.775,0,3,0 200 | 198,23,10.5,0,2,0 201 | 199,24,8.1125,0,3,0 202 | 200,24,15.5,0,3,0 203 | 201,0.33,14.4,2,3,0 204 | 202,47,227.525,0,1,1 205 | 203,8,26,1,2,1 206 | 204,25,10.5,0,2,0 207 | 205,24,25.7417,0,1,0 208 | 206,35,7.75,0,3,0 209 | 207,24,10.5,0,2,0 210 | 208,33,27.7208,0,1,0 211 | 209,25,7.8958,0,3,0 212 | 210,32,22.525,0,3,0 213 | 211,24,7.05,0,3,0 214 | 212,17,73.5,0,2,0 215 | 213,60,26,0,2,1 216 | 214,38,7.775,2,3,4 217 | 215,42,42.5,0,1,0 218 | 216,24,7.8792,0,3,0 219 | 217,57,164.8667,1,1,1 220 | 218,50,211.5,1,1,1 221 | 219,24,8.05,0,3,0 222 | 220,30,13.8583,0,2,1 223 | 221,21,8.05,0,3,0 224 | 222,22,10.5,0,2,0 225 | 223,21,7.7958,0,3,0 226 | 224,53,27.4458,0,1,0 227 | 225,24,15.2458,2,3,0 228 | 226,23,7.7958,0,3,0 229 | 227,24,7.75,0,3,0 230 | 228,40.5,15.1,0,3,0 231 | 229,36,13,0,2,0 232 | 230,14,65,0,2,0 233 | 231,21,26.55,0,1,0 234 | 232,21,6.4958,0,3,1 235 | 233,24,7.8792,0,3,0 236 | 234,39,71.2833,0,1,1 237 | 235,20,7.8542,0,3,0 238 | 236,64,75.25,0,1,1 239 | 237,20,7.225,0,3,0 240 | 238,18,13,1,2,1 241 | 239,48,106.425,0,1,1 242 | 240,55,27.7208,0,1,0 243 | 241,45,30,2,2,0 244 | 242,45,134.5,1,1,1 245 | 243,24,7.8875,0,3,0 246 | 244,24,23.45,2,3,1 247 | 245,41,51.8625,0,1,1 248 | 246,22,21,0,2,0 249 | 247,42,32.5,1,2,1 250 | 248,29,26,0,2,1 251 | 249,24,14.4542,0,3,1 252 | 250,0.92,27.75,2,2,1 253 | 251,20,7.925,0,3,0 254 | 252,27,136.7792,0,1,1 255 | 253,24,9.325,0,3,0 256 | 254,32.5,9.5,0,3,0 257 | 255,24,7.55,0,3,0 258 | 256,24,7.75,0,3,0 259 | 257,28,8.05,0,3,0 260 | 258,19,13,0,2,0 261 | 259,21,7.775,0,3,0 262 | 260,36.5,17.4,0,3,1 263 | 261,21,7.8542,0,3,0 264 | 262,29,23,2,2,0 265 | 263,1,12.1833,1,3,1 266 | 264,30,12.7375,0,2,0 267 | 265,24,7.8958,0,3,0 268 | 266,24,0,0,1,0 269 | 267,24,7.55,0,3,0 270 | 268,24,8.05,0,3,0 271 | 269,17,8.6625,0,3,0 272 
| 270,46,75.2417,0,1,0 273 | 271,24,7.75,0,3,0 274 | 272,26,136.7792,0,1,1 275 | 273,24,15.5,0,3,1 276 | 274,24,7.225,0,3,0 277 | 275,20,26,0,2,1 278 | 276,28,10.5,0,2,0 279 | 277,40,26,0,2,1 280 | 278,30,21,0,2,1 281 | 279,22,10.5,0,2,0 282 | 280,23,8.6625,0,3,0 283 | 281,0.75,13.775,1,3,1 284 | 282,24,7.75,0,3,0 285 | 283,9,15.2458,1,3,1 286 | 284,2,20.2125,1,3,1 287 | 285,36,7.25,0,3,0 288 | 286,24,7.25,0,3,0 289 | 287,24,82.2667,0,1,1 290 | 288,24,7.2292,0,3,0 291 | 289,24,8.05,0,3,0 292 | 290,24,39.6,0,1,0 293 | 291,30,6.95,0,3,0 294 | 292,24,7.2292,0,3,0 295 | 293,53,81.8583,1,1,1 296 | 294,36,9.5,0,3,0 297 | 295,26,7.8958,0,3,0 298 | 296,1,41.5792,2,2,1 299 | 297,24,21.6792,0,3,2 300 | 298,30,45.5,0,1,0 301 | 299,29,7.8542,0,3,0 302 | 300,32,7.775,0,3,0 303 | 301,24,15.0458,0,2,0 304 | 302,43,21,1,2,0 305 | 303,24,8.6625,0,3,0 306 | 304,24,7.75,0,3,0 307 | 305,64,26.55,1,1,1 308 | 306,30,151.55,2,1,1 309 | 307,0.83,9.35,1,3,0 310 | 308,55,93.5,1,1,1 311 | 309,45,14.1083,0,3,1 312 | 310,18,8.6625,0,3,0 313 | 311,22,7.225,0,3,0 314 | 312,24,7.575,0,3,0 315 | 313,37,7.75,0,3,0 316 | 314,55,135.6333,0,1,0 317 | 315,17,7.7333,0,3,0 318 | 316,57,146.5208,0,1,1 319 | 317,19,10.5,0,2,0 320 | 318,27,7.8542,0,3,0 321 | 319,22,31.5,0,2,2 322 | 320,26,7.775,0,3,0 323 | 321,25,7.2292,0,3,0 324 | 322,26,13,0,2,0 325 | 323,33,26.55,0,1,0 326 | 324,39,211.3375,0,1,0 327 | 325,23,7.05,0,3,0 328 | 326,12,39,1,2,2 329 | 327,46,79.2,0,1,0 330 | 328,29,26,0,2,1 331 | 329,21,13,0,2,0 332 | 330,48,36.75,2,2,0 333 | 331,39,29.7,0,1,0 334 | 332,24,7.225,0,3,0 335 | 333,19,15.7417,1,3,1 336 | 334,27,7.8958,0,3,0 337 | 335,30,26,0,1,0 338 | 336,32,13,0,2,0 339 | 337,39,7.2292,2,3,0 340 | 338,25,31.5,0,2,0 341 | 339,24,7.2292,0,3,0 342 | 340,18,10.5,0,2,0 343 | 341,32,7.5792,0,3,0 344 | 342,24,69.55,9,3,1 345 | 343,58,512.3292,1,1,0 346 | 344,24,14.5,1,3,1 347 | 345,16,7.65,0,3,0 348 | 346,26,13,0,2,0 349 | 347,38,7.2292,0,3,0 350 | 348,24,13.5,0,2,0 351 | 349,31,21,0,2,0 352 | 350,45,63.3583,1,1,0 353 | 351,25,10.5,0,2,0 354 | 352,18,73.5,0,2,0 355 | 353,49,65,2,2,1 356 | 354,0.17,20.575,2,3,1 357 | 355,50,26,0,1,0 358 | 356,59,51.4792,0,1,2 359 | 357,24,7.8792,0,3,0 360 | 358,24,7.75,0,3,0 361 | 359,30,15.55,0,3,1 362 | 360,14.5,69.55,2,3,8 363 | 361,24,37.0042,1,2,1 364 | 362,31,21,0,2,0 365 | 363,27,8.6625,0,3,0 366 | 364,25,55.4417,0,1,1 367 | 365,24,69.55,9,3,1 368 | 366,24,14.4583,0,3,1 369 | 367,22,39.6875,0,3,0 370 | 368,45,59.4,1,1,0 371 | 369,29,13.8583,0,2,0 372 | 370,21,11.5,0,2,1 373 | 371,31,134.5,0,1,0 374 | 372,49,0,0,1,0 375 | 373,44,13,0,2,0 376 | 374,54,81.8583,1,1,1 377 | 375,45,262.375,0,1,0 378 | 376,22,8.6625,0,3,2 379 | 377,21,11.5,0,2,0 380 | 378,55,50,0,1,0 381 | 379,5,31.3875,2,3,4 382 | 380,24,7.75,0,3,0 383 | 381,26,7.8792,0,3,0 384 | 382,24,14.5,0,3,0 385 | 383,19,16.1,0,3,1 386 | 384,24,12.875,0,2,0 387 | 385,24,65,2,2,1 388 | 386,24,7.775,0,3,0 389 | 387,57,13,0,2,0 390 | 388,21,7.75,0,3,0 391 | 389,6,21.075,1,3,3 392 | 390,23,93.5,0,1,0 393 | 391,51,39.4,1,1,0 394 | 392,13,20.25,2,3,0 395 | 393,47,10.5,0,2,0 396 | 394,29,22.025,1,3,3 397 | 395,18,60,0,1,1 398 | 396,24,7.25,0,3,0 399 | 397,48,79.2,1,1,1 400 | 398,22,7.775,0,3,0 401 | 399,31,7.7333,0,3,0 402 | 400,30,164.8667,0,1,0 403 | 401,38,21,0,2,1 404 | 402,22,59.4,1,1,0 405 | 403,17,47.1,0,1,0 406 | 404,43,27.7208,0,1,1 407 | 405,20,13.8625,0,2,0 408 | 406,23,10.5,0,2,1 409 | 407,50,211.5,1,1,1 410 | 408,24,7.7208,0,3,0 411 | 409,3,13.775,1,3,1 412 | 410,24,7.75,0,3,0 413 | 411,37,90,0,1,1 414 | 412,28,7.775,0,3,0 415 | 
413,24,8.05,0,3,0 416 | 414,39,108.9,0,1,0 417 | 415,38.5,7.25,0,3,0 418 | 416,24,8.05,0,3,0 419 | 417,24,22.3583,1,3,1 420 | -------------------------------------------------------------------------------- /tests/data_for_tests/train.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/data_for_tests/train.h5 -------------------------------------------------------------------------------- /tests/data_for_tests/train.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/data_for_tests/train.xls -------------------------------------------------------------------------------- /tests/test_categorical_encoder.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.encoding.categorical_encoder module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.encoding.categorical_encoder import Categorical_encoder 11 | 12 | 13 | def test_init_encoder(): 14 | """Test init method of Categorical_encoder class.""" 15 | encoder = Categorical_encoder() 16 | assert encoder.strategy == "label_encoding" 17 | assert not (encoder.verbose) 18 | assert encoder._Categorical_encoder__Lcat == [] 19 | assert encoder._Categorical_encoder__Lnum == [] 20 | assert encoder._Categorical_encoder__Enc == dict() 21 | assert encoder._Categorical_encoder__K == dict() 22 | assert not encoder._Categorical_encoder__weights 23 | assert not encoder._Categorical_encoder__fitOK 24 | 25 | 26 | def test_get_params_encoder(): 27 | """Test get_params method of Categorical_encoder class.""" 28 | encoder = Categorical_encoder() 29 | dict = {'strategy': "label_encoding", 30 | 'verbose': False} 31 | assert encoder.get_params() == dict 32 | 33 | 34 | def test_set_params_encoder(): 35 | """Test set_params method of Categorical_encoder class.""" 36 | encoder = Categorical_encoder() 37 | encoder.set_params(strategy="label_encoding") 38 | assert encoder.strategy == "label_encoding" 39 | encoder.set_params(strategy="dummification") 40 | assert encoder.strategy == "dummification" 41 | encoder.set_params(strategy="random_projection") 42 | assert encoder.strategy == "random_projection" 43 | encoder.set_params(strategy="entity_embedding") 44 | assert encoder.strategy == "entity_embedding" 45 | encoder.set_params(verbose=True) 46 | assert encoder.verbose 47 | encoder.set_params(verbose=False) 48 | assert not encoder.verbose 49 | with pytest.warns(UserWarning) as record: 50 | encoder.set_params(_Categorical_encoder__Lcat=[]) 51 | assert len(record) == 1 52 | 53 | 54 | def test_fit_encoder(): 55 | """Test method fit of Categorical_encoder class.""" 56 | df = pd.read_csv("data_for_tests/train.csv") 57 | encoder = Categorical_encoder(strategy="wrong_strategy") 58 | with pytest.raises(ValueError): 59 | encoder.fit(df, df["Survived"]) 60 | encoder.set_params(strategy="label_encoding") 61 | encoder.fit(df, df["Survived"]) 62 | assert encoder._Categorical_encoder__fitOK 63 | encoder.set_params(strategy="dummification") 64 | encoder.fit(df, df["Survived"]) 65 | assert encoder._Categorical_encoder__fitOK 66 | encoder.set_params(strategy="random_projection") 67 | encoder.fit(df, df["Survived"]) 68 | assert 
encoder._Categorical_encoder__fitOK 69 | encoder.set_params(strategy="entity_embedding") 70 | encoder.fit(df, df["Survived"]) 71 | assert encoder._Categorical_encoder__fitOK 72 | 73 | 74 | def test_transform_encoder(): 75 | """Test transform method of Categorical_encoder class.""" 76 | df = pd.read_csv("data_for_tests/train.csv") 77 | encoder = Categorical_encoder() 78 | with pytest.raises(ValueError): 79 | encoder.transform(df) 80 | encoder.fit(df, df["Survived"]) 81 | df_encoded = encoder.transform(df) 82 | assert (df.columns == df_encoded.columns).all() 83 | encoder.set_params(strategy="dummification") 84 | encoder.fit(df, df["Survived"]) 85 | df_encoded = encoder.transform(df) 86 | assert (type(df_encoded) == pd.SparseDataFrame) | (type(df_encoded) == pd.DataFrame) 87 | encoder.set_params(strategy="random_projection") 88 | encoder.fit(df, df["Survived"]) 89 | df_encoded = encoder.transform(df) 90 | assert type(df_encoded) == pd.DataFrame 91 | encoder.set_params(strategy="entity_embedding") 92 | encoder.fit(df, df["Survived"]) 93 | df_encoded = encoder.transform(df) 94 | assert type(df_encoded) == pd.DataFrame 95 | 96 | -------------------------------------------------------------------------------- /tests/test_classification_feature_selector.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.classification.feature_selector module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.model.classification.feature_selector import Clf_feature_selector 11 | 12 | 13 | def test_init_Clf_feature_selector(): 14 | """Test init method of Clf_feature_selector class.""" 15 | feature_selector = Clf_feature_selector() 16 | assert feature_selector.strategy == "l1" 17 | assert feature_selector.threshold == 0.3 18 | assert not feature_selector._Clf_feature_selector__fitOK 19 | assert feature_selector._Clf_feature_selector__to_discard == [] 20 | 21 | 22 | def test_get_params_Clf_feature_selector(): 23 | """Test get_params method of Clf_feature_selector class.""" 24 | feature_selector = Clf_feature_selector() 25 | dict = {'strategy': "l1", 26 | 'threshold': 0.3} 27 | assert feature_selector.get_params() == dict 28 | 29 | 30 | def test_set_params_Clf_feature_selector(): 31 | """Test set_params method of Clf_feature_selector class.""" 32 | feature_selector = Clf_feature_selector() 33 | feature_selector.set_params(strategy="variance") 34 | assert feature_selector.strategy == "variance" 35 | feature_selector.set_params(threshold=0.2) 36 | assert feature_selector.threshold == 0.2 37 | with pytest.warns(UserWarning) as record: 38 | feature_selector.set_params(wrong_strategy="wrong_strategy") 39 | assert len(record) == 1 40 | 41 | 42 | def test_fit_Clf_feature_selector(): 43 | """Test fit method of Clf_feature_selector class.""" 44 | feature_selector = Clf_feature_selector() 45 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 46 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 47 | with pytest.raises(ValueError): 48 | feature_selector.fit(None, y_train) 49 | with pytest.raises(ValueError): 50 | feature_selector.fit(df_train, None) 51 | feature_selector.fit(df_train, y_train) 52 | assert feature_selector._Clf_feature_selector__fitOK 53 | feature_selector.set_params(strategy="variance") 54 | feature_selector.fit(df_train, y_train) 55 | assert 
feature_selector._Clf_feature_selector__fitOK 56 | feature_selector.set_params(strategy="rf_feature_importance") 57 | feature_selector.fit(df_train, y_train) 58 | assert feature_selector._Clf_feature_selector__fitOK 59 | feature_selector.set_params(strategy="wrong_strategy") 60 | with pytest.raises(ValueError): 61 | feature_selector.fit(df_train, y_train) 62 | 63 | 64 | def test_transform_Clf_feature_selector(): 65 | """Test transform method of Clf_feature_selector class.""" 66 | feature_selector = Clf_feature_selector(threshold=0) 67 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 68 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 69 | with pytest.raises(ValueError): 70 | feature_selector.transform(df_train) 71 | feature_selector.fit(df_train, y_train) 72 | with pytest.raises(ValueError): 73 | feature_selector.transform(None) 74 | df_transformed = feature_selector.transform(df_train) 75 | assert (df_transformed.columns == df_train.columns).all() 76 | 77 | 78 | def test_fit_transform_Clf_feature_selector(): 79 | """Test fit_transform method of Clf_feature_selector class.""" 80 | feature_selector = Clf_feature_selector(threshold=0) 81 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 82 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 83 | df_transformed = feature_selector.fit_transform(df_train, y_train) 84 | assert (df_transformed.columns == df_train.columns).all() 85 | -------------------------------------------------------------------------------- /tests/test_classifier.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.classification.classifier module.""" 7 | import pytest 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from mlbox.model.classification.classifier import Classifier 12 | from lightgbm import LGBMClassifier 13 | 14 | 15 | def test_init_classifier(): 16 | """Test init method of Classifier class.""" 17 | classifier = Classifier() 18 | assert classifier._Classifier__strategy == "LightGBM" 19 | assert classifier._Classifier__classif_params == {} 20 | assert classifier._Classifier__classifier 21 | assert not classifier._Classifier__col 22 | assert not classifier._Classifier__fitOK 23 | 24 | 25 | def test_get_params_classifier(): 26 | """Test get_params method of Classifier class.""" 27 | classifier = Classifier() 28 | params = classifier.get_params() 29 | assert params == {'strategy': "LightGBM"} 30 | assert not classifier._Classifier__classif_params 31 | 32 | 33 | def test_set_params_classifier(): 34 | """Test set_params method of Classifier class.""" 35 | classifier = Classifier() 36 | classifier.set_params(strategy="LightGBM") 37 | assert classifier._Classifier__strategy == "LightGBM" 38 | classifier.set_params(strategy="RandomForest") 39 | assert classifier._Classifier__strategy == "RandomForest" 40 | classifier.set_params(strategy="ExtraTrees") 41 | assert classifier._Classifier__strategy == "ExtraTrees" 42 | classifier.set_params(strategy="RandomForest") 43 | assert classifier._Classifier__strategy == "RandomForest" 44 | classifier.set_params(strategy="Tree") 45 | assert classifier._Classifier__strategy == "Tree" 46 | classifier.set_params(strategy="AdaBoost") 47 | assert classifier._Classifier__strategy == "AdaBoost" 48 | classifier.set_params(strategy="Linear") 49 | assert
classifier._Classifier__strategy == "Linear" 50 | with pytest.warns(UserWarning) as record: 51 | classifier.set_params(wrong_strategy="wrong_strategy") 52 | assert len(record) == 1 53 | 54 | 55 | def test_set_classifier(): 56 | """Test set method of Classifier class.""" 57 | classifier = Classifier() 58 | with pytest.raises(ValueError): 59 | classifier._Classifier__set_classifier("wrong_strategy") 60 | 61 | 62 | def test_fit_classifier(): 63 | """Test fit method of Classifier class.""" 64 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 65 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 66 | classifier = Classifier() 67 | classifier.fit(df_train, y_train) 68 | assert np.all(classifier._Classifier__col == df_train.columns) 69 | assert classifier._Classifier__fitOK 70 | 71 | 72 | def test_feature_importances_classifier(): 73 | """Test feature_importances method of Classifier class.""" 74 | classifier = Classifier() 75 | with pytest.raises(ValueError): 76 | classifier.feature_importances() 77 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 78 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 79 | classifier.set_params(strategy="LightGBM") 80 | classifier.fit(df_train, y_train) 81 | importance = classifier.feature_importances() 82 | assert importance != {} 83 | classifier.set_params(strategy="Linear") 84 | classifier.fit(df_train, y_train) 85 | importance = classifier.feature_importances() 86 | assert importance != {} 87 | classifier.set_params(strategy="RandomForest") 88 | classifier.fit(df_train, y_train) 89 | importance = classifier.feature_importances() 90 | assert importance != {} 91 | classifier.set_params(strategy="AdaBoost") 92 | classifier.fit(df_train, y_train) 93 | importance = classifier.feature_importances() 94 | assert importance != {} 95 | classifier.set_params(strategy="Bagging") 96 | classifier.fit(df_train, y_train) 97 | importance = classifier.feature_importances() 98 | assert importance != {} 99 | 100 | 101 | def test_predict_classifier(): 102 | """Test predict method of Classifier class.""" 103 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 104 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 105 | classifier = Classifier() 106 | with pytest.raises(ValueError): 107 | classifier.predict(df_train) 108 | classifier.fit(df_train, y_train) 109 | with pytest.raises(ValueError): 110 | classifier.predict(None) 111 | assert len(classifier.predict(df_train)) > 0 112 | 113 | 114 | def test_predict_log_proba_classifier(): 115 | """Test predict_log_proba method of Classifier class.""" 116 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 117 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 118 | classifier = Classifier(strategy="Linear") 119 | with pytest.raises(ValueError): 120 | classifier.predict_log_proba(df_train) 121 | classifier.fit(df_train, y_train) 122 | with pytest.raises(ValueError): 123 | classifier.predict_log_proba(None) 124 | assert len(classifier.predict_log_proba(df_train)) > 0 125 | 126 | 127 | def test_predict_proba_classifier(): 128 | """Test predict_proba method of Classifier class.""" 129 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 130 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 131 | classifier = Classifier() 132 | with pytest.raises(ValueError): 133 | classifier.predict_proba(df_train) 134 | classifier.fit(df_train, y_train) 135 | with pytest.raises(ValueError): 136 | 
classifier.predict_proba(None) 137 | assert len(classifier.predict_proba(df_train)) > 0 138 | 139 | 140 | def test_score_classifier(): 141 | """Test score method of Classifier class.""" 142 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 143 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 144 | classifier = Classifier() 145 | with pytest.raises(ValueError): 146 | classifier.score(df_train, y_train) 147 | classifier.fit(df_train, y_train) 148 | with pytest.raises(ValueError): 149 | classifier.score(None, y_train) 150 | with pytest.raises(ValueError): 151 | classifier.score(df_train, None) 152 | assert classifier.score(df_train, y_train) > 0 153 | 154 | 155 | def test_get_estimator_classifier(): 156 | """Test get_estimator method of Classifier class.""" 157 | classifier = Classifier() 158 | estimator = classifier.get_estimator() 159 | assert isinstance(estimator, type(LGBMClassifier())) 160 | -------------------------------------------------------------------------------- /tests/test_drift_estimator.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.drift.drift_estimator module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.preprocessing.drift.drift_estimator import DriftEstimator 11 | 12 | 13 | def test_init_drift_estimator(): 14 | """Test init method of DriftEstimator class.""" 15 | drift_estimator = DriftEstimator() 16 | assert drift_estimator.n_folds == 2 17 | assert drift_estimator.stratify 18 | assert drift_estimator.random_state == 1 19 | assert not drift_estimator._DriftEstimator__cv 20 | assert not drift_estimator._DriftEstimator__pred 21 | assert not drift_estimator._DriftEstimator__target 22 | assert not drift_estimator._DriftEstimator__fitOK 23 | 24 | 25 | def test_get_params_drift_estimator(): 26 | """Test get_params method of DriftEstimator class.""" 27 | drift_estimator = DriftEstimator() 28 | dict = {'estimator': drift_estimator.estimator, 29 | 'n_folds': 2, 30 | 'stratify': True, 31 | 'random_state': 1} 32 | assert drift_estimator.get_params() == dict 33 | 34 | 35 | def test_set_params_drift_estimator(): 36 | """Test set_params method of DriftEstimator class.""" 37 | drift_estimator = DriftEstimator() 38 | dict = {'estimator': drift_estimator.estimator, 39 | 'n_folds': 3, 40 | 'stratify': False, 41 | 'random_state': 2} 42 | drift_estimator.set_params(**dict) 43 | assert drift_estimator.get_params() == dict 44 | 45 | 46 | def test_fit_drift_estimator(): 47 | """Test fit method of DriftEstimator class.""" 48 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 49 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 50 | drift_estimator = DriftEstimator() 51 | drift_estimator.fit(df_train, df_test) 52 | assert drift_estimator._DriftEstimator__fitOK 53 | 54 | 55 | def test_score_drift_estimator(): 56 | """Test score method of DriftEstimator class.""" 57 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 58 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 59 | drift_estimator = DriftEstimator() 60 | with pytest.raises(ValueError): 61 | drift_estimator.score() 62 | drift_estimator.fit(df_train, df_test) 63 | assert drift_estimator.score() > 0 64 | 65 | 66 | def test_predict_drift_estimator(): 67 | """Test predict method of DriftEstimator class.""" 68 | df_train = 
pd.read_csv("data_for_tests/clean_train.csv") 69 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 70 | drift_estimator = DriftEstimator() 71 | with pytest.raises(ValueError): 72 | drift_estimator.predict() 73 | drift_estimator.fit(df_train, df_test) 74 | results = drift_estimator.predict() 75 | assert len(results) == 1309 76 | -------------------------------------------------------------------------------- /tests/test_drift_threshold.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.drift module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.preprocessing.drift import DriftThreshold 11 | from mlbox.preprocessing.drift import sync_fit 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | 15 | 16 | def test_init_drift_threshold(): 17 | """Test init method of DriftThreshold class.""" 18 | drift_threshold = DriftThreshold() 19 | assert drift_threshold.threshold == 0.6 20 | assert drift_threshold.subsample == 1. 21 | assert isinstance(drift_threshold.estimator, 22 | type(DecisionTreeClassifier())) 23 | assert drift_threshold.n_folds == 2 24 | assert drift_threshold.stratify 25 | assert drift_threshold.random_state == 1 26 | assert drift_threshold.n_jobs == -1 27 | assert not drift_threshold._DriftThreshold__fitOK 28 | 29 | 30 | def test_get_params_drift_threshold(): 31 | """Test get_params method of DriftThreshold class.""" 32 | drift_threshold = DriftThreshold() 33 | dict = {'threshold': 0.6, 34 | 'subsample': 1., 35 | 'n_folds': 2, 36 | 'stratify': True, 37 | 'random_state': 1, 38 | 'n_jobs': -1} 39 | dict_get_params = drift_threshold.get_params() 40 | assert dict_get_params["threshold"] == dict["threshold"] 41 | assert dict_get_params["subsample"] == dict["subsample"] 42 | assert dict_get_params["n_folds"] == dict["n_folds"] 43 | assert dict_get_params["stratify"] == dict["stratify"] 44 | assert dict_get_params["random_state"] == dict["random_state"] 45 | assert dict_get_params["n_jobs"] == dict["n_jobs"] 46 | 47 | 48 | def test_set_params_drift_threshold(): 49 | """Test set_params method of DriftThreshold class.""" 50 | drift_threshold = DriftThreshold() 51 | dict = {'threshold': 0.6, 52 | 'subsample': 1., 53 | 'estimator': DecisionTreeClassifier(max_depth=6), 54 | 'n_folds': 2, 55 | 'stratify': True, 56 | 'random_state': 1, 57 | 'n_jobs': -1} 58 | drift_threshold.set_params(**dict) 59 | dict_get_params = drift_threshold.get_params() 60 | assert dict_get_params["threshold"] == dict["threshold"] 61 | assert dict_get_params["subsample"] == dict["subsample"] 62 | assert dict_get_params["n_folds"] == dict["n_folds"] 63 | assert dict_get_params["stratify"] == dict["stratify"] 64 | assert dict_get_params["random_state"] == dict["random_state"] 65 | assert dict_get_params["n_jobs"] == dict["n_jobs"] 66 | 67 | 68 | def test_fit_drift_threshold(): 69 | """Test fit method of DriftThreshold class.""" 70 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 71 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 72 | drift_threshold = DriftThreshold() 73 | drift_threshold.fit(df_train, df_test) 74 | assert drift_threshold._DriftThreshold__fitOK 75 | 76 | 77 | def test_transform_drift_threshold(): 78 | """Test transform method of DriftThreshold class.""" 79 | df_train = 
pd.read_csv("data_for_tests/clean_train.csv") 80 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 81 | drift_threshold = DriftThreshold() 82 | with pytest.raises(ValueError): 83 | drift_threshold.transform(df_train) 84 | drift_threshold.fit(df_train, df_test) 85 | df_transformed = drift_threshold.transform(df_train) 86 | assert (df_train.columns == df_transformed.columns).all() 87 | 88 | 89 | def test_get_support_drift_threshold(): 90 | """Test get_support method of DriftThreshold class.""" 91 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 92 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 93 | drift_threshold = DriftThreshold() 94 | with pytest.raises(ValueError): 95 | drift_threshold.get_support() 96 | drift_threshold.fit(df_train, df_test) 97 | keep_list = drift_threshold.get_support() 98 | drop_list = drift_threshold.get_support(complement=True) 99 | for name in ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']: 100 | assert (name in keep_list) 101 | assert not drop_list 102 | 103 | 104 | def test_drifts_drift_threshold(): 105 | """Test drifts method of DriftThreshold class.""" 106 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 107 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 108 | drift_threshold = DriftThreshold() 109 | with pytest.raises(ValueError): 110 | drift_threshold.drifts() 111 | drift_threshold.fit(df_train, df_test) 112 | drifts = drift_threshold.drifts() 113 | for name in ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']: 114 | assert (name in list(drifts.keys())) 115 | 116 | 117 | def test_sync_fit_drift_threshold(): 118 | """Test method sync_fit of drift_threshold module.""" 119 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 120 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 121 | estimator = RandomForestClassifier(n_estimators=50, 122 | n_jobs=-1, 123 | max_features=1., 124 | min_samples_leaf=5, 125 | max_depth=5) 126 | 127 | score = sync_fit(df_train, df_test, estimator) 128 | assert 0 <= score 129 | -------------------------------------------------------------------------------- /tests/test_drift_thresholder.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.drift_thresholder module.""" 7 | import pytest 8 | 9 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder 10 | from mlbox.preprocessing.reader import Reader 11 | 12 | 13 | def test_init_drift_thresholder(): 14 | """Test init method of Drift_thresholder class.""" 15 | drift_thresholder = Drift_thresholder() 16 | assert drift_thresholder.threshold == 0.6 17 | assert not drift_thresholder.inplace 18 | assert drift_thresholder.verbose 19 | assert drift_thresholder.to_path == "save" 20 | assert drift_thresholder._Drift_thresholder__Ddrifts == {} 21 | assert not drift_thresholder._Drift_thresholder__fitOK 22 | 23 | 24 | def test_fit_transform(): 25 | """Test fit transform method of Drift_thresholder class.""" 26 | drift_thresholder = Drift_thresholder() 27 | reader = Reader(sep=",") 28 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"], 29 | target_name="Survived") 30 | drift_thresholder.fit_transform(dict) 31 | assert not drift_thresholder._Drift_thresholder__fitOK 32 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 33 | "data_for_tests/test.csv"], 34 | target_name="Survived") 35 | 
drift_thresholder.fit_transform(dict) 36 | assert drift_thresholder._Drift_thresholder__fitOK 37 | dict = reader.train_test_split(Lpath=["data_for_tests/inplace_train.csv", 38 | "data_for_tests/inplace_test.csv"], 39 | target_name="Survived") 40 | drift_thresholder.inplace = True 41 | drift_thresholder.fit_transform(dict) 42 | assert drift_thresholder._Drift_thresholder__fitOK 43 | 44 | 45 | def test_drifts(): 46 | """Test drifts method of Drift_thresholder class.""" 47 | drift_thresholder = Drift_thresholder() 48 | with pytest.raises(ValueError): 49 | drift_thresholder.drifts() 50 | reader = Reader(sep=",") 51 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 52 | "data_for_tests/test.csv"], 53 | target_name="Survived") 54 | drift_thresholder.fit_transform(dict) 55 | drifts = drift_thresholder.drifts() 56 | assert drifts != {} 57 | -------------------------------------------------------------------------------- /tests/test_na_encoder.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.encoding.na_encoder module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.encoding.na_encoder import NA_encoder 11 | 12 | 13 | def test_init_NA_encoder(): 14 | """Test init method of NA_encoder class.""" 15 | encoder = NA_encoder() 16 | assert encoder.numerical_strategy == "mean" 17 | assert encoder.categorical_strategy == "" 18 | assert encoder._NA_encoder__Lcat == [] 19 | assert encoder._NA_encoder__Lnum == [] 20 | assert not encoder._NA_encoder__imp 21 | assert encoder._NA_encoder__mode == dict() 22 | assert not encoder._NA_encoder__fitOK 23 | 24 | 25 | def test_get_params_NA_encoder(): 26 | """Test get_params method of NA_encoder class.""" 27 | encoder = NA_encoder() 28 | dict = {'numerical_strategy': "mean", 29 | 'categorical_strategy': ""} 30 | assert encoder.get_params() == dict 31 | 32 | 33 | def test_set_params_NA_encoder(): 34 | """Test set_params method of NA_encoder class.""" 35 | encoder = NA_encoder() 36 | 37 | encoder.set_params(numerical_strategy="mean") 38 | assert encoder.numerical_strategy == "mean" 39 | encoder.set_params(numerical_strategy="median") 40 | assert encoder.numerical_strategy == "median" 41 | encoder.set_params(numerical_strategy="most_frequent") 42 | assert encoder.numerical_strategy == "most_frequent" 43 | encoder.set_params(numerical_strategy=3.0) 44 | assert encoder.numerical_strategy == 3.0 45 | 46 | encoder.set_params(categorical_strategy="") 47 | assert encoder.categorical_strategy == "" 48 | encoder.set_params(categorical_strategy="most_frequent") 49 | assert encoder.categorical_strategy == "most_frequent" 50 | encoder.set_params(categorical_strategy="string_test") 51 | assert encoder.categorical_strategy == "string_test" 52 | 53 | with pytest.warns(UserWarning) as record: 54 | encoder.set_params(_Categorical_encoder__Lcat=[]) 55 | assert len(record) == 1 56 | 57 | 58 | def test_fit_NA_encoder(): 59 | """Test fit method of NA_encoder class.""" 60 | df = pd.read_csv("data_for_tests/train.csv") 61 | 62 | encoder = NA_encoder(numerical_strategy="wrong_strategy") 63 | with pytest.raises(ValueError): 64 | encoder.fit(df, df["Survived"]) 65 | encoder.set_params(numerical_strategy="mean") 66 | encoder.fit(df, df["Survived"]) 67 | assert encoder._NA_encoder__fitOK 68 | encoder.set_params(numerical_strategy="median") 69 | encoder.fit(df, df["Survived"]) 
70 | assert encoder._NA_encoder__fitOK 71 | encoder.set_params(numerical_strategy="most_frequent") 72 | encoder.fit(df, df["Survived"]) 73 | assert encoder._NA_encoder__fitOK 74 | encoder.set_params(numerical_strategy=3.0) 75 | encoder.fit(df, df["Survived"]) 76 | assert encoder._NA_encoder__fitOK 77 | 78 | encoder = NA_encoder(categorical_strategy=2) 79 | with pytest.raises(ValueError): 80 | encoder.fit(df, df["Survived"]) 81 | encoder.set_params(categorical_strategy="") 82 | encoder.fit(df, df["Survived"]) 83 | assert encoder._NA_encoder__fitOK 84 | encoder.set_params(categorical_strategy="most_frequent") 85 | encoder.fit(df, df["Survived"]) 86 | 87 | 88 | def test_transform_NA_encoder(): 89 | """Test transform method of NA_encoder class.""" 90 | df = pd.read_csv("data_for_tests/train.csv") 91 | encoder = NA_encoder() 92 | with pytest.raises(ValueError): 93 | encoder.transform(df) 94 | encoder.fit(df, df["Survived"]) 95 | df_encoded = encoder.transform(df) 96 | assert (df.columns == df_encoded.columns).all() 97 | -------------------------------------------------------------------------------- /tests/test_optimiser.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.optimisation.optimiser module.""" 7 | import pytest 8 | import numpy as np 9 | 10 | from mlbox.optimisation.optimiser import Optimiser 11 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder 12 | from mlbox.preprocessing.reader import Reader 13 | from mlbox.optimisation import make_scorer 14 | 15 | 16 | def test_init_optimiser(): 17 | """Test init method of Optimiser class.""" 18 | with pytest.warns(UserWarning) as record: 19 | optimiser = Optimiser() 20 | assert len(record) == 1 21 | assert not optimiser.scoring 22 | assert optimiser.n_folds == 2 23 | assert optimiser.random_state == 1 24 | assert optimiser.to_path == "save" 25 | assert optimiser.verbose 26 | 27 | 28 | def test_get_params_optimiser(): 29 | """Test get_params method of Optimiser class.""" 30 | with pytest.warns(UserWarning) as record: 31 | optimiser = Optimiser() 32 | assert len(record) == 1 33 | dict = {'scoring': None, 34 | 'n_folds': 2, 35 | 'random_state': 1, 36 | 'to_path': "save", 37 | 'verbose': True} 38 | assert optimiser.get_params() == dict 39 | 40 | 41 | def test_set_params_optimiser(): 42 | """Test set_params method of Optimiser class.""" 43 | with pytest.warns(UserWarning) as record: 44 | optimiser = Optimiser() 45 | assert len(record) == 1 46 | optimiser.set_params(scoring='accuracy') 47 | assert optimiser.scoring == 'accuracy' 48 | optimiser.set_params(n_folds=3) 49 | assert optimiser.n_folds == 3 50 | optimiser.set_params(random_state=2) 51 | assert optimiser.random_state == 2 52 | optimiser.set_params(to_path="name") 53 | assert optimiser.to_path == "name" 54 | optimiser.set_params(verbose=False) 55 | assert not optimiser.verbose 56 | with pytest.warns(UserWarning) as record: 57 | optimiser.set_params(wrong_key=3) 58 | assert len(record) == 1 59 | 60 | 61 | def test_evaluate_classification_optimiser(): 62 | """Test evaluate method of Optimiser class for classification.""" 63 | reader = Reader(sep=",") 64 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 65 | "data_for_tests/test.csv"], 66 | target_name="Survived") 67 | drift_thresholder = Drift_thresholder() 68 | drift_thresholder = drift_thresholder.fit_transform(dict) 69 | 
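    # Optimiser() emits a single UserWarning at construction (captured via
    # `record` below); evaluate(None, dict) then cross-validates the default
    # pipeline, so the score only has to be a real number, or to lie within
    # [0, 1] when scoring="roc_auc".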
70 | with pytest.warns(UserWarning) as record: 71 | opt = Optimiser(scoring=None, n_folds=3) 72 | assert len(record) == 1 73 | score = opt.evaluate(None, dict) 74 | assert -np.Inf <= score 75 | 76 | with pytest.warns(UserWarning) as record: 77 | opt = Optimiser(scoring="roc_auc", n_folds=3) 78 | assert len(record) == 1 79 | score = opt.evaluate(None, dict) 80 | assert 0. <= score <= 1. 81 | 82 | with pytest.warns(UserWarning) as record: 83 | opt = Optimiser(scoring="wrong_scoring", n_folds=3) 84 | assert len(record) == 1 85 | with pytest.warns(UserWarning) as record: 86 | score = opt.evaluate(None, dict) 87 | assert opt.scoring == "neg_log_loss" 88 | 89 | 90 | def test_evaluate_regression_optimiser(): 91 | """Test evaluate method of Optimiser class for regression.""" 92 | reader = Reader(sep=",") 93 | dict = reader.train_test_split(Lpath=["data_for_tests/train_regression.csv", 94 | "data_for_tests/test_regression.csv"], 95 | target_name="SalePrice") 96 | drift_thresholder = Drift_thresholder() 97 | drift_thresholder = drift_thresholder.fit_transform(dict) 98 | 99 | mape = make_scorer(lambda y_true, 100 | y_pred: 100*np.sum( 101 | np.abs(y_true-y_pred)/y_true 102 | )/len(y_true), 103 | greater_is_better=False, 104 | needs_proba=False) 105 | with pytest.warns(UserWarning) as record: 106 | opt = Optimiser(scoring=mape, n_folds=3) 107 | assert len(record) == 1 108 | score = opt.evaluate(None, dict) 109 | assert -np.Inf <= score 110 | 111 | with pytest.warns(UserWarning) as record: 112 | opt = Optimiser(scoring=None, n_folds=3) 113 | assert len(record) == 1 114 | score = opt.evaluate(None, dict) 115 | assert -np.Inf <= score 116 | 117 | with pytest.warns(UserWarning) as record: 118 | opt = Optimiser(scoring="wrong_scoring", n_folds=3) 119 | assert len(record) == 1 120 | with pytest.warns(UserWarning) as record: 121 | score = opt.evaluate(None, dict) 122 | assert -np.Inf <= score 123 | 124 | 125 | def test_evaluate_and_optimise_classification(): 126 | """Test evaluate and optimise methods of Optimiser class.""" 127 | reader = Reader(sep=",") 128 | 129 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 130 | "data_for_tests/test.csv"], 131 | target_name="Survived") 132 | drift_thresholder = Drift_thresholder() 133 | drift_thresholder = drift_thresholder.fit_transform(dict) 134 | 135 | with pytest.warns(UserWarning) as record: 136 | opt = Optimiser(scoring='accuracy', n_folds=3) 137 | assert len(record) == 1 138 | dict_error = dict.copy() 139 | dict_error["target"] = dict_error["target"].astype(str) 140 | with pytest.raises(ValueError): 141 | score = opt.evaluate(None, dict_error) 142 | 143 | with pytest.warns(UserWarning) as record: 144 | opt = Optimiser(scoring='accuracy', n_folds=3) 145 | assert len(record) == 1 146 | score = opt.evaluate(None, dict) 147 | assert 0. <= score <= 1. 
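    # Search-space keys follow the "<step>__<parameter>" convention seen in
    # these tests: ne = NA encoder, ce = categorical encoder, fs = feature
    # selector, est = estimator; "search" names the sampling rule ("choice"
    # or "uniform") applied over the candidate values in "space".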
148 | 149 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 150 | 'ce__strategy': {"search": "choice", 151 | "space": ["label_encoding"]}, 152 | 'fs__threshold': {"search": "uniform", 153 | "space": [0.01, 0.3]}, 154 | 'est__max_depth': {"search": "choice", 155 | "space": [3, 4, 5, 6, 7]} 156 | 157 | } 158 | 159 | best = opt.optimise(space, dict, 1) 160 | assert type(best) == type(dict) 161 | -------------------------------------------------------------------------------- /tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.prediction.predictor module.""" 7 | import sys 8 | import pytest 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from mlbox.prediction.predictor import Predictor 13 | from mlbox.optimisation.optimiser import Optimiser 14 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder 15 | from mlbox.preprocessing.reader import Reader 16 | from mlbox.optimisation import make_scorer 17 | 18 | if sys.version_info[0] >= 3: 19 | from unittest.mock import patch 20 | 21 | 22 | set_backend = "import matplotlib\nmatplotlib.use('Agg')\n" 23 | 24 | 25 | def test_init_predictor(): 26 | """Test init method of Predictor class.""" 27 | predictor = Predictor() 28 | assert predictor.to_path == "save" 29 | assert predictor.verbose 30 | 31 | 32 | def test_get_params_predictor(): 33 | """Test get_params method of Predictor class.""" 34 | predictor = Predictor() 35 | dict = {'to_path': "save", 36 | 'verbose': True} 37 | assert predictor.get_params() == dict 38 | 39 | 40 | def test_set_params_predictor(): 41 | """Test set_params method of Predictor class.""" 42 | predictor = Predictor() 43 | predictor.set_params(to_path="name") 44 | assert predictor.to_path == "name" 45 | predictor.set_params(verbose=False) 46 | assert not predictor.verbose 47 | with pytest.warns(UserWarning) as record: 48 | predictor.set_params(wrong_key=3) 49 | assert len(record) == 1 50 | 51 | 52 | def test_fit_predict_predictor_classification(): 53 | """Test fit_predict method of Predictor class for classification.""" 54 | reader = Reader(sep=",") 55 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 56 | "data_for_tests/test.csv"], 57 | target_name="Survived") 58 | drift_thresholder = Drift_thresholder() 59 | drift_thresholder = drift_thresholder.fit_transform(dict) 60 | 61 | with pytest.warns(UserWarning) as record: 62 | opt = Optimiser(scoring='accuracy', n_folds=3) 63 | assert len(record) == 1 64 | 65 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 66 | 'ce__strategy': {"search": "choice", 67 | "space": ["entity_embedding"]}, 68 | 'fs__threshold': {"search": "uniform", 69 | "space": [0.01, 0.3]}, 70 | 'est__max_depth': {"search": "choice", 71 | "space": [3, 4, 5, 6, 7]} 72 | 73 | } 74 | 75 | optimal_hyper_parameters = opt.optimise(space, dict, 1) 76 | 77 | predictor = Predictor(verbose=False) 78 | predictor.fit_predict(optimal_hyper_parameters, dict) 79 | pred_df = pd.read_csv("save/Survived_predictions.csv") 80 | assert np.all(list(pred_df.columns) == ['Unnamed: 0', 81 | '0.0', 82 | '1.0', 83 | 'Survived_predicted']) 84 | assert np.shape(pred_df) == (418, 4) 85 | 86 | 87 | if sys.version_info[0] >= 3: 88 | @patch('matplotlib.pyplot.show') 89 | def test_fit_predict_predictor_regression(mock_show): 90 | """Test fit_predict method 
of Predictor class for regression.""" 91 | rd = Reader(sep=',') 92 | dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv", 93 | "data_for_tests/test_regression.csv"], 94 | target_name="SalePrice") 95 | 96 | drift_thresholder = Drift_thresholder() 97 | df = drift_thresholder.fit_transform(dt) 98 | 99 | mape = make_scorer(lambda y_true, 100 | y_pred: 100*np.sum( 101 | np.abs(y_true-y_pred)/y_true 102 | )/len(y_true), 103 | greater_is_better=False, 104 | needs_proba=False) 105 | opt = Optimiser(scoring=mape, n_folds=3) 106 | 107 | opt.evaluate(None, df) 108 | 109 | space = { 110 | 'ne__numerical_strategy': {"search": "choice", 111 | "space": [0]}, 112 | 'ce__strategy': {"search": "choice", 113 | "space": ["random_projection"]}, 114 | 'fs__threshold': {"search": "uniform", 115 | "space": [0.01, 0.3]}, 116 | 'est__max_depth': {"search": "choice", 117 | "space": [3, 4, 5, 6, 7]} 118 | 119 | } 120 | 121 | best = opt.optimise(space, df, 1) 122 | 123 | prd = Predictor(verbose=True) 124 | prd.fit_predict(best, df) 125 | pred_df = pd.read_csv("save/SalePrice_predictions.csv") 126 | assert np.all(list(pred_df.columns) == ['Unnamed: 0', 127 | 'SalePrice_predicted']) 128 | assert np.shape(pred_df) == (1459, 2) 129 | 130 | else: 131 | def test_fit_predict_predictor_regression(): 132 | """Test fit_predict method of Predictor class for regression.""" 133 | rd = Reader(sep=',') 134 | dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv", 135 | "data_for_tests/test_regression.csv"], 136 | target_name="SalePrice") 137 | 138 | drift_thresholder = Drift_thresholder() 139 | df = drift_thresholder.fit_transform(dt) 140 | 141 | mape = make_scorer(lambda y_true, 142 | y_pred: 100*np.sum( 143 | np.abs(y_true-y_pred)/y_true 144 | )/len(y_true), 145 | greater_is_better=False, 146 | needs_proba=False) 147 | opt = Optimiser(scoring=mape, n_folds=3) 148 | 149 | opt.evaluate(None, df) 150 | 151 | space = { 152 | 'ne__numerical_strategy': {"search": "choice", 153 | "space": [0]}, 154 | 'ce__strategy': {"search": "choice", 155 | "space": ["label_encoding", 156 | "random_projection", 157 | "entity_embedding"]}, 158 | 'fs__threshold': {"search": "uniform", 159 | "space": [0.01, 0.3]}, 160 | 'est__max_depth': {"search": "choice", 161 | "space": [3, 4, 5, 6, 7]} 162 | 163 | } 164 | 165 | best = opt.optimise(space, df, 1) 166 | 167 | prd = Predictor(verbose=False) 168 | prd.fit_predict(best, df) 169 | pred_df = pd.read_csv("save/SalePrice_predictions.csv") 170 | assert np.all(list(pred_df.columns) == ['Unnamed: 0', 171 | 'SalePrice_predicted']) 172 | assert np.shape(pred_df) == (1459, 2) 173 | -------------------------------------------------------------------------------- /tests/test_reader.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.reader module.""" 7 | import sys 8 | 9 | import pytest 10 | import pandas as pd 11 | import numpy as np 12 | 13 | 14 | from mlbox.preprocessing.reader import convert_list 15 | from mlbox.preprocessing.reader import convert_float_and_dates 16 | from mlbox.preprocessing.reader import Reader 17 | 18 | 19 | def test_init_reader(): 20 | """Test init method of Reader class.""" 21 | reader = Reader() 22 | assert not reader.sep 23 | assert reader.header == 0 24 | assert not reader.to_hdf5 25 | assert reader.to_path == "save" 26 | assert reader.verbose 27 | 28 | 29 | 
def test_clean_reader(): 30 | """Test clean method of Reader class.""" 31 | reader = Reader() 32 | with pytest.raises(ValueError): 33 | reader.clean(path=None, drop_duplicate=False) 34 | with pytest.raises(ValueError): 35 | reader.clean(path="data_for_tests/train.csv") 36 | reader = Reader(sep=",") 37 | df = reader.clean(path="data_for_tests/train.csv") 38 | assert np.shape(df) == (891, 12) 39 | with pytest.raises(ValueError): 40 | reader.clean(path="data_for_tests/train.wrong_extension") 41 | df_drop = reader.clean(path="data_for_tests/train.csv", 42 | drop_duplicate=True) 43 | assert np.shape(df_drop) == (891, 12) 44 | assert np.all(df["Name"] == df_drop["Name"]) 45 | reader = Reader() 46 | df_excel = reader.clean(path="data_for_tests/train.xls") 47 | assert np.shape(df_excel) == (891, 12) 48 | assert np.all(df["Name"] == df_excel["Name"]) 49 | if (sys.platform == "win32" and sys.version_info[0] <= 3 and sys.version_info[1] <= 5): 50 | pass 51 | else: 52 | if sys.version_info[0] >= 3: 53 | df_hdf = reader.clean(path="data_for_tests/train.h5") 54 | assert np.shape(df_hdf) == (891, 12) 55 | assert np.all(df["Name"] == df_hdf["Name"]) 56 | df_json = reader.clean(path="data_for_tests/train.json") 57 | assert np.shape(df_json) == (891, 12) 58 | 59 | 60 | def test_train_test_split_reader(): 61 | """Test train_test_split method of Reader class.""" 62 | reader = Reader(sep=",") 63 | with pytest.raises(ValueError): 64 | reader.train_test_split(Lpath=None, target_name="target") 65 | with pytest.raises(ValueError): 66 | reader.train_test_split(Lpath=["data_for_tests/train.csv"], 67 | target_name=None) 68 | with pytest.raises(ValueError): 69 | reader = Reader(to_path=None) 70 | reader.train_test_split(Lpath=["data_for_tests/train.csv"], 71 | target_name="Survived") 72 | reader = Reader(sep=",") 73 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"], 74 | target_name="Survived") 75 | assert len(dict) == 3 76 | assert "train" in list(dict.keys()) 77 | assert "test" in list(dict.keys()) 78 | assert "target" in list(dict.keys()) 79 | assert np.all(dict["train"].columns == dict["test"].columns) 80 | if (sys.version_info[0] >= 3 and sys.platform != "win32"): 81 | reader = Reader(to_hdf5=True) 82 | dict = reader.train_test_split(Lpath=["data_for_tests/train.h5"], 83 | target_name="Survived") 84 | assert len(dict) == 3 85 | assert "train" in list(dict.keys()) 86 | assert "test" in list(dict.keys()) 87 | assert "target" in list(dict.keys()) 88 | assert np.all(dict["train"].columns == dict["test"].columns) 89 | 90 | 91 | def test_convert_list_reader(): 92 | """Test convert_list function of reader module.""" 93 | data_list = list() 94 | data_list.append([1, 2]) 95 | data_list.append([3, 4]) 96 | index = ['a', 'b'] 97 | serie = pd.Series(data=data_list, index=index, name="test") 98 | df = convert_list(serie) 99 | assert np.all(df.index == serie.index) 100 | assert np.all(df.columns.values == ['test_item1', 'test_item2']) 101 | 102 | 103 | def test_convert_float_and_dates_reader(): 104 | """Test convert_float_and_dates function of reader module.""" 105 | index = ['a', 'b', 'c'] 106 | values = [1, 2, 3] 107 | serie = pd.Series(data=values, index=index) 108 | serie = convert_float_and_dates(serie) 109 | assert serie.dtype == 'float64' 110 | 111 | index = ['a', 'b', 'c'] 112 | values = np.array(['2007-07-13', '2006-01-13', '2010-08-13'], 113 | dtype='datetime64') 114 | serie = pd.Series(data=values, 115 | index=index, 116 | dtype='datetime64[ns]', 117 | name="test") 118 | df = 
convert_float_and_dates(serie) 119 | assert np.all(df.index == serie.index) 120 | assert np.all(df.columns.values == ['test_TIMESTAMP', 121 | 'test_YEAR', 122 | 'test_MONTH', 123 | 'test_DAY', 124 | 'test_DAYOFWEEK', 125 | 'test_HOUR']) 126 | 127 | index = ['a', 'b', 'c'] 128 | values = np.array(['2007-07-13', '2006-01-13', '2010-08-13']) 129 | serie = pd.Series(data=values, index=index, name="test") 130 | df = convert_float_and_dates(serie) 131 | assert np.all(df.index == serie.index) 132 | assert np.all(df.columns.values == ['test_TIMESTAMP', 133 | 'test_YEAR', 134 | 'test_MONTH', 135 | 'test_DAY', 136 | 'test_DAYOFWEEK', 137 | 'test_HOUR']) 138 | -------------------------------------------------------------------------------- /tests/test_regression_feature_selector.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | # import pytest 7 | """Test mlbox.model.regression.feature_selector module.""" 8 | import pytest 9 | import pandas as pd 10 | 11 | from mlbox.model.regression.feature_selector import Reg_feature_selector 12 | 13 | 14 | def test_init_Reg_feature_selector(): 15 | """Test init method of Reg_feature_selector class.""" 16 | feature_selector = Reg_feature_selector() 17 | assert feature_selector.strategy == "l1" 18 | assert feature_selector.threshold == 0.3 19 | assert not feature_selector._Reg_feature_selector__fitOK 20 | assert feature_selector._Reg_feature_selector__to_discard == [] 21 | 22 | 23 | def test_get_params_Reg_feature_selector(): 24 | """Test get_params method of Reg_feature_selector class.""" 25 | feature_selector = Reg_feature_selector() 26 | dict = {'strategy': "l1", 27 | 'threshold': 0.3} 28 | assert feature_selector.get_params() == dict 29 | 30 | 31 | def test_set_params_Reg_feature_selector(): 32 | """Test set_params method of Reg_feature_selector class.""" 33 | feature_selector = Reg_feature_selector() 34 | feature_selector.set_params(strategy="variance") 35 | assert feature_selector.strategy == "variance" 36 | feature_selector.set_params(threshold=0.2) 37 | assert feature_selector.threshold == 0.2 38 | with pytest.warns(UserWarning) as record: 39 | feature_selector.set_params(wrong_strategy="wrong_strategy") 40 | assert len(record) == 1 41 | 42 | 43 | def test_fit_Reg_feature_selector(): 44 | """Test fit method of Reg_feature_selector class.""" 45 | feature_selector = Reg_feature_selector() 46 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 47 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 48 | with pytest.raises(ValueError): 49 | feature_selector.fit(None, y_train) 50 | with pytest.raises(ValueError): 51 | feature_selector.fit(df_train, None) 52 | feature_selector.fit(df_train, y_train) 53 | assert feature_selector._Reg_feature_selector__fitOK 54 | feature_selector.set_params(strategy="variance") 55 | feature_selector.fit(df_train, y_train) 56 | assert feature_selector._Reg_feature_selector__fitOK 57 | feature_selector.set_params(strategy="rf_feature_importance") 58 | feature_selector.fit(df_train, y_train) 59 | assert feature_selector._Reg_feature_selector__fitOK 60 | feature_selector.set_params(strategy="wrong_strategy") 61 | with pytest.raises(ValueError): 62 | feature_selector.fit(df_train, y_train) 63 | 64 | 65 | def test_transform_Reg_feature_selector(): 66 | """Test transform method of Reg_feature_selector class.""" 67 | feature_selector = 
Reg_feature_selector(threshold=0) 68 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 69 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 70 | with pytest.raises(ValueError): 71 | feature_selector.transform(df_train) 72 | feature_selector.fit(df_train, y_train) 73 | with pytest.raises(ValueError): 74 | feature_selector.transform(None) 75 | df_transformed = feature_selector.transform(df_train) 76 | assert (df_transformed.columns == df_train.columns).all() 77 | 78 | 79 | def test_fit_transform_Reg_feature_selector(): 80 | """Test fit_transform method of Reg_feature_selector class.""" 81 | feature_selector = Reg_feature_selector(threshold=0) 82 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 83 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 84 | df_transformed = feature_selector.fit_transform(df_train, y_train) 85 | assert (df_transformed.columns == df_train.columns).all() 86 | -------------------------------------------------------------------------------- /tests/test_regressor.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | # import pytest 7 | """Test mlbox.model.regression.regressor module.""" 8 | import pytest 9 | import pandas as pd 10 | import numpy as np 11 | 12 | from mlbox.model.regression.regressor import Regressor 13 | from lightgbm import LGBMRegressor 14 | 15 | 16 | def test_init_regressor(): 17 | """Test init method of Regressor class.""" 18 | regressor = Regressor() 19 | assert regressor._Regressor__strategy == "LightGBM" 20 | assert regressor._Regressor__regress_params == {} 21 | assert regressor._Regressor__regressor 22 | assert not regressor._Regressor__col 23 | assert not regressor._Regressor__fitOK 24 | 25 | 26 | def test_get_params_regressor(): 27 | """Test get_params method of Regressor class.""" 28 | regressor = Regressor() 29 | params = regressor.get_params() 30 | assert params == {'strategy': "LightGBM"} 31 | assert not regressor._Regressor__regress_params 32 | 33 | 34 | def test_set_params_regressor(): 35 | """Test set_params method of Regressor class.""" 36 | regressor = Regressor() 37 | regressor.set_params(strategy="LightGBM") 38 | assert regressor._Regressor__strategy == "LightGBM" 39 | regressor.set_params(strategy="RandomForest") 40 | assert regressor._Regressor__strategy == "RandomForest" 41 | regressor.set_params(strategy="ExtraTrees") 42 | assert regressor._Regressor__strategy == "ExtraTrees" 43 | regressor.set_params(strategy="RandomForest") 44 | assert regressor._Regressor__strategy == "RandomForest" 45 | regressor.set_params(strategy="Tree") 46 | assert regressor._Regressor__strategy == "Tree" 47 | regressor.set_params(strategy="AdaBoost") 48 | assert regressor._Regressor__strategy == "AdaBoost" 49 | regressor.set_params(strategy="Linear") 50 | assert regressor._Regressor__strategy == "Linear" 51 | regressor.set_params(strategy="Bagging") 52 | assert regressor._Regressor__strategy == "Bagging" 53 | with pytest.warns(UserWarning) as record: 54 | regressor.set_params(wrong_strategy="wrong_strategy") 55 | assert len(record) == 1 56 | 57 | 58 | def test_set_regressor(): 59 | """Test set method of Regressor class.""" 60 | regressor = Regressor() 61 | with pytest.raises(ValueError): 62 | regressor._Regressor__set_regressor("wrong_strategy") 63 | 64 | 65 | def test_fit_regressor(): 66 | """Test fit method of 
Regressor class.""" 67 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 68 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 69 | regressor = Regressor() 70 | regressor.fit(df_train, y_train) 71 | assert np.all(regressor._Regressor__col == df_train.columns) 72 | assert regressor._Regressor__fitOK 73 | 74 | 75 | def test_feature_importances_regressor(): 76 | """Test feature_importances of Regressor class.""" 77 | regressor = Regressor() 78 | with pytest.raises(ValueError): 79 | regressor.feature_importances() 80 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 81 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 82 | regressor.set_params(strategy="LightGBM") 83 | regressor.fit(df_train, y_train) 84 | importance = regressor.feature_importances() 85 | assert importance != {} 86 | regressor.set_params(strategy="Linear") 87 | regressor.fit(df_train, y_train) 88 | importance = regressor.feature_importances() 89 | assert importance != {} 90 | regressor.set_params(strategy="RandomForest") 91 | regressor.fit(df_train, y_train) 92 | importance = regressor.feature_importances() 93 | assert importance != {} 94 | regressor.set_params(strategy="AdaBoost") 95 | regressor.fit(df_train, y_train) 96 | importance = regressor.feature_importances() 97 | assert importance != {} 98 | regressor.set_params(strategy="Bagging") 99 | regressor.fit(df_train, y_train) 100 | importance = regressor.feature_importances() 101 | assert importance != {} 102 | 103 | 104 | def test_predict_regressor(): 105 | """Test predict method of Regressor class.""" 106 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 107 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 108 | regressor = Regressor() 109 | with pytest.raises(ValueError): 110 | regressor.predict(df_train) 111 | regressor.fit(df_train, y_train) 112 | with pytest.raises(ValueError): 113 | regressor.predict(None) 114 | assert len(regressor.predict(df_train)) > 0 115 | 116 | 117 | def test_score_regressor(): 118 | """Test_score method of Regressor class.""" 119 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 120 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 121 | regressor = Regressor(strategy="Linear") 122 | with pytest.raises(ValueError): 123 | regressor.score(df_train, y_train) 124 | regressor.fit(df_train, y_train) 125 | with pytest.raises(ValueError): 126 | regressor.score(None, y_train) 127 | with pytest.raises(ValueError): 128 | regressor.score(df_train, None) 129 | assert regressor.score(df_train, y_train) > 0 130 | 131 | 132 | def test_get_estimator_regressor(): 133 | """Test get_estimator of Regressor class.""" 134 | regressor = Regressor() 135 | estimator = regressor.get_estimator() 136 | assert isinstance(estimator, type(LGBMRegressor())) 137 | -------------------------------------------------------------------------------- /tests/test_stacking_classifer.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.classification.stacking_classifier module.""" 7 | import pytest 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.linear_model import LogisticRegression 12 | from mlbox.model.classification.stacking_classifier import StackingClassifier 13 | 14 | 15 | def test_init_stacking_classifier(): 16 | """Test init method of 
StackingClassifier class.""" 17 | with pytest.raises(ValueError): 18 | stacking_classifier = StackingClassifier(base_estimators=dict()) 19 | with pytest.raises(ValueError): 20 | stacking_classifier = StackingClassifier(n_folds=dict()) 21 | with pytest.raises(ValueError): 22 | stacking_classifier = StackingClassifier(copy="True") 23 | with pytest.raises(ValueError): 24 | stacking_classifier = StackingClassifier(drop_first="True") 25 | with pytest.raises(ValueError): 26 | stacking_classifier = StackingClassifier(random_state="1") 27 | with pytest.raises(ValueError): 28 | stacking_classifier = StackingClassifier(verbose="True") 29 | stacking_classifier = StackingClassifier() 30 | assert len(stacking_classifier.base_estimators) == 3 31 | assert isinstance(stacking_classifier.level_estimator, 32 | type(LogisticRegression())) 33 | assert stacking_classifier.n_folds == 5 34 | assert not stacking_classifier.copy 35 | assert stacking_classifier.drop_first 36 | assert stacking_classifier.random_state == 1 37 | assert stacking_classifier.verbose 38 | assert not stacking_classifier._StackingClassifier__fitOK 39 | assert not stacking_classifier._StackingClassifier__fittransformOK 40 | 41 | 42 | def test_get_params_stacking_classifier(): 43 | """Test get_params method StackingClassifier class.""" 44 | stacking_classifier = StackingClassifier() 45 | dict = stacking_classifier.get_params() 46 | assert len(dict["base_estimators"]) == 3 47 | assert isinstance(dict["level_estimator"], 48 | type(LogisticRegression())) 49 | assert dict["n_folds"] == 5 50 | assert not dict["copy"] 51 | assert dict["drop_first"] 52 | assert dict["random_state"] == 1 53 | assert dict["verbose"] 54 | 55 | 56 | def test_set_params_stacking_classifier(): 57 | """Test set_params method of StackingClassifier class.""" 58 | stacking_classifier = StackingClassifier() 59 | stacking_classifier.set_params(n_folds=6) 60 | assert stacking_classifier.n_folds == 6 61 | stacking_classifier.set_params(copy=True) 62 | assert stacking_classifier.copy 63 | stacking_classifier.set_params(drop_first=False) 64 | assert not stacking_classifier.drop_first 65 | stacking_classifier.set_params(random_state=2) 66 | assert stacking_classifier.random_state == 2 67 | stacking_classifier.set_params(verbose=False) 68 | assert not stacking_classifier.verbose 69 | with pytest.warns(UserWarning) as record: 70 | stacking_classifier.set_params(wrong_parameters=None) 71 | assert len(record) == 1 72 | 73 | 74 | def test_fit_transform_stacking_classifier(): 75 | """Test fit_transform method of StackingClassifier class.""" 76 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 77 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 78 | stacking_classifier = StackingClassifier() 79 | with pytest.raises(ValueError): 80 | stacking_classifier.fit_transform(None, y_train) 81 | with pytest.raises(ValueError): 82 | stacking_classifier.fit_transform(df_train, None) 83 | stacking_classifier.fit_transform(df_train, y_train) 84 | assert stacking_classifier._StackingClassifier__fittransformOK 85 | 86 | 87 | def test_transform_stacking_classifier(): 88 | """Test transform method of StackingClassifier class.""" 89 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 90 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 91 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 92 | stacking_classifier = StackingClassifier() 93 | with pytest.raises(ValueError): 94 | stacking_classifier.transform(None) 95 | with 
pytest.raises(ValueError): 96 | stacking_classifier.transform(df_test) 97 | stacking_classifier.fit_transform(df_train, y_train) 98 | results = stacking_classifier.transform(df_test) 99 | assert len(results.columns) == 3 100 | 101 | 102 | def test_fit_stacking_classifier(): 103 | """Test fit method of StackingClassifier class.""" 104 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 105 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 106 | stacking_classifier = StackingClassifier(verbose=True) 107 | stacking_classifier.fit(df_train, y_train) 108 | assert stacking_classifier._StackingClassifier__fitOK 109 | 110 | 111 | def test_predict_proba_stacking_classifier(): 112 | """Test predict_proba method of StackingClassifier class.""" 113 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 114 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 115 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 116 | stacking_classifier = StackingClassifier() 117 | with pytest.raises(ValueError): 118 | stacking_classifier.predict_proba(df_test) 119 | stacking_classifier.fit(df_train, y_train) 120 | results = stacking_classifier.predict_proba(df_test) 121 | assert np.shape(results) == (418, 2) 122 | 123 | 124 | def test_predict_stacking_classifier(): 125 | """Test predict method of StackingClassifier class.""" 126 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 127 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 128 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 129 | stacking_classifier = StackingClassifier() 130 | with pytest.raises(ValueError): 131 | stacking_classifier.predict(df_test) 132 | stacking_classifier.fit(df_train, y_train) 133 | results = stacking_classifier.predict(df_test) 134 | assert np.shape(results) == (418,) 135 | -------------------------------------------------------------------------------- /tests/test_stacking_regressor.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.regression.stacking_regressor module.""" 7 | import pytest 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.linear_model import LinearRegression 12 | from mlbox.model.regression.stacking_regressor import StackingRegressor 13 | 14 | 15 | def test_init_stacking_regressor(): 16 | """Test init method of StackingRegressor class.""" 17 | with pytest.raises(ValueError): 18 | stacking_regressor = StackingRegressor(base_estimators=dict()) 19 | with pytest.raises(ValueError): 20 | stacking_regressor = StackingRegressor(n_folds=dict()) 21 | with pytest.raises(ValueError): 22 | stacking_regressor = StackingRegressor(copy="True") 23 | with pytest.raises(ValueError): 24 | stacking_regressor = StackingRegressor(random_state="1") 25 | with pytest.raises(ValueError): 26 | stacking_regressor = StackingRegressor(verbose="True") 27 | stacking_regressor = StackingRegressor() 28 | assert len(stacking_regressor.base_estimators) == 3 29 | assert isinstance(stacking_regressor.level_estimator, 30 | type(LinearRegression())) 31 | assert stacking_regressor.n_folds == 5 32 | assert not stacking_regressor.copy 33 | assert stacking_regressor.random_state == 1 34 | assert stacking_regressor.verbose 35 | assert not stacking_regressor._StackingRegressor__fitOK 36 | assert not 
stacking_regressor._StackingRegressor__fittransformOK 37 | 38 | 39 | def test_get_params_stacking_regressor(): 40 | """Test get_params method of StackingRegressor class.""" 41 | stacking_regressor = StackingRegressor() 42 | dict = stacking_regressor.get_params() 43 | assert len(dict["base_estimators"]) == 3 44 | assert isinstance(dict["level_estimator"], 45 | type(LinearRegression())) 46 | assert dict["n_folds"] == 5 47 | assert not dict["copy"] 48 | assert dict["random_state"] == 1 49 | assert dict["verbose"] 50 | 51 | 52 | def test_set_params_stacking_regressor(): 53 | """Test set_params method of StackingRegressor class.""" 54 | stacking_regressor = StackingRegressor() 55 | stacking_regressor.set_params(n_folds=6) 56 | assert stacking_regressor.n_folds == 6 57 | stacking_regressor.set_params(copy=True) 58 | assert stacking_regressor.copy 59 | stacking_regressor.set_params(random_state=2) 60 | assert stacking_regressor.random_state == 2 61 | stacking_regressor.set_params(verbose=False) 62 | assert not stacking_regressor.verbose 63 | with pytest.warns(UserWarning) as record: 64 | stacking_regressor.set_params(wrong_parameters=None) 65 | assert len(record) == 1 66 | 67 | 68 | def test_fit_transform_stacking_regressor(): 69 | """Test fit_transform method of StackingRegressor class.""" 70 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 71 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 72 | stacking_regressor = StackingRegressor() 73 | with pytest.raises(ValueError): 74 | stacking_regressor.fit_transform(None, y_train) 75 | with pytest.raises(ValueError): 76 | stacking_regressor.fit_transform(df_train, None) 77 | stacking_regressor.fit_transform(df_train, y_train) 78 | assert stacking_regressor._StackingRegressor__fittransformOK 79 | 80 | 81 | def test_transform_stacking_regressor(): 82 | """Test transform method of StackingRegressor class.""" 83 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 84 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 85 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 86 | stacking_regressor = StackingRegressor() 87 | with pytest.raises(ValueError): 88 | stacking_regressor.transform(None) 89 | with pytest.raises(ValueError): 90 | stacking_regressor.transform(df_test) 91 | stacking_regressor.fit_transform(df_train, y_train) 92 | results = stacking_regressor.transform(df_test) 93 | assert len(results.columns) == 3 94 | 95 | 96 | def test_fit_stacking_regressor(): 97 | """Test fit method of StackingRegressor class.""" 98 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 99 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 100 | stacking_regressor = StackingRegressor(verbose=True) 101 | stacking_regressor.fit(df_train, y_train) 102 | assert stacking_regressor._StackingRegressor__fitOK 103 | 104 | 105 | def test_predict_stacking_regressor(): 106 | """Test predict method of StackingRegressor class.""" 107 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 108 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 109 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 110 | stacking_regressor = StackingRegressor() 111 | with pytest.raises(ValueError): 112 | stacking_regressor.predict(df_test) 113 | stacking_regressor.fit(df_train, y_train) 114 | results = stacking_regressor.predict(df_test) 115 | assert np.shape(results) == (418,) 116 | --------------------------------------------------------------------------------
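# A minimal end-to-end sketch of the workflow these tests exercise, assuming
# the same Titanic-style CSVs used above; the search space here is illustrative
# and not part of the test suite.
from mlbox.preprocessing.reader import Reader
from mlbox.preprocessing.drift_thresholder import Drift_thresholder
from mlbox.optimisation.optimiser import Optimiser
from mlbox.prediction.predictor import Predictor

data = Reader(sep=",").train_test_split(
    Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
    target_name="Survived")                      # dict with "train", "test", "target"
data = Drift_thresholder().fit_transform(data)   # drop features that drift between sets
opt = Optimiser(scoring="accuracy", n_folds=3)   # warns once, as asserted in the tests
space = {'est__max_depth': {"search": "choice", "space": [3, 5, 7]}}
best = opt.optimise(space, data, 1)              # returns the best hyper-parameters
Predictor(verbose=False).fit_predict(best, data) # writes save/Survived_predictions.csv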