├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── VERSION.txt
├── codecov.yml
├── docs
│   ├── Makefile
│   ├── authors.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── features.rst
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── introduction.rst
│   ├── logos
│   │   ├── logo.png
│   │   ├── small_logo.ico
│   │   └── small_logo.png
│   ├── make.bat
│   └── webinars
│       ├── auto-ML.pdf
│       └── features.pdf
├── examples
│   ├── classification
│   │   ├── classification.py
│   │   ├── example.ipynb
│   │   ├── test_classification.csv
│   │   └── train_classification.csv
│   └── regression
│       ├── example.ipynb
│       ├── regression.py
│       ├── test_regression.csv
│       └── train_regression.csv
├── mlbox
│   ├── __init__.py
│   ├── encoding
│   │   ├── __init__.py
│   │   ├── categorical_encoder.py
│   │   └── na_encoder.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   ├── classifier.py
│   │   │   ├── feature_selector.py
│   │   │   └── stacking_classifier.py
│   │   └── regression
│   │       ├── __init__.py
│   │       ├── feature_selector.py
│   │       ├── regressor.py
│   │       └── stacking_regressor.py
│   ├── optimisation
│   │   ├── __init__.py
│   │   └── optimiser.py
│   ├── prediction
│   │   ├── __init__.py
│   │   └── predictor.py
│   └── preprocessing
│       ├── __init__.py
│       ├── drift
│       │   ├── __init__.py
│       │   ├── drift_estimator.py
│       │   └── drift_threshold.py
│       ├── drift_thresholder.py
│       └── reader.py
├── requirements.txt
├── setup.py
└── tests
    ├── .DS_Store
    ├── __init__.py
    ├── data_for_tests
    │   ├── clean_target.csv
    │   ├── clean_test.csv
    │   ├── clean_train.csv
    │   ├── inplace_test.csv
    │   ├── inplace_train.csv
    │   ├── test.csv
    │   ├── test_regression.csv
    │   ├── train.csv
    │   ├── train.h5
    │   ├── train.json
    │   ├── train.xls
    │   └── train_regression.csv
    ├── test_categorical_encoder.py
    ├── test_classification_feature_selector.py
    ├── test_classifier.py
    ├── test_drift_estimator.py
    ├── test_drift_threshold.py
    ├── test_drift_thresholder.py
    ├── test_na_encoder.py
    ├── test_optimiser.py
    ├── test_predictor.py
    ├── test_reader.py
    ├── test_regression_feature_selector.py
    ├── test_regressor.py
    ├── test_stacking_classifer.py
    └── test_stacking_regressor.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.pytest_cache/
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pycharm
.idea
.DS_Store

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# save folders
*save/

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
matrix:
  include:
    - os: linux
      python: '3.5'
    - os: linux
      python: '3.6'
    - os: linux
      python: '3.7'
    - os: osx
      language: generic
      python: '3.5'
      before_install:
        - brew install libomp
        - brew upgrade pyenv
        - brew install pyenv-virtualenv
        - pyenv install 3.5.6
        - eval "$(pyenv init -)"
        - pyenv virtualenv 3.5.6 venv
        - pyenv activate venv
    - os: osx
      language: generic
      python: '3.6'
      before_install:
        - brew install libomp
        - brew upgrade pyenv
        - brew install pyenv-virtualenv
        - pyenv install 3.6.7
        - eval "$(pyenv init -)"
        - pyenv virtualenv 3.6.7 venv
        - pyenv activate venv
    - os: osx
      language: generic
      python: '3.7'
      before_install:
        - brew install libomp
        - brew upgrade pyenv
        - brew install pyenv-virtualenv
        - pyenv install 3.7.2
        - eval "$(pyenv init -)"
        - pyenv virtualenv 3.7.2 venv
        - pyenv activate venv
    - os: windows
      language: sh
      python: '3.5'
      before_install:
        - choco install python --version 3.5.4
        - export PATH="/c/Python35:/c/Python35/Scripts:$PATH"
    - os: windows
      language: sh
      python: '3.6'
      before_install:
        - choco install python --version 3.6.7
        - export PATH="/c/Python36:/c/Python36/Scripts:$PATH"
    - os: windows
      language: sh
      python: '3.7'
      before_install:
        - choco install python --version 3.7.2
        - export PATH="/c/Python37:/c/Python37/Scripts:$PATH"
install:
  - pip install coverage
  - pip install codecov
  - pip install -U pytest
  - pip install --upgrade setuptools wheel
script:
  - python setup.py install
  - cd tests
  - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.7" ] ; then
    coverage run -m --source=../mlbox/ pytest; fi
  - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.7" ] ; then
    pytest; fi
  - if [ "$TRAVIS_OS_NAME" = "osx" ] ; then pytest; fi
  - if [ "$TRAVIS_OS_NAME" = "windows" ] ; then pytest; fi
  - cd ..
after_success:
  - codecov
deploy:
  provider: pypi
  user: AxeldeRomblay
  password:
    secure: l4S5cjkkjhj82j3Tq51/zkBEkjOfSl9xaISu9rmcQNQUbsqp1qrLiKmcMVm0mirNezhTnNdeeCWRyeuvXBNpbRq37KKM6NGScmbAPdCKZeDw6/wDOwjzaMpsnzynq7EiowrgrawwffTa1kP6dgzkG4U/ftjd1jNdNMmOz5MyMnkS2cVv2Uy0o/g7MPQ1hIVAGpoLtnjJ+iGZrQrCWGOr9zp6k003T0xGlS9oEPLM1yid1s1Aeeq8p8Jaee2gGbhpOZ8fySHPcBX2e7TThgoqwfN/wvDzBwko5VPHTaWiVa9FW4zirwyE9EK8LmjAuodF63QOBujO5YTCf1ja5iC5czxZrjNsZCznXmsVqZlyetF2aMofDk++0T0zCmXpMRjivmLV0O/ZSl/HDkMua1TdPuink+FKdGrwCH/IzyeAfT95yVisiRpmgNAhn8/IW/U8v87voquy+YoVL6egSjoB5EyEnzSoojK7qyRPCPmFmKcJHK3aoT3yocwgOSgClqX1gbrYrXAKkXR8lPp7VlZdNKIbKQLu6TILAOVILsAU2MFJbomMAREL/kM9tB3jOj34gKl0qghMOM10BUnWZ3L+MrNamm/0nrnFhlsI8OIVB47ahOnhVZsLk1H2LGZDwBvJTv2gzEG0mUaQaA45/dxJWvR9IZpObEu6T/U/e+uKI+g=
  skip_existing: true
  skip_cleanup: true
  on:
    condition: $TRAVIS_OS_NAME != "windows"
    repo: AxeldeRomblay/MLBox
    branch: master

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD-3 License
Copyright (c) 2017, Axel ARONIO DE ROMBLAY
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
* Neither the name of MLBox nor the names of its contributors may be used
  to endorse or promote products derived from this software without specific
  prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL AXEL ARONIO DE ROMBLAY BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.md
include *.rst
include *.txt

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts.

clean-build: ## remove build artifacts
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

clean-test: ## remove test and coverage artifacts
	cd tests/; \
	rm -fr .tox/; \
	rm -f .coverage; \
	rm -fr htmlcov/

test: ## run tests quickly with the default Python
	cd tests/; \
	pytest

coverage: ## check code coverage quickly with the default Python
	cd tests/; \
	coverage run -m --source=../mlbox/ pytest;\
	coverage html;\
	$(BROWSER) htmlcov/index.html

docs: ## generate Sphinx HTML documentation, including API docs
	rm -f docs/mlbox.rst
	rm -f docs/modules.rst
	sphinx-apidoc -o docs/ mlbox
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	$(BROWSER) docs/_build/html/index.html

release: ## package and upload a release
	python setup.py sdist upload
	python setup.py bdist_wheel upload

dist: ## build source and wheel packages
	python setup.py sdist
	python setup.py bdist_wheel
	ls -l dist

install: ## install the package to the active Python's site-packages
	python setup.py install

develop: ## install the package to the active Python's site-packages in developer mode
	python setup.py develop

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
.. image:: docs/logos/logo.png

|Documentation Status| |PyPI version| |Build Status| |GitHub Issues| |codecov| |License| |Downloads| |Python Versions|

-----------------------

**MLBox is a powerful Automated Machine Learning Python library.** It provides the following features:


* Fast reading and distributed data preprocessing/cleaning/formatting
* Highly robust feature selection and leak detection
* Accurate hyper-parameter optimization in high-dimensional space
* State-of-the-art predictive models for classification and regression (Deep Learning, Stacking, LightGBM, ...)
* Prediction with model interpretation


**For more details**, please refer to the `official documentation `__
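**Quick start:** here is a minimal end-to-end sketch, adapted from the snippets in the official documentation. The file paths and target name below are placeholders to replace with your own; ``None`` selects the default pipeline configuration.

.. code-block:: python

    from mlbox.preprocessing import *
    from mlbox.optimisation import *
    from mlbox.prediction import *

    paths = ["train.csv", "test.csv"]  # placeholder paths to your train/test files
    target_name = "target"             # placeholder name of the column to predict

    data = Reader(sep=",").train_test_split(paths, target_name)  # reading & preprocessing
    data = Drift_thresholder().fit_transform(data)               # removing non-stable features
    Optimiser().evaluate(None, data)                             # evaluating the default pipeline
    Predictor().fit_predict(None, data)                          # fitting & predicting with the default configuration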
--------------------------

How to Contribute
=================

MLBox has been developed and used by many active community members. Your help is very valuable to make it better for everyone.

- Check out `call for contributions `__ to see what can be improved, or open an issue if you want to request something.
- Contribute to the `tests `__ to make it more reliable.
- Contribute to the `documentation `__ to make it clearer for everyone.
- Contribute to the `examples `__ to share your experience with other users.
- Open an `issue `__ if you encounter problems during development.

For more details, please refer to `CONTRIBUTING `__.

.. |Documentation Status| image:: https://readthedocs.org/projects/mlbox/badge/?version=latest
   :target: https://mlbox.readthedocs.io/en/latest/
.. |PyPI version| image:: https://badge.fury.io/py/mlbox.svg
   :target: https://pypi.python.org/pypi/mlbox
.. |Build Status| image:: https://travis-ci.org/AxeldeRomblay/MLBox.svg?branch=master
   :target: https://travis-ci.org/AxeldeRomblay/MLBox
.. |GitHub Issues| image:: https://img.shields.io/github/issues/AxeldeRomblay/MLBox.svg
   :target: https://github.com/AxeldeRomblay/MLBox/issues
.. |codecov| image:: https://codecov.io/gh/AxeldeRomblay/MLBox/branch/master/graph/badge.svg
   :target: https://codecov.io/gh/AxeldeRomblay/MLBox
.. |License| image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg
   :target: https://github.com/AxeldeRomblay/MLBox/blob/master/LICENSE
.. |Downloads| image:: https://pepy.tech/badge/mlbox
   :target: https://pepy.tech/project/mlbox
.. |Python Versions| image:: https://img.shields.io/pypi/pyversions/mlbox.svg
   :target: https://pypi.org/project/mlbox

--------------------------------------------------------------------------------
/VERSION.txt:
--------------------------------------------------------------------------------
0.8.5

--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
codecov:
  token: 989a47e4-aa64-4cbd-8516-52d00e1eb129
  notify:
    require_ci_to_pass: yes
coverage:
  precision: 2
  round: up
  range: "50...100"
  status:
    project:
      default:
        # Commits pushed to master should not make the overall
        # project coverage decrease by more than 1%
        target: auto
        threshold: 1%
    patch:
      default:
        # Be tolerant on slight code coverage diff on PRs to limit
        # noisy red coverage status on github PRs.
        target: auto
        threshold: 1%
    changes: no
parsers:
  gcov:
    branch_detection:
      conditional: yes
      loop: yes
      method: no
      macro: no

comment:
  layout: "header, diff"
  behavior: default
  require_changes: no

--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mlbox.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mlbox.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/mlbox"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mlbox"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
=======
Authors
=======

Development Lead
----------------

* Axel ARONIO DE ROMBLAY

  * email:
  * linkedin:

Contributors
------------

* Nicolas CHEREL
* Mohamed MASKANI
* Henri GERARD

--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# mlbox documentation build configuration file, created by
# sphinx-quickstart on Tue Jul 9 22:26:36 2013.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import sys
import os
from mock import Mock as MagicMock


class Mock(MagicMock):
    @classmethod
    def __getattr__(cls, name):
        return MagicMock()


MOCK_MODULES = ['numpy',
                'matplotlib',
                'matplotlib.pyplot',
                'hyperopt',
                'joblib',
                'pandas',
                'sklearn',
                'sklearn.ensemble',
                'sklearn.metrics',
                'sklearn.impute',
                'sklearn.linear_model',
                'sklearn.model_selection',
                'sklearn.tree',
                'sklearn.pipeline',
                'sklearn.preprocessing',
                'tensorflow',
                'tensorflow.keras.layers',
                'tensorflow.keras.models',
                'lightgbm'
                ]

sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)


# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory is
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# Get the project root dir, which is the parent dir of this
cwd = os.getcwd()
project_root = os.path.dirname(cwd)

# Insert the project root dir as the first element in the PYTHONPATH.
# This lets us ensure that the source package is imported, and that its
# version is used.

sys.path.insert(0, project_root)

#import mlbox

# -- General configuration ---------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon']
napoleon_numpy_docstring = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'MLBox'
copyright = u"2017, Axel ARONIO DE ROMBLAY"

# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
# the built documents.
#
# The short X.Y version.
#version = mlbox.__version__
# The full version, including alpha/beta/rc tags.
#release = mlbox.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to
# some non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']

# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built
# documents.
#keep_warnings = False


# -- Options for HTML output -------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
html_title = "MLBox Documentation"

# A shorter title for the navigation bar. Default is the same as
# html_title.
html_short_title = "MLBox Documentation"

# The name of an image file (relative to this directory) to place at the
# top of the sidebar.
html_logo = "logos/small_logo.png"

# The name of an image file (within the static path) to use as favicon
# of the docs. This file should be a Windows icon file (.ico) being
# 16x16 or 32x32 pixels large.
html_favicon = "logos/small_logo.ico"

# Add any paths that contain custom static files (such as style sheets)
# here, relative to this directory. They are copied after the builtin
# static files, so a file named "default.css" will overwrite the builtin
# "default.css".
html_static_path = ['_static']

# If not '', a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names
# to template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer.
# Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer.
# Default is True.
html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages
# will contain a <link> tag referring to it. The value of this option
# must be the base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'mlboxdoc'


# -- Options for LaTeX output ------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass
# [howto/manual]).
latex_documents = [
    ('index', 'mlbox.tex',
     u'MLBox Documentation',
     u'Axel ARONIO DE ROMBLAY', 'manual'),
]

# The name of an image file (relative to this directory) to place at
# the top of the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings
# are parts, not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output ------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'mlbox',
     u'MLBox Documentation',
     [u'Axel ARONIO DE ROMBLAY'], 1)
]

# If true, show URL addresses after external links.
#man_show_urls = False


# -- Options for Texinfo output ----------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    ('index', 'mlbox',
     u'MLBox Documentation',
     u'Axel ARONIO DE ROMBLAY',
     'mlbox',
     'MLBox is a powerful Automated Machine Learning python library.',
     'Miscellaneous'),
]

# Documents to append as an appendix to all manuals.
#texinfo_appendices = []

# If false, no module index is generated.
#texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False

--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
============
Contributing
============

Contributions are welcome, and they are greatly appreciated! Every
little bit helps, and credit will always be given.

You can contribute in many ways:

Types of Contributions
----------------------

Report Bugs
~~~~~~~~~~~

Report bugs at https://github.com/AxeldeRomblay/mlbox/issues.

If you are reporting a bug, please include:

* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* The smallest possible example to reproduce the bug.

Fix Bugs
~~~~~~~~

Look through the GitHub issues for bugs. Anything tagged with "bug"
and "help wanted" is open to whoever wants to implement it.

Implement Features
~~~~~~~~~~~~~~~~~~

Look through the GitHub issues for features. Anything tagged with "enhancement"
and "help wanted" is open to whoever wants to implement it.

Write Documentation
~~~~~~~~~~~~~~~~~~~

MLBox could always use more documentation, whether as part of the
official MLBox docs, in docstrings, or even on the web in blog posts,
articles, and such.

Submit Feedback
~~~~~~~~~~~~~~~

The best way to send feedback is to file an issue at https://github.com/AxeldeRomblay/mlbox/issues.

If you are proposing a feature:

* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
  are welcome :)

Get Started!
------------

Ready to contribute? Here's how to set up `mlbox` for local development.

1. Fork the `mlbox` repo on GitHub.

2. Clone your fork::

    $ git clone git@github.com:your_name_here/mlbox.git

3. If you already have virtualenv installed, skip this step. Otherwise, run the following::

    $ pip install virtualenv

4. Install your local copy into a virtualenv by following these commands to set up your fork for local development::

    $ cd MLBox
    $ virtualenv env
    $ source env/bin/activate
    $ python setup.py develop

   If you have any trouble with the setup, please refer to the `installation guide `__
5. Create a branch for local development::

    $ git checkout -b name-of-your-bugfix-or-feature

   **Now you're set, you can make your changes locally.**

   NOTE: each time you work on your branch, you will need to activate the virtualenv: ``$ source env/bin/activate``. To deactivate it, simply run: ``$ deactivate``.

6. When you're done making changes, check that your changes pass the tests.

   NOTE: you need to install **pytest** before running the tests::

    $ cd tests
    $ pytest

7. Commit your changes and push your branch to GitHub::

    $ git add .
    $ git commit -m "Your detailed description of your changes."
    $ git push origin name-of-your-bugfix-or-feature

8. Submit a pull request through the GitHub website.

Pull Request Guidelines
-----------------------

Before you submit a pull request, check that it meets these guidelines:

1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
   your new functionality into a function with a docstring.
3. The pull request should work for all supported Python versions and for PyPy. Check
   https://travis-ci.org/AxeldeRomblay/MLBox/pull_requests
   and make sure that the tests pass for all supported Python versions.

--------------------------------------------------------------------------------
/docs/features.rst:
--------------------------------------------------------------------------------
Preprocessing
=============

Reading
-------

.. autoclass:: mlbox.preprocessing.Reader
   :members:

Drift thresholding
------------------

.. autoclass:: mlbox.preprocessing.Drift_thresholder
   :members:

Encoding
========

Missing values
--------------

.. autoclass:: mlbox.encoding.NA_encoder
   :members:

Categorical features
--------------------

.. autoclass:: mlbox.encoding.Categorical_encoder
   :members:

Model
=====

Classification
--------------

Feature selection
~~~~~~~~~~~~~~~~~

.. autoclass:: mlbox.model.classification.Clf_feature_selector
   :members:

Classification
~~~~~~~~~~~~~~

.. autoclass:: mlbox.model.classification.Classifier
   :members:

Stacking
~~~~~~~~

.. autoclass:: mlbox.model.classification.StackingClassifier
   :members:

Regression
----------

Feature selection
~~~~~~~~~~~~~~~~~

.. autoclass:: mlbox.model.regression.Reg_feature_selector
   :members:

Regression
~~~~~~~~~~

.. autoclass:: mlbox.model.regression.Regressor
   :members:

Stacking
~~~~~~~~

.. autoclass:: mlbox.model.regression.StackingRegressor
   :members:


Optimisation
============

.. autoclass:: mlbox.optimisation.Optimiser
   :members:

Prediction
==========

.. autoclass:: mlbox.prediction.Predictor
   :members:

--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
History
=======

0.1.0 (2017-02-09)
------------------
* First non-official release.
0.1.1 (2017-02-23)
------------------
* added several estimators: Random Forest, Extra Trees, Logistic Regression, ...
* improved verbose mode for reader.

0.1.2 (2017-03-02)
------------------
* added dropout for entity embeddings.
* improved optimiser.

0.2.0 (2017-03-22)
------------------
* added feature importances for base learners.
* added leak detection.
* added stacking meta-model.
* improved verbose mode for optimiser (folds variance).

0.2.1 (2017-04-26)
------------------
* added feature importances for bagging and boosting meta-models.

0.2.2 (first official release: 2017-06-13)
-------------------------------------------
* updated dependencies (Keras 2.0, ...).
* added LightGBM model.

0.3.0 (2017-07-11)
------------------
* Python 2.7 & Python 3.4-3.6 compatibilities

0.3.1 (2017-07-12)
------------------
* availability on PyPI.

0.4.0 (2017-07-18)
------------------
* added pipeline memory.

0.4.1 (2017-07-21)
------------------
* improved verbose mode for reader (display missing values)

0.4.2 (2017-07-25)
------------------
* updated dependencies

0.4.3 (2017-07-26)
------------------
* improved verbose mode for predictor (display feature importances)
* wait until modules and engines are imported

0.4.4 (2017-08-04)
------------------
* pep8 style
* normalization of drift coefficients
* warning on the size of the 'save' folder

0.5.0 (2017-08-24)
------------------
* improved verbose mode
* added new date features
* added a new strategy for missing categorical values
* new parallel computing

0.5.1 (2017-08-25)
------------------
* improved verbose mode for reader (display target quantiles for regression)

0.6.0 (2019-04-26)
------------------
* removed xgboost installation

0.7.0 (2019-06-26)
------------------
* added support for Mac OS & Windows
* updated supported python versions
* improved setup
* added tests
* improved documentation & examples
* minor changes in the package architecture

0.8.0 (2019-07-29)
------------------
* removed support for python 2.7

0.8.1 (2019-08-29)
------------------
* added python 3.7 support
* updated package dependencies

0.8.4 (2020-04-13)
------------------
* updated package dependencies

0.8.5 (2020-08-25)
------------------
* minor fix (package dependencies)

--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
Home - Welcome to MLBox's official documentation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

------------------

.. image:: logos/logo.png


**MLBox is a powerful Automated Machine Learning Python library.**
It provides the following features:

* Fast reading and distributed data preprocessing/cleaning/formatting.
* Highly robust feature selection and leak detection.
* Accurate hyper-parameter optimization in high-dimensional space.
* State-of-the-art predictive models for classification and regression (Deep Learning, Stacking, LightGBM, ...).
* Prediction with model interpretation.

-------------------

Links
~~~~~

* **Performance experiments:**

  * `Kaggle competition "Two Sigma Connect: Rental Listing Inquiries" `__ (rank: **85/2488**)
  * `Kaggle competition "Sberbank Russian Housing Market" `__ (rank: **190/3274**)

* **Examples & demos:**

  * `Kaggle kernel on "Titanic" dataset `__ (classification)
  * `Kaggle kernel on "House Prices" dataset `__ (regression)

* **Articles, books & tutorials from users:**

  * `Tutorial on Automated Machine Learning using MLBox `__ (Analytics Vidhya article)
  * `MLBox: a short regression tutorial `__ (user blog)
  * `Implementing Auto-ML Systems with Open Source Tools `__ (KDnuggets article)
  * `Hands-On Automated Machine Learning `__ (O'Reilly book)
  * `Automatic Machine Learning `__ (Youtube tutorial)
  * `Automated Machine Learning with MLBox `__ (user blog)
  * `Introduction to AutoML with MLBox `__ (user blog)

* **Webinars & conferences:**

  * `Paris ML Hors Série #13: Automated Machine Learning `__
  * `Analytics Vidhya: Automated Machine Learning using MLBox python package `__
  * `DataHack Summit 2017 by Analytics Vidhya `__

* **References:**

  * `AutoML.org `__
  * `Skymind AI Wiki `__
  * `TPOT github `__
  * `Towards Data Science `__


.. toctree::
   :maxdepth: 1
   :caption: Tutorials
   :hidden:

   installation
   introduction

.. toctree::
   :maxdepth: 3
   :caption: Features
   :hidden:

   features

.. toctree::
   :maxdepth: 1
   :caption: Contribution
   :hidden:

   authors
   history
   contributing

--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
Installation guide
==================

|Documentation Status| |PyPI version| |Build Status| |GitHub Issues| |codecov| |License| |Downloads| |Python Versions|


Compatibilities
---------------

* *Operating systems:* **Linux**, **MacOS** & **Windows**.
* *Python versions:* **3.5** to **3.7**, **64-bit** only (32-bit Python is not supported; see the quick check below).
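Since only 64-bit builds are supported, here is a quick, optional way to check your interpreter before installing (a generic Python one-liner, not an MLBox command):

.. code-block:: console

    $ python -c "import struct; print(struct.calcsize('P') * 8, 'bit')"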
Basic requirements
------------------

We assume that `pip `__ is already installed.

Also, please make sure you have `setuptools `__ and `wheel `__ installed, which is usually the case if pip is installed.
If not, you can install both by running the following commands respectively: ``pip install setuptools`` and ``pip install wheel``.


Preparation (MacOS only)
------------------------

For **MacOS** users only, **OpenMP** is required. You can install it with the following command: ``brew install libomp``.


Installation
------------

You can choose to install MLBox either from pip or from GitHub.


Install from pip
~~~~~~~~~~~~~~~~

Official releases of MLBox are available on **PyPI**, so you only need to run the following command:

.. code-block:: console

    $ pip install mlbox


Install from GitHub
~~~~~~~~~~~~~~~~~~~

If you want to get the latest features, you can also install MLBox from GitHub.

* **The sources for MLBox can be downloaded** from the `Github repo`_.

* You can either clone the public repository:

.. code-block:: console

    $ git clone git://github.com/AxeldeRomblay/mlbox

* Or download the `tarball`_:

.. code-block:: console

    $ curl -OL https://github.com/AxeldeRomblay/mlbox/tarball/master


* Once you have a copy of the source, **you can install it**:

.. code-block:: console

    $ cd MLBox
    $ python setup.py install


Issues
------

If you run into any trouble during installation, you can refer to the `issues `__.

**Please first check that a similar issue has not already been opened before opening a new one.**


.. _Github repo: https://github.com/AxeldeRomblay/mlbox

.. _tarball: https://github.com/AxeldeRomblay/mlbox/tarball/master

.. |Documentation Status| image:: https://readthedocs.org/projects/mlbox/badge/?version=latest
   :target: http://mlbox.readthedocs.io/en/latest/?badge=latest
.. |PyPI version| image:: https://badge.fury.io/py/mlbox.svg
   :target: https://pypi.python.org/pypi/mlbox
.. |Build Status| image:: https://travis-ci.org/AxeldeRomblay/MLBox.svg?branch=master
   :target: https://travis-ci.org/AxeldeRomblay/MLBox
.. |GitHub Issues| image:: https://img.shields.io/github/issues/AxeldeRomblay/MLBox.svg
   :target: https://github.com/AxeldeRomblay/MLBox/issues
.. |codecov| image:: https://codecov.io/gh/AxeldeRomblay/MLBox/branch/master/graph/badge.svg
   :target: https://codecov.io/gh/AxeldeRomblay/MLBox
.. |License| image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg
   :target: https://github.com/AxeldeRomblay/MLBox/blob/master/LICENSE
.. |Downloads| image:: https://pepy.tech/badge/mlbox
   :target: https://pepy.tech/project/mlbox
.. |Python Versions| image:: https://img.shields.io/pypi/pyversions/mlbox.svg
   :target: https://pypi.org/project/mlbox

--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
Getting started: 30 seconds to MLBox
====================================

MLBox's main package contains 3 sub-packages: **preprocessing**, **optimisation** and **prediction**. They are aimed, respectively, at reading and preprocessing data, testing or optimising a wide range of learners, and predicting the target on a test dataset.

**Here are a few lines to import MLBox:**

.. code-block:: python

    from mlbox.preprocessing import *
    from mlbox.optimisation import *
    from mlbox.prediction import *


**Then, all you need to provide is:**

* the list of paths to your train datasets and test datasets
* the name of the target you are trying to predict (classification or regression)

.. code-block:: python

    paths = ["<file_1>.csv", "<file_2>.csv", ..., "<file_n>.csv"]  # to modify
    target_name = "<my_target>"  # to modify


**Now, let MLBox do the job!**

... to read and preprocess your files:

.. code-block:: python

    data = Reader(sep=",").train_test_split(paths, target_name)  # reading
    data = Drift_thresholder().fit_transform(data)  # deleting non-stable variables

... to evaluate models (here with the default configuration):
.. code-block:: python

    opt = Optimiser()
    opt.evaluate(None, data)


... or to test and optimise the whole pipeline [**OPTIONAL**]:

* missing data encoder, aka 'ne'
* categorical variables encoder, aka 'ce'
* feature selector, aka 'fs'
* meta-features stacker, aka 'stck'
* final estimator, aka 'est'

**NB**: please have a look at all the possibilities you have to configure the pipeline (steps, parameters and values...)

.. code-block:: python

    space = {

        'ne__numerical_strategy': {"space": [0, 'mean']},

        'ce__strategy': {"space": ["label_encoding", "random_projection", "entity_embedding"]},

        'fs__strategy': {"space": ["variance", "rf_feature_importance"]},
        'fs__threshold': {"search": "choice", "space": [0.1, 0.2, 0.3]},

        'est__strategy': {"space": ["LightGBM"]},
        'est__max_depth': {"search": "choice", "space": [5, 6]},
        'est__subsample': {"search": "uniform", "space": [0.6, 0.9]}

    }

    best = opt.optimise(space, data, max_evals=5)

... finally, to predict on the test set with the best parameters (or None for the default configuration):

.. code-block:: python

    Predictor().fit_predict(best, data)


**That's all!** You can have a look at the "save" folder, where you can find:

* your predictions
* feature importances
* drift coefficients of your variables (0.5 = very stable, 1. = not stable at all)
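For instance, here is a minimal sketch to inspect the saved predictions with pandas. The file name below is an assumption for illustration (predictions are typically saved as ``<target_name>_predictions.csv``); check the "save" folder for the exact names:

.. code-block:: python

    import pandas as pd

    # Assumed output path; adjust to the actual file in your "save" folder.
    preds = pd.read_csv("save/" + target_name + "_predictions.csv")
    print(preds.head())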
--------------------------------------------------------------------------------
/docs/logos/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/logo.png

--------------------------------------------------------------------------------
/docs/logos/small_logo.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/small_logo.ico

--------------------------------------------------------------------------------
/docs/logos/small_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/logos/small_logo.png

--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)

if "%1" == "" goto help

if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and a HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  texinfo    to make Texinfo files
	echo.  gettext    to make PO message catalogs
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  xml        to make Docutils-native XML files
	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	goto end
)

if "%1" == "clean" (
	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
	del /q /s %BUILDDIR%\*
	goto end
)


%SPHINXBUILD% 2> nul
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

if "%1" == "html" (
	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
	goto end
)

if "%1" == "dirhtml" (
	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
	goto end
)

if "%1" == "singlehtml" (
	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
	goto end
)

if "%1" == "pickle" (
	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the pickle files.
	goto end
)

if "%1" == "json" (
	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the JSON files.
	goto end
)

if "%1" == "htmlhelp" (
	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
	goto end
)

if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\mlbox.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\mlbox.qhc
	goto end
)

if "%1" == "devhelp" (
	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished.
129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/webinars/auto-ML.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/webinars/auto-ML.pdf -------------------------------------------------------------------------------- /docs/webinars/features.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/docs/webinars/features.pdf -------------------------------------------------------------------------------- /examples/classification/classification.py: -------------------------------------------------------------------------------- 1 | """A classification example using mlbox.""" 2 | from mlbox.preprocessing import Reader 3 | from mlbox.preprocessing import Drift_thresholder 4 | from mlbox.optimisation import Optimiser 5 | from mlbox.prediction import Predictor 6 | 7 | # Paths to the train set and the test set. 8 | paths = ["train_classification.csv", "test_classification.csv"] 9 | # Name of the feature to predict. 10 | # This columns should only be present in the train set. 11 | target_name = "Survived" 12 | 13 | # Reading and cleaning all files 14 | # Declare a reader for csv files 15 | rd = Reader(sep=',') 16 | # Return a dictionnary containing three entries 17 | # dict["train"] contains training samples withtout target columns 18 | # dict["test"] contains testing elements withtout target columns 19 | # dict["target"] contains target columns for training samples. 20 | data = rd.train_test_split(paths, target_name) 21 | 22 | dft = Drift_thresholder() 23 | data = dft.fit_transform(data) 24 | 25 | # Tuning 26 | # Declare an optimiser. Scoring possibilities for classification lie in : 27 | # {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"} 28 | opt = Optimiser(scoring='accuracy', n_folds=3) 29 | opt.evaluate(None, data) 30 | 31 | # Space of hyperparameters 32 | # The keys must respect the following syntax : "enc__param". 33 | # "enc" = "ne" for na encoder 34 | # "enc" = "ce" for categorical encoder 35 | # "enc" = "fs" for feature selector [OPTIONAL] 36 | # "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL] 37 | # "enc" = "est" for the final estimator 38 | # "param" : a correct associated parameter for each step. 39 | # Ex: "max_depth" for "enc"="est", ... 40 | # The values must respect the syntax: {"search":strategy,"space":list} 41 | # "strategy" = "choice" or "uniform". Default = "choice" 42 | # list : a list of values to be tested if strategy="choice". 43 | # Else, list = [value_min, value_max]. 44 | # Available strategies for ne_numerical_strategy are either an integer, a float 45 | # or in {'mean', 'median', "most_frequent"} 46 | # Available strategies for ce_strategy are: 47 | # {"label_encoding", "dummification", "random_projection", entity_embedding"} 48 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 49 | 'ce__strategy': {"search": "choice", 50 | "space": ["label_encoding", 51 | "random_projection", 52 | "entity_embedding"]}, 53 | 'fs__threshold': {"search": "uniform", 54 | "space": [0.01, 0.3]}, 55 | 'est__max_depth': {"search": "choice", 56 | "space": [3, 4, 5, 6, 7]} 57 | 58 | } 59 | 60 | # Optimises hyper-parameters of the whole Pipeline with a given scoring 61 | # function. 
Algorithm used to optimize : Tree Parzen Estimator. 62 | # 63 | # IMPORTANT : Try to avoid dependent parameters and to set one feature 64 | # selection strategy and one estimator strategy at a time. 65 | best = opt.optimise(space, data, 15) 66 | 67 | # Make prediction and save the results in save folder. 68 | prd = Predictor() 69 | prd.fit_predict(best, data) 70 | -------------------------------------------------------------------------------- /examples/regression/regression.py: -------------------------------------------------------------------------------- 1 | """A regression example using mlbox.""" 2 | import numpy as np 3 | 4 | from mlbox.preprocessing import Reader 5 | from mlbox.preprocessing import Drift_thresholder 6 | from mlbox.optimisation import make_scorer 7 | from mlbox.optimisation import Optimiser 8 | from mlbox.prediction import Predictor 9 | 10 | # Paths to the train set and the test set. 11 | paths = ["train_regression.csv", "test_regression.csv"] 12 | # Name of the feature to predict. 13 | # This columns should only be present in the train set. 14 | target_name = "SalePrice" 15 | 16 | # Reading and cleaning all files 17 | # Declare a reader for csv files 18 | rd = Reader(sep=',') 19 | # Return a dictionnary containing three entries 20 | # dict["train"] contains training samples withtout target columns 21 | # dict["test"] contains testing elements withtout target columns 22 | # dict["target"] contains target columns for training samples. 23 | data = rd.train_test_split(paths, target_name) 24 | 25 | dft = Drift_thresholder() 26 | data = dft.fit_transform(data) 27 | 28 | # Tuning 29 | mape = make_scorer(lambda y_true, 30 | y_pred: 100*np.sum( 31 | np.abs(y_true-y_pred)/y_true 32 | )/len(y_true), 33 | greater_is_better=False, 34 | needs_proba=False) 35 | # Declare an optimiser. You can declare your own score 36 | # as presented here or use one in 37 | # {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"} 38 | opt = Optimiser(scoring=mape, n_folds=3) 39 | opt.evaluate(None, data) 40 | 41 | # Space of hyperparameters 42 | # The keys must respect the following syntax : "enc__param". 43 | # "enc" = "ne" for na encoder 44 | # "enc" = "ce" for categorical encoder 45 | # "enc" = "fs" for feature selector [OPTIONAL] 46 | # "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL] 47 | # "enc" = "est" for the final estimator 48 | # "param" : a correct associated parameter for each step. 49 | # Ex: "max_depth" for "enc"="est", ... 50 | # The values must respect the syntax: {"search":strategy,"space":list} 51 | # "strategy" = "choice" or "uniform". Default = "choice" 52 | # list : a list of values to be tested if strategy="choice". 53 | # Else, list = [value_min, value_max]. 54 | # Available strategies for ne_numerical_strategy are either an integer, a float 55 | # or in {'mean', 'median', "most_frequent"} 56 | # Available strategies for ce_strategy are: 57 | # {"label_encoding", "dummification", "random_projection", entity_embedding"} 58 | space = { 59 | 'ne__numerical_strategy': {"search": "choice", 60 | "space": [0]}, 61 | 'ce__strategy': {"search": "choice", 62 | "space": ["label_encoding", 63 | "random_projection", 64 | "entity_embedding"]}, 65 | 'fs__threshold': {"search": "uniform", 66 | "space": [0.01, 0.3]}, 67 | 'est__max_depth': {"search": "choice", 68 | "space": [3, 4, 5, 6, 7]} 69 | 70 | } 71 | 72 | # Optimises hyper-parameters of the whole Pipeline with a given scoring 73 | # function. 
Algorithm used to optimize : Tree Parzen Estimator. 74 | # 75 | # IMPORTANT : Try to avoid dependent parameters and to set one feature 76 | # selection strategy and one estimator strategy at a time. 77 | best = opt.optimise(space, data, 15) 78 | 79 | # Make prediction and save the results in save folder. 80 | prd = Predictor() 81 | prd.fit_predict(best, data) 82 | -------------------------------------------------------------------------------- /mlbox/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Axel ARONIO DE ROMBLAY""" 4 | __email__ = 'axelderomblay@gmail.com' 5 | 6 | from .preprocessing import * 7 | from .encoding import * 8 | from .optimisation import * 9 | from .prediction import * 10 | from .model import * 11 | -------------------------------------------------------------------------------- /mlbox/encoding/__init__.py: -------------------------------------------------------------------------------- 1 | from .na_encoder import * 2 | from .categorical_encoder import * 3 | -------------------------------------------------------------------------------- /mlbox/encoding/na_encoder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import pandas as pd 6 | import warnings 7 | 8 | from sklearn.impute import SimpleImputer 9 | 10 | 11 | class NA_encoder(): 12 | """Encodes missing values for both numerical and categorical features. 13 | 14 | Several strategies are possible in each case. 15 | 16 | Parameters 17 | ---------- 18 | numerical_strategy : str or float or int. default = "mean" 19 | The strategy to encode NA for numerical features. 20 | Available strategies = "mean", "median", 21 | "most_frequent" or a float/int value 22 | 23 | categorical_strategy : str, default = '' 24 | The strategy to encode NA for categorical features. 25 | Available strategies = a string or "most_frequent" 26 | 27 | """ 28 | 29 | def __init__(self, 30 | numerical_strategy='mean', 31 | categorical_strategy=''): 32 | """Init a NA_encoder. 33 | 34 | User can choose numerical strategy and categorical strategy. 35 | 36 | Parameters 37 | ---------- 38 | numerical_strategy : str or float or int. default = "mean" 39 | The strategy to encode NA for numerical features. 40 | 41 | categorical_strategy : str, default = '' 42 | The strategy to encode NA for categorical features. 43 | 44 | """ 45 | self.numerical_strategy = numerical_strategy 46 | self.categorical_strategy = categorical_strategy 47 | self.__Lcat = [] 48 | self.__Lnum = [] 49 | self.__imp = None 50 | self.__mode = dict() 51 | self.__fitOK = False 52 | 53 | def get_params(self, deep=True): 54 | """Get parameters of a NA_encoder object.""" 55 | return {'numerical_strategy': self.numerical_strategy, 56 | 'categorical_strategy': self.categorical_strategy} 57 | 58 | def set_params(self, **params): 59 | """Set parameters for a NA_encoder object. 60 | 61 | Set numerical strategy and categorical strategy. 62 | 63 | Parameters 64 | ---------- 65 | numerical_strategy : str or float or int. default = "mean" 66 | The strategy to encode NA for numerical features. 67 | 68 | categorical_strategy : str, default = '' 69 | The strategy to encode NA for categorical features. 70 | 71 | """ 72 | self.__fitOK = False 73 | 74 | for k, v in params.items(): 75 | if k not in self.get_params(): 76 | warnings.warn("Invalid parameter(s) for encoder NA_encoder. 
" 77 | "Parameter(s) IGNORED. " 78 | "Check the list of available parameters with " 79 | "`encoder.get_params().keys()`") 80 | else: 81 | setattr(self, k, v) 82 | 83 | def fit(self, df_train, y_train=None): 84 | """Fits NA Encoder. 85 | 86 | Parameters 87 | ---------- 88 | df_train : pandas dataframe of shape = (n_train, n_features) 89 | The train dataset with numerical and categorical features. 90 | 91 | y_train : pandas series of shape = (n_train, ), default = None 92 | The target for classification or regression tasks. 93 | 94 | Returns 95 | ------- 96 | object 97 | self 98 | 99 | """ 100 | self.__Lcat = df_train.dtypes[df_train.dtypes == 'object'].index 101 | self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index 102 | 103 | # Dealing with numerical features 104 | 105 | if (self.numerical_strategy in ['mean', 'median', "most_frequent"]): 106 | 107 | self.__imp = SimpleImputer(strategy=self.numerical_strategy) 108 | 109 | if (len(self.__Lnum) != 0): 110 | self.__imp.fit(df_train[self.__Lnum]) 111 | else: 112 | pass 113 | 114 | elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)): 115 | 116 | pass 117 | 118 | else: 119 | 120 | raise ValueError("Numerical strategy for NA encoding is not valid") 121 | 122 | # Dealing with categorical features 123 | 124 | if (type(self.categorical_strategy) == str): 125 | 126 | if (self.categorical_strategy == "most_frequent"): 127 | 128 | na_count = df_train[self.__Lcat].isnull().sum() 129 | 130 | for col in na_count[na_count>0].index: 131 | 132 | try: 133 | self.__mode[col] = df_train[col].mode()[0] 134 | except: 135 | self.__mode[col] = "" 136 | 137 | else: 138 | pass 139 | 140 | else: 141 | raise ValueError("Categorical strategy for NA encoding is not valid") 142 | 143 | self.__fitOK = True 144 | 145 | return self 146 | 147 | def fit_transform(self, df_train, y_train=None): 148 | """Fits NA Encoder and transforms the dataset. 149 | 150 | Parameters 151 | ---------- 152 | df_train : pandas.Dataframe of shape = (n_train, n_features) 153 | The train dataset with numerical and categorical features. 154 | 155 | y_train : pandas.Series of shape = (n_train, ), default = None 156 | The target for classification or regression tasks. 157 | 158 | Returns 159 | ------- 160 | pandas.Dataframe of shape = (n_train, n_features) 161 | The train dataset with no missing values. 162 | 163 | """ 164 | self.fit(df_train, y_train) 165 | 166 | return self.transform(df_train) 167 | 168 | def transform(self, df): 169 | """Transform the dataset. 170 | 171 | Parameters 172 | ---------- 173 | df : pandas.Dataframe of shape = (n, n_features) 174 | The dataset with numerical and categorical features. 175 | 176 | Returns 177 | ------- 178 | pandas.Dataframe of shape = (n, n_features) 179 | The dataset with no missing values. 
180 | 181 | """ 182 | if(self.__fitOK): 183 | 184 | if(len(self.__Lnum) == 0): 185 | 186 | if (self.categorical_strategy != "most_frequent"): 187 | return df[self.__Lcat].fillna(self.categorical_strategy) 188 | 189 | else: 190 | return df[self.__Lcat].fillna(self.__mode) 191 | 192 | else: 193 | 194 | if (self.numerical_strategy in ['mean', 195 | 'median', 196 | "most_frequent"]): 197 | 198 | if (len(self.__Lcat) != 0): 199 | 200 | if (self.categorical_strategy != "most_frequent"): 201 | 202 | return pd.concat( 203 | (pd.DataFrame(self.__imp.transform(df[self.__Lnum]), 204 | columns=self.__Lnum, 205 | index=df.index), 206 | df[self.__Lcat].fillna(self.categorical_strategy) 207 | ), 208 | axis=1)[df.columns] 209 | 210 | else: 211 | 212 | return pd.concat( 213 | (pd.DataFrame(self.__imp.transform(df[self.__Lnum]), 214 | columns=self.__Lnum, 215 | index=df.index), 216 | df[self.__Lcat].fillna(self.__mode) 217 | ), 218 | axis=1)[df.columns] 219 | 220 | else: 221 | 222 | return pd.DataFrame( 223 | self.__imp.transform(df[self.__Lnum]), 224 | columns=self.__Lnum, 225 | index=df.index 226 | ) 227 | 228 | elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)): 229 | 230 | if (len(self.__Lcat) != 0): 231 | 232 | if (self.categorical_strategy != "most_frequent"): 233 | 234 | return pd.concat( 235 | (df[self.__Lnum].fillna(self.numerical_strategy), 236 | df[self.__Lcat].fillna(self.categorical_strategy) 237 | ), 238 | axis=1)[df.columns] 239 | 240 | else: 241 | 242 | return pd.concat( 243 | (df[self.__Lnum].fillna(self.numerical_strategy), 244 | df[self.__Lcat].fillna(self.__mode) 245 | ), 246 | axis=1)[df.columns] 247 | else: 248 | 249 | return df[self.__Lnum].fillna(self.numerical_strategy) 250 | 251 | else: 252 | 253 | raise ValueError("Call fit or fit_transform function before") 254 | -------------------------------------------------------------------------------- /mlbox/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . import classification 2 | from . import regression 3 | 4 | __all__ = ['classification', 'regression'] 5 | -------------------------------------------------------------------------------- /mlbox/model/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature_selector import Clf_feature_selector 2 | from .classifier import Classifier 3 | from .stacking_classifier import StackingClassifier 4 | 5 | __all__ = ['Clf_feature_selector', 'Classifier', 'StackingClassifier'] 6 | -------------------------------------------------------------------------------- /mlbox/model/classification/feature_selector.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.ensemble import RandomForestClassifier 9 | import warnings 10 | 11 | 12 | class Clf_feature_selector(): 13 | 14 | """Selects useful features. 15 | 16 | Several strategies are possible (filter and wrapper methods). 17 | Works for classification problems only (multiclass or binary). 18 | 19 | Parameters 20 | ---------- 21 | strategy : str, defaut = "l1" 22 | The strategy to select features. 
23 | Available strategies = {"variance", "l1", "rf_feature_importance"} 24 | 25 | threshold : float, defaut = 0.3 26 | The percentage of variable to discard according to the strategy. 27 | Must be between 0. and 1. 28 | """ 29 | 30 | def __init__(self, strategy='l1', threshold=0.3): 31 | 32 | # 'variance','l1, 'rf_feature_importance' 33 | self.strategy = strategy 34 | # a float between 0. and 1. defaut : 0.3 ie we drop 0.3 of features 35 | self.threshold = threshold 36 | self.__fitOK = False 37 | self.__to_discard = [] 38 | 39 | 40 | def get_params(self, deep=True): 41 | 42 | return {'strategy': self.strategy, 43 | 'threshold': self.threshold} 44 | 45 | 46 | def set_params(self, **params): 47 | 48 | self.__fitOK = False 49 | 50 | for k, v in params.items(): 51 | if k not in self.get_params(): 52 | warnings.warn("Invalid parameter a for feature selector" 53 | "Clf_feature_selector. Parameter IGNORED. Check" 54 | "the list of available parameters with" 55 | "`feature_selector.get_params().keys()`") 56 | else: 57 | setattr(self, k, v) 58 | 59 | 60 | def fit(self, df_train, y_train): 61 | 62 | """Fits Clf_feature_selector 63 | 64 | Parameters 65 | ---------- 66 | df_train : pandas dataframe of shape = (n_train, n_features) 67 | The train dataset with numerical features and no NA 68 | 69 | y_train : pandas series of shape = (n_train, ) 70 | The target for classification task. Must be encoded. 71 | 72 | Returns 73 | ------- 74 | object 75 | self 76 | """ 77 | 78 | # sanity checks 79 | if((type(df_train) != pd.SparseDataFrame) and 80 | (type(df_train) != pd.DataFrame)): 81 | raise ValueError("df_train must be a DataFrame") 82 | 83 | if (type(y_train) != pd.core.series.Series): 84 | raise ValueError("y_train must be a Series") 85 | 86 | if(self.strategy == 'variance'): 87 | coef = df_train.std() 88 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 89 | self.__to_discard = coef[coef < abstract_threshold].index 90 | self.__fitOK = True 91 | 92 | elif(self.strategy == 'l1'): 93 | model = LogisticRegression(C=0.01, penalty='l1', solver="saga", 94 | n_jobs=-1, random_state=0) # to be tuned 95 | model.fit(df_train, y_train) 96 | coef = np.mean(np.abs(model.coef_), axis=0) 97 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 98 | self.__to_discard = df_train.columns[coef < abstract_threshold] 99 | self.__fitOK = True 100 | 101 | elif(self.strategy == 'rf_feature_importance'): 102 | model = RandomForestClassifier(n_estimators=50, n_jobs=-1, 103 | random_state=0) # to be tuned 104 | model.fit(df_train, y_train) 105 | coef = model.feature_importances_ 106 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 107 | self.__to_discard = df_train.columns[coef < abstract_threshold] 108 | self.__fitOK = True 109 | 110 | else: 111 | raise ValueError("Strategy invalid. 
Please choose between " 112 | "'variance', 'l1' or 'rf_feature_importance'") 113 | 114 | return self 115 | 116 | 117 | def transform(self, df): 118 | 119 | """Transforms the dataset 120 | 121 | Parameters 122 | ---------- 123 | df : pandas dataframe of shape = (n, n_features) 124 | The dataset with numerical features and no NA 125 | 126 | Returns 127 | ------- 128 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 129 | The train dataset with relevant features 130 | """ 131 | 132 | if(self.__fitOK): 133 | 134 | # sanity checks 135 | if((type(df) != pd.SparseDataFrame) and 136 | (type(df) != pd.DataFrame)): 137 | raise ValueError("df must be a DataFrame") 138 | 139 | return df.drop(self.__to_discard, axis=1) 140 | else: 141 | raise ValueError("call fit or fit_transform function before") 142 | 143 | 144 | def fit_transform(self, df_train, y_train): 145 | 146 | """Fits Clf_feature_selector and transforms the dataset 147 | 148 | Parameters 149 | ---------- 150 | df_train : pandas dataframe of shape = (n_train, n_features) 151 | The train dataset with numerical features and no NA 152 | 153 | y_train : pandas series of shape = (n_train, ). 154 | The target for classification task. Must be encoded. 155 | 156 | Returns 157 | ------- 158 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 159 | The train dataset with relevant features 160 | """ 161 | 162 | self.fit(df_train, y_train) 163 | 164 | return self.transform(df_train) 165 | -------------------------------------------------------------------------------- /mlbox/model/regression/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature_selector import Reg_feature_selector 2 | from .regressor import Regressor 3 | from .stacking_regressor import StackingRegressor 4 | 5 | __all__ = ['Reg_feature_selector', 'Regressor', 'StackingRegressor'] 6 | -------------------------------------------------------------------------------- /mlbox/model/regression/feature_selector.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.linear_model import Lasso 8 | from sklearn.ensemble import RandomForestRegressor 9 | import warnings 10 | 11 | 12 | class Reg_feature_selector(): 13 | 14 | """Selects useful features. 15 | 16 | Several strategies are possible (filter and wrapper methods). 17 | Works for regression problems only. 18 | 19 | Parameters 20 | ---------- 21 | strategy : str, defaut = "l1" 22 | The strategy to select features. 23 | Available strategies = {"variance", "l1", "rf_feature_importance"} 24 | 25 | threshold : float, defaut = 0.3 26 | The percentage of variable to discard according the strategy. 27 | Must be between 0. and 1. 28 | """ 29 | 30 | def __init__(self, strategy='l1', threshold=0.3): 31 | self.strategy = strategy 32 | self.threshold = threshold 33 | self.__fitOK = False 34 | self.__to_discard = [] 35 | 36 | 37 | def get_params(self, deep=True): 38 | return {'strategy': self.strategy, 39 | 'threshold': self.threshold} 40 | 41 | 42 | def set_params(self, **params): 43 | self.__fitOK = False 44 | 45 | for k, v in params.items(): 46 | if k not in self.get_params(): 47 | warnings.warn("Invalid parameter a for feature selector" 48 | "Reg_feature_selector. Parameter IGNORED. 
Check " 49 | "the list of available parameters with " 50 | "`feature_selector.get_params().keys()`") 51 | else: 52 | setattr(self, k, v) 53 | 54 | 55 | def fit(self, df_train, y_train): 56 | 57 | """Fits Reg_feature_selector. 58 | 59 | Parameters 60 | ---------- 61 | df_train : pandas dataframe of shape = (n_train, n_features) 62 | The train dataset with numerical features and no NA 63 | 64 | y_train : pandas series of shape = (n_train, ). 65 | The target for regression task. 66 | 67 | Returns 68 | ------- 69 | sobject 70 | self 71 | """ 72 | 73 | # sanity checks 74 | if((type(df_train) != pd.SparseDataFrame) and 75 | (type(df_train) != pd.DataFrame)): 76 | raise ValueError("df_train must be a DataFrame") 77 | 78 | if (type(y_train) != pd.core.series.Series): 79 | raise ValueError("y_train must be a Series") 80 | 81 | if(self.strategy == 'variance'): 82 | coef = df_train.std() 83 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 84 | self.__to_discard = coef[coef < abstract_threshold].index 85 | self.__fitOK = True 86 | 87 | elif(self.strategy == 'l1'): 88 | model = Lasso(alpha=100.0, random_state=0) # to be tuned 89 | model.fit(df_train, y_train) 90 | coef = np.abs(model.coef_) 91 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 92 | self.__to_discard = df_train.columns[coef < abstract_threshold] 93 | self.__fitOK = True 94 | 95 | elif(self.strategy == 'rf_feature_importance'): 96 | model = RandomForestRegressor(n_estimators=50, 97 | n_jobs=-1, 98 | random_state=0) # to be tuned 99 | model.fit(df_train, y_train) 100 | coef = model.feature_importances_ 101 | abstract_threshold = np.percentile(coef, 100. * self.threshold) 102 | self.__to_discard = df_train.columns[coef < abstract_threshold] 103 | self.__fitOK = True 104 | 105 | else: 106 | raise ValueError("Strategy invalid. Please choose between " 107 | "'variance', 'l1' or 'rf_feature_importance'") 108 | 109 | return self 110 | 111 | 112 | def transform(self, df): 113 | 114 | """Transforms the dataset 115 | 116 | Parameters 117 | ---------- 118 | df : pandas dataframe of shape = (n, n_features) 119 | The dataset with numerical features and no NA 120 | 121 | Returns 122 | ------- 123 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 124 | The train dataset with relevant features 125 | """ 126 | 127 | if(self.__fitOK): 128 | 129 | # sanity checks 130 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): 131 | raise ValueError("df must be a DataFrame") 132 | 133 | return df.drop(self.__to_discard, axis=1) 134 | else: 135 | raise ValueError("call fit or fit_transform function before") 136 | 137 | 138 | def fit_transform(self, df_train, y_train): 139 | 140 | """Fits Reg_feature_selector and transforms the dataset 141 | 142 | Parameters 143 | ---------- 144 | df_train : pandas dataframe of shape = (n_train, n_features) 145 | The train dataset with numerical features and no NA 146 | 147 | y_train : pandas series of shape = (n_train, ). 148 | The target for regression task. 
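(For instance, a numerical target such as the ``SalePrice`` column used
in the regression example shipped with MLBox.)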
149 | 150 | Returns 151 | ------- 152 | pandas dataframe of shape = (n_train, n_features*(1-threshold)) 153 | The train dataset with relevant features 154 | """ 155 | 156 | self.fit(df_train, y_train) 157 | 158 | return self.transform(df_train) 159 | -------------------------------------------------------------------------------- /mlbox/model/regression/regressor.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # License: BSD 3 clause 5 | 6 | import warnings 7 | from copy import copy 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, 12 | ExtraTreesRegressor, RandomForestRegressor) 13 | from sklearn.linear_model import Ridge 14 | from sklearn.tree import DecisionTreeRegressor 15 | from lightgbm import LGBMRegressor 16 | 17 | 18 | class Regressor(): 19 | """Wrap scikitlearn regressors. 20 | 21 | Parameters 22 | ---------- 23 | strategy : str, default = "LightGBM" 24 | The choice for the regressor. 25 | Available strategies = {"LightGBM", "RandomForest", "ExtraTrees", 26 | "Tree", "Bagging", "AdaBoost" or "Linear"} 27 | 28 | **params : default = None 29 | Parameters of the corresponding regressor. 30 | Examples : n_estimators, max_depth... 31 | 32 | """ 33 | 34 | def __init__(self, **params): 35 | """Init Regressor object where user can pass a strategy.""" 36 | if ("strategy" in params): 37 | self.__strategy = params["strategy"] 38 | else: 39 | self.__strategy = "LightGBM" 40 | 41 | self.__regress_params = {} 42 | 43 | self.__regressor = None 44 | self.__set_regressor(self.__strategy) 45 | self.__col = None 46 | 47 | self.set_params(**params) 48 | self.__fitOK = False 49 | 50 | def get_params(self, deep=True): 51 | """Get parameters of Regressor object.""" 52 | params = {} 53 | params["strategy"] = self.__strategy 54 | params.update(self.__regress_params) 55 | 56 | return params 57 | 58 | def set_params(self, **params): 59 | """Set parameters of Regressor object.""" 60 | self.__fitOK = False 61 | 62 | if 'strategy' in params.keys(): 63 | self.__set_regressor(params['strategy']) 64 | 65 | for k, v in self.__regress_params.items(): 66 | if k not in self.get_params().keys(): 67 | warnings.warn("Invalid parameter for regressor " 68 | + str(self.__strategy) 69 | + ". Parameter IGNORED. Check the list of " 70 | "available parameters with " 71 | "`regressor.get_params().keys()`") 72 | else: 73 | setattr(self.__regressor, k, v) 74 | 75 | for k, v in params.items(): 76 | if(k == "strategy"): 77 | pass 78 | else: 79 | if k not in self.__regressor.get_params().keys(): 80 | warnings.warn("Invalid parameter for regressor " 81 | + str(self.__strategy) 82 | + ". Parameter IGNORED. 
Check the list of " 83 | "available parameters with " 84 | "`regressor.get_params().keys()`") 85 | else: 86 | setattr(self.__regressor, k, v) 87 | self.__regress_params[k] = v 88 | 89 | def __set_regressor(self, strategy): 90 | """Set strategy of a regressor object.""" 91 | self.__strategy = strategy 92 | 93 | if(strategy == 'RandomForest'): 94 | self.__regressor = RandomForestRegressor( 95 | n_estimators=400, max_depth=10, max_features='sqrt', 96 | bootstrap=True, n_jobs=-1, random_state=0) 97 | 98 | elif(strategy == "LightGBM"): 99 | self.__regressor = LGBMRegressor( 100 | n_estimators=500, learning_rate=0.05, 101 | colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0) 102 | 103 | elif(strategy == 'ExtraTrees'): 104 | self.__regressor = ExtraTreesRegressor( 105 | n_estimators=400, max_depth=10, max_features='sqrt', 106 | bootstrap=True, n_jobs=-1, random_state=0) 107 | 108 | elif(strategy == 'Tree'): 109 | self.__regressor = DecisionTreeRegressor( 110 | criterion='mse', splitter='best', max_depth=None, 111 | min_samples_split=2, min_samples_leaf=1, 112 | min_weight_fraction_leaf=0.0, max_features=None, 113 | random_state=0, max_leaf_nodes=None, presort=False) 114 | 115 | elif(strategy == "Bagging"): 116 | self.__regressor = BaggingRegressor( 117 | base_estimator=None, n_estimators=500, max_samples=.9, 118 | max_features=.85, bootstrap=False, bootstrap_features=False, 119 | n_jobs=-1, random_state=0) 120 | 121 | elif(strategy == "AdaBoost"): 122 | self.__regressor = AdaBoostRegressor( 123 | base_estimator=None, n_estimators=400, learning_rate=.05, 124 | random_state=0) 125 | 126 | elif(strategy == "Linear"): 127 | self.__regressor = Ridge( 128 | alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, 129 | max_iter=None, tol=0.001, solver='auto', random_state=0) 130 | 131 | else: 132 | raise ValueError( 133 | "Strategy invalid. Please choose between 'LightGBM'" 134 | ", 'RandomForest', 'ExtraTrees', " 135 | "'Tree', 'Bagging', 'AdaBoost' or 'Linear'") 136 | 137 | def fit(self, df_train, y_train): 138 | """Fits Regressor. 139 | 140 | Parameters 141 | ---------- 142 | df_train : pandas dataframe of shape = (n_train, n_features) 143 | The train dataset with numerical features. 144 | 145 | y_train : pandas series of shape = (n_train, ) 146 | The target for regression tasks. 147 | 148 | Returns 149 | ------- 150 | object 151 | self 152 | 153 | """ 154 | # sanity checks 155 | if((type(df_train) != pd.SparseDataFrame) and 156 | (type(df_train) != pd.DataFrame)): 157 | raise ValueError("df_train must be a DataFrame") 158 | 159 | if (type(y_train) != pd.core.series.Series): 160 | raise ValueError("y_train must be a Series") 161 | 162 | self.__regressor.fit(df_train.values, y_train) 163 | self.__col = df_train.columns 164 | self.__fitOK = True 165 | 166 | return self 167 | 168 | def feature_importances(self): 169 | """Computes feature importances. 170 | 171 | Regressor must be fitted before. 172 | 173 | Returns 174 | ------- 175 | dict 176 | Dictionnary containing a measure of feature importance (value) 177 | for each feature (key). 
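Examples
--------
An illustrative sketch on synthetic data, assuming the dependency
versions pinned by MLBox (the feature names are made up for the
example):

>>> import pandas as pd
>>> from mlbox.model.regression import Regressor
>>> X = pd.DataFrame({"x1": [0., 1., 2., 3.], "x2": [3., 2., 1., 0.]})
>>> y = pd.Series([0., 1., 2., 3.])
>>> reg = Regressor(strategy="Linear")
>>> _ = reg.fit(X, y)
>>> sorted(reg.feature_importances().keys())
['x1', 'x2']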
178 | 179 | """ 180 | if self.__fitOK: 181 | 182 | if (self.get_params()["strategy"] in ["Linear"]): 183 | 184 | importance = {} 185 | f = np.abs(self.get_estimator().coef_) 186 | 187 | for i, col in enumerate(self.__col): 188 | importance[col] = f[i] 189 | 190 | elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest", 191 | "ExtraTrees", "Tree"]): 192 | 193 | importance = {} 194 | f = self.get_estimator().feature_importances_ 195 | 196 | for i, col in enumerate(self.__col): 197 | importance[col] = f[i] 198 | 199 | elif (self.get_params()["strategy"] in ["AdaBoost"]): 200 | 201 | importance = {} 202 | norm = self.get_estimator().estimator_weights_.sum() 203 | 204 | try: 205 | # LGB, RF, ET, Tree and AdaBoost 206 | # TODO: Refactor this part 207 | f = sum(weight * est.feature_importances_ for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa 208 | 209 | except Exception: 210 | f = sum(weight * np.abs(est.coef_) for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa 211 | 212 | for i, col in enumerate(self.__col): 213 | importance[col] = f[i] 214 | 215 | elif (self.get_params()["strategy"] in ["Bagging"]): 216 | 217 | importance = {} 218 | importance_bag = [] 219 | 220 | for i, b in enumerate(self.get_estimator().estimators_): 221 | 222 | d = {} 223 | 224 | try: 225 | # LGB, RF, ET, Tree and AdaBoost 226 | f = b.feature_importances_ 227 | except Exception: 228 | f = np.abs(b.coef_) # Linear 229 | 230 | estimator = self.get_estimator() 231 | items = enumerate(estimator.estimators_features_[i]) 232 | for j, c in items: 233 | d[self.__col[c]] = f[j] 234 | 235 | importance_bag.append(d.copy()) 236 | 237 | for i, col in enumerate(self.__col): 238 | list_filtered = filter(lambda x: x != 0, 239 | [k[col] if col in k else 0 240 | for k in importance_bag]) 241 | importance[col] = np.mean(list(list_filtered)) 242 | 243 | else: 244 | 245 | importance = {} 246 | 247 | return importance 248 | 249 | else: 250 | 251 | raise ValueError("You must call the fit function before !") 252 | 253 | def predict(self, df): 254 | """Predicts the target. 255 | 256 | Parameters 257 | ---------- 258 | df : pandas dataframe of shape = (n, n_features) 259 | The dataset with numerical features. 260 | 261 | Returns 262 | ------- 263 | array of shape = (n, ) 264 | The target to be predicted. 265 | 266 | """ 267 | try: 268 | if not callable(getattr(self.__regressor, "predict")): 269 | raise ValueError("predict attribute is not callable") 270 | except Exception as e: 271 | raise e 272 | 273 | if self.__fitOK: 274 | 275 | # sanity checks 276 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): 277 | raise ValueError("df must be a DataFrame") 278 | 279 | return self.__regressor.predict(df.values) 280 | 281 | else: 282 | raise ValueError("You must call the fit function before !") 283 | 284 | def transform(self, df): 285 | """Transform dataframe df. 286 | 287 | Parameters 288 | ---------- 289 | df : pandas dataframe of shape = (n, n_features) 290 | The dataset with numerical features. 291 | 292 | Returns 293 | ------- 294 | pandas dataframe of shape = (n, n_selected_features) 295 | The transformed dataset with its most important features. 
296 | 297 | """ 298 | try: 299 | if not callable(getattr(self.__regressor, "transform")): 300 | raise ValueError("transform attribute is not callable") 301 | except Exception as e: 302 | raise e 303 | 304 | if self.__fitOK: 305 | 306 | # sanity checks 307 | if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): 308 | raise ValueError("df must be a DataFrame") 309 | 310 | return self.__regressor.transform(df.values) 311 | else: 312 | raise ValueError("You must call the fit function before !") 313 | 314 | def score(self, df, y, sample_weight=None): 315 | """Return R^2 coefficient of determination of the prediction. 316 | 317 | Parameters 318 | ---------- 319 | df : pandas dataframe of shape = (n, n_features) 320 | The dataset with numerical features. 321 | 322 | y : pandas series of shape = (n,) 323 | The numerical encoded target for classification tasks. 324 | 325 | Returns 326 | ------- 327 | float 328 | R^2 of self.predict(df) wrt. y. 329 | 330 | """ 331 | try: 332 | if not callable(getattr(self.__regressor, "score")): 333 | raise ValueError("score attribute is not callable") 334 | except Exception as e: 335 | raise e 336 | 337 | if self.__fitOK: 338 | 339 | # sanity checks 340 | if((type(df) != pd.SparseDataFrame) and 341 | (type(df) != pd.DataFrame)): 342 | raise ValueError("df must be a DataFrame") 343 | 344 | if (type(y) != pd.core.series.Series): 345 | raise ValueError("y must be a Series") 346 | 347 | return self.__regressor.score(df.values, y, sample_weight) 348 | else: 349 | raise ValueError("You must call the fit function before !") 350 | 351 | def get_estimator(self): 352 | """Return classfier.""" 353 | return copy(self.__regressor) 354 | -------------------------------------------------------------------------------- /mlbox/model/regression/stacking_regressor.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.linear_model import LinearRegression 9 | from sklearn.model_selection import KFold, cross_val_predict 10 | from copy import copy as make_copy 11 | from .regressor import Regressor 12 | import warnings 13 | 14 | 15 | class StackingRegressor(): 16 | """A Stacking regressor. 17 | 18 | A stacking regressor is a regressor that uses the predictions of 19 | several first layer estimators (generated with a cross validation method) 20 | for a second layer estimator. 21 | 22 | 23 | Parameters 24 | ---------- 25 | base_estimators : list, default = [Regressor(strategy="LightGBM"), 26 | Regressor(strategy="RandomForest"), 27 | Regressor(strategy="ExtraTrees")] 28 | List of estimators to fit in the first level using a cross validation. 29 | 30 | level_estimator : object, default = LinearRegression() 31 | The estimator used in second and last level 32 | 33 | n_folds : int, default = 5 34 | Number of folds used to generate the meta features for the training set 35 | 36 | copy : bool, default = False 37 | If true, meta features are added to the original dataset 38 | 39 | random_state : None, int or RandomState. default = 1 40 | Pseudo-random number generator state used for shuffling. 41 | If None, use default numpy RNG for shuffling. 42 | 43 | verbose : bool, default = True 44 | Verbose mode. 
45 | 46 | """ 47 | 48 | def __init__(self, base_estimators=[Regressor(strategy="LightGBM"), 49 | Regressor(strategy="RandomForest"), 50 | Regressor(strategy="ExtraTrees")], 51 | level_estimator=LinearRegression(), n_folds=5, 52 | copy=False, random_state=1, verbose=True): 53 | """Init method for StackingRegressor.""" 54 | self.base_estimators = base_estimators 55 | if(type(base_estimators) != list): 56 | raise ValueError("base_estimators must be a list") 57 | else: 58 | for i, est in enumerate(self.base_estimators): 59 | self.base_estimators[i] = make_copy(est) 60 | 61 | self.level_estimator = level_estimator 62 | 63 | self.n_folds = n_folds 64 | if(type(n_folds) != int): 65 | raise ValueError("n_folds must be an integer") 66 | 67 | self.copy = copy 68 | if(type(copy) != bool): 69 | raise ValueError("copy must be a boolean") 70 | 71 | self.random_state = random_state 72 | if((type(self.random_state) != int) 73 | and (self.random_state is not None)): 74 | raise ValueError("random_state must be either None or an integer") 75 | 76 | self.verbose = verbose 77 | if(type(self.verbose) != bool): 78 | raise ValueError("verbose must be a boolean") 79 | 80 | self.__fitOK = False 81 | self.__fittransformOK = False 82 | 83 | def get_params(self, deep=True): 84 | """Get parameters of a StackingRegressor object.""" 85 | return {'level_estimator': self.level_estimator, 86 | 'base_estimators': self.base_estimators, 87 | 'n_folds': self.n_folds, 88 | 'copy': self.copy, 89 | 'random_state': self.random_state, 90 | 'verbose': self.verbose} 91 | 92 | def set_params(self, **params): 93 | """Set parameters of a StackingRegressor object.""" 94 | self.__fitOK = False 95 | self.__fittransformOK = False 96 | 97 | for k, v in params.items(): 98 | if k not in self.get_params(): 99 | warnings.warn("Invalid parameter a for stacking_regressor " 100 | "StackingRegressor. Parameter IGNORED. Check the" 101 | " list of available parameters with " 102 | "`stacking_regressor.get_params().keys()`") 103 | else: 104 | setattr(self, k, v) 105 | 106 | def fit_transform(self, df_train, y_train): 107 | """Create meta-features for the training dataset. 108 | 109 | Parameters 110 | ---------- 111 | df_train : pandas DataFrame of shape = (n_samples, n_features) 112 | The training dataset. 113 | 114 | y_train : pandas series of shape = (n_samples, ) 115 | The target 116 | 117 | Returns 118 | ------- 119 | pandas DataFrame of shape = (n_samples, 120 | n_features*int(copy)+n_metafeatures) 121 | The transformed training dataset. 
122 | 123 | """ 124 | # sanity checks 125 | if((type(df_train) != pd.SparseDataFrame) & (type(df_train) != pd.DataFrame)): 126 | raise ValueError("df_train must be a DataFrame") 127 | 128 | if(type(y_train) != pd.core.series.Series): 129 | raise ValueError("y_train must be a Series") 130 | 131 | cv = KFold(n_splits=self.n_folds, shuffle=True, 132 | random_state=self.random_state) 133 | 134 | preds = pd.DataFrame([], index=y_train.index) 135 | 136 | if(self.verbose): 137 | print("") 138 | print("[==========================================================" 139 | "===================] LAYER [===============================" 140 | "====================================================]") 141 | print("") 142 | 143 | for c, reg in enumerate(self.base_estimators): 144 | 145 | if(self.verbose): 146 | print("> fitting estimator n°" + str(c + 1) + 147 | " : " + str(reg.get_params()) + " ...") 148 | print("") 149 | 150 | # for each base estimator, we create the meta feature on train set 151 | y_pred = cross_val_predict(estimator=reg, X=df_train, y=y_train, cv=cv) 152 | preds["est" + str(c + 1)] = y_pred 153 | 154 | # and we refit the base estimator on entire train set 155 | reg.fit(df_train, y_train) 156 | 157 | layer = 1 158 | columns = ["layer" + str(layer) + "_" + s for s in preds.columns] 159 | while(len(np.intersect1d(df_train.columns, columns)) > 0): 160 | layer = layer + 1 161 | columns = ["layer" + str(layer) + "_" + s for s in preds.columns] 162 | preds.columns = ["layer" + str(layer) + "_" + s for s in preds.columns] 163 | 164 | self.__fittransformOK = True 165 | 166 | if(self.copy): 167 | # we keep also the initial features 168 | return pd.concat([df_train, preds], axis=1) 169 | 170 | else: 171 | return preds # we keep only the meta features 172 | 173 | def transform(self, df_test): 174 | """Create meta-features for the test dataset. 175 | 176 | Parameters 177 | ---------- 178 | df_test : pandas DataFrame of shape = (n_samples_test, n_features) 179 | The test dataset. 180 | 181 | Returns 182 | ------- 183 | pandas DataFrame of shape = (n_samples_test, 184 | n_features*int(copy)+n_metafeatures) 185 | The transformed test dataset. 186 | 187 | """ 188 | # sanity checks 189 | if((type(df_test) != pd.SparseDataFrame) and 190 | (type(df_test) != pd.DataFrame)): 191 | raise ValueError("df_test must be a DataFrame") 192 | 193 | if(self.__fittransformOK): 194 | 195 | preds_test = pd.DataFrame([], index=df_test.index) 196 | 197 | for c, reg in enumerate(self.base_estimators): 198 | 199 | # we predict the meta feature on test set 200 | y_pred_test = reg.predict(df_test) 201 | preds_test["est" + str(c + 1)] = y_pred_test 202 | 203 | layer = 1 204 | columns = ["layer" + str(layer) + "_" + s 205 | for s in preds_test.columns] 206 | 207 | while(len(np.intersect1d(df_test.columns, columns)) > 0): 208 | layer = layer + 1 209 | columns = ["layer" + str(layer) + "_" + s 210 | for s in preds_test.columns] 211 | 212 | preds_test.columns = [ 213 | "layer" + str(layer) + "_" + s for s in preds_test.columns] 214 | 215 | if(self.copy): 216 | # we keep also the initial features 217 | return pd.concat([df_test, preds_test], axis=1) 218 | else: 219 | return preds_test # we keep only the meta features 220 | 221 | else: 222 | raise ValueError("Call fit_transform before !") 223 | 224 | def fit(self, df_train, y_train): 225 | """Fit the first level estimators and the second level estimator on X. 
226 | 227 | Parameters 228 | ---------- 229 | df_train : pandas DataFrame of shape (n_samples, n_features) 230 | Input data 231 | 232 | y_train : pandas series of shape = (n_samples, ) 233 | The target 234 | 235 | Returns 236 | ------- 237 | object 238 | self 239 | 240 | """ 241 | # Fit the base estimators 242 | df_train = self.fit_transform(df_train, y_train) 243 | 244 | if(self.verbose): 245 | print("") 246 | print("[==========================================================" 247 | "===============] PREDICTION LAYER [========================" 248 | "====================================================]") 249 | print("") 250 | print("> fitting estimator : " + 251 | str(self.level_estimator.get_params()) + " ...") 252 | print("") 253 | 254 | # we fit the second level estimator 255 | self.level_estimator.fit(df_train.values, y_train.values) 256 | 257 | self.__fitOK = True 258 | 259 | return self 260 | 261 | 262 | def predict(self, df_test): 263 | """Predict regression target for X_test using the meta-features. 264 | 265 | Parameters 266 | ---------- 267 | df_test : pandas DataFrame of shape = (n_samples_test, n_features) 268 | The testing samples 269 | 270 | Returns 271 | ------- 272 | array of shape = (n_samples_test, ) 273 | The predicted values. 274 | 275 | """ 276 | if(self.__fitOK): 277 | # we predict the meta features on test set 278 | df_test = self.transform(df_test) 279 | 280 | # we predict the target using the meta features 281 | return self.level_estimator.predict(df_test) 282 | 283 | else: 284 | raise ValueError("Call fit before !") 285 | -------------------------------------------------------------------------------- /mlbox/optimisation/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimiser import * 2 | -------------------------------------------------------------------------------- /mlbox/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | from .predictor import * 2 | 3 | -------------------------------------------------------------------------------- /mlbox/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .drift_thresholder import * 2 | from .reader import * 3 | 4 | -------------------------------------------------------------------------------- /mlbox/preprocessing/drift/__init__.py: -------------------------------------------------------------------------------- 1 | from .drift_estimator import * 2 | from .drift_threshold import * 3 | -------------------------------------------------------------------------------- /mlbox/preprocessing/drift/drift_estimator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Authors: Axel ARONIO DE ROMBLAY 3 | # Alexis BONDU 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict 11 | 12 | class DriftEstimator(): 13 | 14 | """Estimates the drift between two datasets 15 | 16 | 17 | Parameters 18 | ---------- 19 | estimator : classifier, defaut = RandomForestClassifier(n_estimators = 50, n_jobs=-1, max_features=1., min_samples_leaf = 5, max_depth = 5) 20 | The estimator that estimates the drift between two datasets 21 | 22 | n_folds : int, defaut = 2 23 | Number of folds used to estimate the drift 24 
| 25 | stratify : bool, defaut = True 26 | Whether the cv is stratified (same number of train and test samples within each fold) 27 | 28 | random_state : int, defaut = 1 29 | Random state for cv 30 | """ 31 | 32 | def __init__(self, 33 | estimator=RandomForestClassifier(n_estimators=50, 34 | n_jobs=-1, 35 | max_features=1., 36 | min_samples_leaf=5, 37 | max_depth=5), 38 | n_folds=2, 39 | stratify=True, 40 | random_state=1): 41 | 42 | self.estimator = estimator 43 | self.n_folds = n_folds 44 | self.stratify = stratify 45 | self.random_state = random_state 46 | self.__cv = None 47 | self.__pred = None 48 | self.__target = None 49 | self.__fitOK = False 50 | 51 | def get_params(self): 52 | 53 | return {'estimator': self.estimator, 54 | 'n_folds': self.n_folds, 55 | 'stratify': self.stratify, 56 | 'random_state': self.random_state} 57 | 58 | def set_params(self, **params): 59 | 60 | if('estimator' in params.keys()): 61 | self.estimator = params['estimator'] 62 | if('n_folds' in params.keys()): 63 | self.n_folds = params['n_folds'] 64 | if('stratify' in params.keys()): 65 | self.stratify = params['stratify'] 66 | if('random_state' in params.keys()): 67 | self.random_state = params['random_state'] 68 | 69 | def fit(self, df_train, df_test): 70 | 71 | """ 72 | Computes the drift between the two datasets 73 | 74 | Parameters 75 | ---------- 76 | df_train : pandas dataframe of shape = (n_train, p) 77 | The train set 78 | 79 | df_test : pandas dataframe of shape = (n_test, p) 80 | The test set 81 | 82 | Returns 83 | ------- 84 | self : object 85 | Returns self. 86 | """ 87 | 88 | df_train["target"] = 0 89 | df_test["target"] = 1 90 | 91 | self.__target = pd.concat((df_train.target, df_test.target), 92 | ignore_index=True) 93 | 94 | if self.stratify: 95 | self.__cv = StratifiedKFold(n_splits=self.n_folds, 96 | shuffle=True, 97 | random_state=self.random_state) 98 | else: 99 | self.__cv = KFold(n_splits=self.n_folds, 100 | shuffle=True, 101 | random_state=self.random_state) 102 | 103 | X_tmp = pd.concat((df_train, df_test), 104 | ignore_index=True).drop(['target'], axis=1) 105 | 106 | self.__pred = cross_val_predict(estimator=self.estimator, 107 | X=X_tmp, 108 | y=self.__target, 109 | cv=self.__cv, 110 | method="predict_proba")[:,1] 111 | 112 | del df_train["target"] 113 | del df_test["target"] 114 | 115 | self.__fitOK = True 116 | 117 | return self 118 | 119 | def score(self): 120 | 121 | """Returns the global drift measure between two datasets. 122 | 123 | 0. = No drift. 1. 
= Maximal Drift 124 | 125 | Returns 126 | ------- 127 | float 128 | The drift measure 129 | """ 130 | 131 | S = [] 132 | 133 | if self.__fitOK: 134 | 135 | X_zeros = np.zeros(len(self.__target)) 136 | 137 | for train_index, test_index in self.__cv.split(X=X_zeros, 138 | y=self.__target): 139 | 140 | S.append(roc_auc_score(self.__target.iloc[test_index], 141 | self.__pred[test_index])) 142 | 143 | return (max(np.mean(S), 1-np.mean(S))-0.5) * 2 144 | 145 | else: 146 | raise ValueError('Call the fit function before !') 147 | 148 | def predict(self): 149 | 150 | """Returns the probabilities that the sample belongs to the test dataset 151 | 152 | Returns 153 | ------- 154 | Array of shape = (n_train+n_test,) 155 | The probabilities 156 | """ 157 | 158 | if self.__fitOK: 159 | 160 | return self.__pred 161 | 162 | else: 163 | raise ValueError('Call the fit function before !') 164 | -------------------------------------------------------------------------------- /mlbox/preprocessing/drift/drift_threshold.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Authors: Axel ARONIO DE ROMBLAY 3 | # Alexis BONDU 4 | # License: BSD 3 clause 5 | import sys 6 | 7 | from joblib import Parallel, delayed 8 | from sklearn.tree import DecisionTreeClassifier 9 | 10 | from .drift_estimator import DriftEstimator 11 | 12 | 13 | def sync_fit(df_train, df_test, estimator, n_folds=2, stratify=True, random_state=1): 14 | """Compute the univariate drifts between df_train and df_test datasets. 15 | 16 | Multi-threaded version. 17 | 18 | Parameters 19 | ---------- 20 | df_train : pandas dataframe of shape = (n_train, p) 21 | The train set 22 | 23 | df_test : pandas dataframe of shape = (n_test, p) 24 | The test set 25 | 26 | estimator : classifier, defaut = RandomForestClassifier(n_estimators = 50, 27 | n_jobs=-1, 28 | max_features=1., 29 | min_samples_leaf = 5, 30 | max_depth = 5) 31 | The estimator that estimates the drift between two datasets 32 | 33 | n_folds : int, default = 2 34 | Number of folds used to estimate the drift 35 | 36 | stratify : bool, default = True 37 | Whether the cv is stratified (same number of train and test samples 38 | within each fold) 39 | 40 | random_state : int, default = 1 41 | Random state for cv 42 | 43 | Returns 44 | ------- 45 | float 46 | drift measure 47 | 48 | """ 49 | # We will compute the indices of the CV in each thread 50 | de = DriftEstimator(estimator, n_folds, stratify, random_state) 51 | de.fit(df_train, df_test) 52 | 53 | return de.score() 54 | 55 | 56 | class DriftThreshold(): 57 | """Estimate the univariate drift between two datasets. 58 | 59 | Estimate the univariate drift between two datasets 60 | and select features with low drifts 61 | 62 | Parameters 63 | ---------- 64 | threshold : float, defaut = 0.6 65 | The drift threshold (univariate drift below are kept) 66 | Must be between 0. and 1. 67 | 68 | subsample : float, defaut = 1. 69 | Subsampling parameter for the datasets. 70 | Must be between 0. and 1. 71 | 72 | estimator : classifier, default = DecisionTreeClassifier(max_depth=6) 73 | The estimator that estimates the drift between two datasets. 74 | 75 | n_folds : int, default = 2 76 | Number of folds used to estimate the drift. 77 | 78 | stratify : bool, default = True 79 | Whether the cv is stratified (same number of train and test samples 80 | within each fold) 81 | 82 | random_state : int, default = 1 83 | Seed for for cv and subsampling. 
56 | class DriftThreshold(): 57 | """Estimate the univariate drift between two datasets. 58 | 59 | Estimate the univariate drift between two datasets 60 | and select features with low drifts 61 | 62 | Parameters 63 | ---------- 64 | threshold : float, default = 0.6 65 | The drift threshold (features whose univariate drift is below it are kept) 66 | Must be between 0. and 1. 67 | 68 | subsample : float, default = 1. 69 | Subsampling parameter for the datasets. 70 | Must be between 0. and 1. 71 | 72 | estimator : classifier, default = DecisionTreeClassifier(max_depth=6) 73 | The estimator that estimates the drift between two datasets. 74 | 75 | n_folds : int, default = 2 76 | Number of folds used to estimate the drift. 77 | 78 | stratify : bool, default = True 79 | Whether the cv is stratified (same number of train and test samples 80 | within each fold) 81 | 82 | random_state : int, default = 1 83 | Seed for cv and subsampling. 84 | 85 | n_jobs : int, default = -1 86 | Number of cores used for processing (-1 for all cores) 87 | 88 | """ 89 | 90 | def __init__(self, 91 | threshold=0.6, 92 | subsample=1., 93 | estimator=DecisionTreeClassifier(max_depth=6), 94 | n_folds=2, 95 | stratify=True, 96 | random_state=1, 97 | n_jobs=-1): 98 | """Init a DriftThreshold object.""" 99 | self.threshold = threshold 100 | self.subsample = subsample 101 | self.estimator = estimator 102 | self.n_folds = n_folds 103 | self.stratify = stratify 104 | self.random_state = random_state 105 | self.n_jobs = n_jobs 106 | self.__Ddrifts = dict() 107 | self.__fitOK = False 108 | 109 | def get_params(self): 110 | """Get parameters of a DriftThreshold object.""" 111 | return {'threshold': self.threshold, 112 | 'subsample': self.subsample, 113 | 'estimator': self.estimator, 114 | 'n_folds': self.n_folds, 115 | 'stratify': self.stratify, 116 | 'random_state': self.random_state, 117 | 'n_jobs': self.n_jobs} 118 | 119 | def set_params(self, **params): 120 | """Set parameters of a DriftThreshold object.""" 121 | if('threshold' in params.keys()): 122 | self.threshold = params['threshold'] 123 | if('subsample' in params.keys()): 124 | self.subsample = params['subsample'] 125 | if('estimator' in params.keys()): 126 | self.estimator = params['estimator'] 127 | if('n_folds' in params.keys()): 128 | self.n_folds = params['n_folds'] 129 | if('stratify' in params.keys()): 130 | self.stratify = params['stratify'] 131 | if('random_state' in params.keys()): 132 | self.random_state = params['random_state'] 133 | if('n_jobs' in params.keys()): 134 | self.n_jobs = params['n_jobs'] 135 | 136 | def fit(self, df_train, df_test): 137 | """Compute the univariate drifts between df_train and df_test datasets. 138 | 139 | Parameters 140 | ---------- 141 | df_train : pandas dataframe of shape = (n_train, p) 142 | The train set 143 | 144 | df_test : pandas dataframe of shape = (n_test, p) 145 | The test set 146 | 147 | Returns 148 | ------- 149 | None 150 | 151 | """ 152 | self.__Ddrifts = dict() 153 | 154 | if sys.platform == 'win32': 155 | Ldrifts = [sync_fit(df_train.sample(frac=self.subsample)[[col]], 156 | df_test.sample(frac=self.subsample)[[col]], 157 | self.estimator, 158 | self.n_folds, 159 | self.stratify, 160 | self.random_state) 161 | for col in df_train.columns] 162 | else: 163 | Ldrifts = Parallel(n_jobs=self.n_jobs)(delayed(sync_fit) 164 | (df_train.sample(frac=self.subsample)[[col]], 165 | df_test.sample(frac=self.subsample)[[col]], 166 | self.estimator, 167 | self.n_folds, 168 | self.stratify, 169 | self.random_state) 170 | for col in df_train.columns) 171 | 172 | for i, col in enumerate(df_train.columns): 173 | 174 | self.__Ddrifts[col] = Ldrifts[i] 175 | 176 | del Ldrifts 177 | 178 | self.__fitOK = True 179 | 180 | def transform(self, df): 181 | """Select the features with low drift. 182 | 183 | Parameters 184 | ---------- 185 | df : pandas dataframe 186 | A dataset with the same features 187 | 188 | Returns 189 | ------- 190 | pandas DataFrame 191 | The transformed dataframe 192 | 193 | """ 194 | if self.__fitOK: 195 | 196 | selected_col = [] 197 | 198 | for i, col in enumerate(df.columns): 199 | 200 | if (self.__Ddrifts[col] < self.threshold): 201 | selected_col.append(col) 202 | 203 | return df[selected_col] 204 | 205 | else: 206 | raise ValueError('Call the fit function before !') 207 | 208 | def get_support(self, complement=False): 209 | """Return the variables kept or dropped.
210 | 211 | Parameters 212 | ---------- 213 | complement : bool, default = False 214 | If True, returns the features to drop 215 | If False, returns the features to keep 216 | 217 | Returns 218 | ------- 219 | list 220 | The list of features to keep or to drop. 221 | 222 | """ 223 | if self.__fitOK: 224 | 225 | keepList = [] 226 | dropList = [] 227 | 228 | for col in self.__Ddrifts: 229 | 230 | if (self.__Ddrifts[col] < self.threshold): 231 | keepList.append(col) 232 | else: 233 | dropList.append(col) 234 | 235 | if complement: 236 | return dropList 237 | else: 238 | return keepList 239 | else: 240 | raise ValueError('Call the fit function before !') 241 | 242 | def drifts(self): 243 | """Return the univariate drifts for all variables. 244 | 245 | Returns 246 | ------- 247 | dict 248 | The dictionary of drift measures for each feature 249 | 250 | """ 251 | if self.__fitOK: 252 | 253 | return self.__Ddrifts 254 | 255 | else: 256 | raise ValueError('Call the fit function before !') 257 |
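# Usage sketch (comment only, not executed): keeping the stable columns of a
# train/test pair with the defaults above. It assumes df_train and df_test are
# numerical pandas dataframes sharing the same columns:
#
#     dt = DriftThreshold(threshold=0.6)
#     dt.fit(df_train, df_test)
#     dt.drifts()                        # dict of per-column drift measures
#     dt.get_support(complement=True)    # columns that would be dropped
#     df_train = dt.transform(df_train)
#     df_test = dt.transform(df_test)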
-------------------------------------------------------------------------------- /mlbox/preprocessing/drift_thresholder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Author: Axel ARONIO DE ROMBLAY 3 | # License: BSD 3 clause 4 | 5 | import os 6 | import time 7 | from sklearn.pipeline import Pipeline 8 | from .drift import DriftThreshold 9 | from ..encoding.na_encoder import NA_encoder 10 | from ..encoding.categorical_encoder import Categorical_encoder 11 | 12 | 13 | class Drift_thresholder(): 14 | 15 | """Automatically drops ids and drifting variables between train and test datasets. 16 | 17 | The drops are applied to both train and test datasets. The list of drift coefficients is available and 18 | saved as "drifts.txt". To get familiar with drift: 19 | https://github.com/AxeldeRomblay/MLBox/blob/master/docs/webinars/features.pdf 20 | 21 | Parameters 22 | ---------- 23 | threshold : float, default = 0.6 24 | Drift threshold under which features are kept. Must be between 0. and 1. 25 | The lower it is, the more stable the kept variables are: a feature with 26 | a drift measure of 0. is very stable and one with a drift measure of 1. is highly unstable. 27 | 28 | inplace : bool, default = False 29 | If True, train and test datasets are transformed in place. Returns None. 30 | Otherwise, train and test datasets are not transformed. Returns a new dictionary with 31 | cleaned datasets. 32 | 33 | verbose : bool, default = True 34 | Verbose mode 35 | 36 | to_path : str, default = "save" 37 | Name of the folder where the list of drift coefficients is saved. 38 | """ 39 | 40 | def __init__(self, 41 | threshold=0.6, 42 | inplace=False, 43 | verbose=True, 44 | to_path="save"): 45 | 46 | self.threshold = threshold 47 | self.inplace = inplace 48 | self.verbose = verbose 49 | self.to_path = to_path 50 | self.__Ddrifts = {} 51 | self.__fitOK = False 52 | 53 | 54 | def fit_transform(self, df): 55 | 56 | """Fits and transforms train and test datasets. 57 | 58 | Automatically drops ids and drifting variables between train and test datasets. 59 | The list of drift coefficients is available and saved as "drifts.txt" 60 | 61 | Parameters 62 | ---------- 63 | df : dict 64 | Dictionary containing : 65 | 66 | - 'train' : pandas dataframe for train dataset 67 | - 'test' : pandas dataframe for test dataset 68 | - 'target' : pandas Series for the target on train set 69 | 70 | Returns 71 | ------- 72 | dict 73 | Dictionary containing : 74 | 75 | - 'train' : transformed pandas dataframe for train dataset 76 | - 'test' : transformed pandas dataframe for test dataset 77 | - 'target' : pandas Series for the target on train set 78 | """ 79 | 80 | ###################################################### 81 | # Deleting IDs 82 | ###################################################### 83 | 84 | # Exception 85 | 86 | if (df["test"].shape[0] == 0): 87 | if (self.verbose): 88 | print("") 89 | print("You have no test dataset...") 90 | 91 | return df 92 | 93 | else: 94 | 95 | start_time = time.time() 96 | 97 | ds = DriftThreshold(self.threshold) 98 | na = NA_encoder(numerical_strategy=0) 99 | ca = Categorical_encoder() 100 | 101 | pp = Pipeline([("na", na), ("ca", ca)]) 102 | pp.fit(df['train'], None) 103 | 104 | # Deleting IDs with drift threshold method 105 | 106 | if (self.verbose): 107 | print("") 108 | print("computing drifts ...") 109 | 110 | ds.fit(pp.transform(df['train']), pp.transform(df['test'])) 111 | 112 | if (self.verbose): 113 | print("CPU time: %s seconds" % (time.time() - start_time)) 114 | print("") 115 | 116 | self.__fitOK = True 117 | self.__Ddrifts = ds.drifts() 118 | drifts_top = sorted(ds.drifts().items(), 119 | key=lambda x: x[1], 120 | reverse=True)[:10] 121 | 122 | if (self.verbose): 123 | print("> Top 10 drifts") 124 | print("") 125 | for d in range(len(drifts_top)): 126 | print(drifts_top[d]) 127 | 128 | if (self.verbose): 129 | print("") 130 | print("> Deleted " 131 | "variables : " + str(ds.get_support(complement=True))) 132 | 133 | ###################################################### 134 | # Dumping Encoders into directory 135 | ###################################################### 136 | 137 | if (self.to_path is not None): 138 | 139 | try: 140 | os.mkdir(self.to_path) 141 | except OSError: 142 | pass 143 | 144 | file = open(self.to_path + '/drifts.txt', "w") 145 | file.write("\n") 146 | file.write( 147 | "*******************************************" 148 | " Drifts coefficients " 149 | "*******************************************\n") 150 | file.write("\n") 151 | 152 | for var, d in sorted(ds.drifts().items(), 153 | key=lambda x: x[1], 154 | reverse=True): 155 | file.write(str(var) + " = " + str(d) + '\n') 156 | 157 | file.close() 158 | 159 | if (self.verbose): 160 | print("> Drift coefficients dumped into directory : " + self.to_path) 161 | 162 | # Returning datasets with no IDs 163 | 164 | if (self.inplace): 165 | 166 | df['train'] = ds.transform(df['train']) 167 | df['test'] = ds.transform(df['test']) 168 | 169 | else: 170 | 171 | return {'train': ds.transform(df['train']), 172 | 'test': ds.transform(df['test']), 173 | 'target': df['target']} 174 |
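    # Usage sketch (comment only, not executed): a typical call right after
    # reading the data, as in the tests; the csv paths are placeholders:
    #
    #     from mlbox.preprocessing.reader import Reader
    #     data = Reader(sep=",").train_test_split(Lpath=["train.csv", "test.csv"],
    #                                             target_name="target")
    #     data = Drift_thresholder(threshold=0.6).fit_transform(data)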
175 | def drifts(self): 176 | 177 | """Returns the univariate drifts for all variables. 178 | 179 | Returns 180 | ------- 181 | dict 182 | Dictionary containing the drifts for each feature 183 | """ 184 | 185 | if self.__fitOK: 186 | 187 | return self.__Ddrifts 188 | else: 189 | raise ValueError('Call the fit_transform function before !') 190 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.2 2 | scipy==1.4.1 3 | matplotlib==3.0.3 4 | hyperopt==0.2.3 5 | pandas==0.25.3 6 | joblib==0.14.1 7 | scikit-learn==0.22.1 8 | tensorflow==2.0.0 9 | lightgbm==2.3.1 10 | tables==3.5.2 11 | xlrd==1.2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup file for installing mlbox package.""" 2 | # !/usr/bin/env python 3 | # -*- coding: utf-8 -*- 4 | 5 | from setuptools import setup 6 | 7 | 8 | with open('requirements.txt', 'rt') as fh: 9 | requirements = fh.read().splitlines() 10 | 11 | with open('README.rst') as readme_file: 12 | readme = readme_file.read() 13 | 14 | with open('VERSION.txt') as version_file: 15 | version = version_file.read() 16 | 17 | 18 | setup( 19 | name='mlbox', 20 | version=version, 21 | description="A powerful Automated Machine Learning python library.", 22 | long_description=readme, 23 | author="Axel ARONIO DE ROMBLAY", 24 | author_email='axelderomblay@gmail.com', 25 | url='https://github.com/AxeldeRomblay/mlbox', 26 | packages=['mlbox', 'mlbox.encoding', 'mlbox.model', 27 | 'mlbox.optimisation', 'mlbox.prediction', 28 | 'mlbox.preprocessing', 29 | 'mlbox.model.classification', 30 | 'mlbox.model.regression', 31 | 'mlbox.preprocessing.drift'], 32 | package_dir={'mlbox': 'mlbox', 33 | 'mlbox.encoding': 'mlbox/encoding', 34 | 'mlbox.model': 'mlbox/model', 35 | 'mlbox.optimisation': 'mlbox/optimisation', 36 | 'mlbox.prediction': 'mlbox/prediction', 37 | 'mlbox.preprocessing': 'mlbox/preprocessing', 38 | 'mlbox.model.classification': 'mlbox/model/classification', 39 | 'mlbox.model.regression': 'mlbox/model/regression', 40 | 'mlbox.preprocessing.drift': 'mlbox/preprocessing/drift' 41 | }, 42 | include_package_data=True, 43 | install_requires=requirements, 44 | zip_safe=False, 45 | license='BSD-3', 46 | keywords='mlbox auto-ml stacking pipeline optimisation', 47 | classifiers=[ 48 | 49 | 'Development Status :: 5 - Production/Stable', 50 | 51 | 'Intended Audience :: Developers', 52 | 'Intended Audience :: Science/Research', 53 | 54 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 55 | 'Topic :: Scientific/Engineering :: Information Analysis', 56 | 'Topic :: Software Development :: Libraries :: Python Modules', 57 | 58 | 'License :: OSI Approved :: BSD License', 59 | 60 | 'Natural Language :: English', 61 | 62 | 'Operating System :: MacOS', 63 | 'Operating System :: Microsoft :: Windows', 64 | 'Operating System :: POSIX :: Linux', 65 | 66 | 'Programming Language :: Python :: 3.5', 67 | 'Programming Language :: Python :: 3.6', 68 | 'Programming Language :: Python :: 3.7' 69 | ], 70 | test_suite='tests', 71 | tests_require=requirements 72 | ) 73 | -------------------------------------------------------------------------------- /tests/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/.DS_Store
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/data_for_tests/clean_target.csv: -------------------------------------------------------------------------------- 1 | Survived 2 | 0 3 | 1 4 | 1 5 | 1 6 | 0 7 | 0 8 | 0 9 | 0 10 | 1 11 | 1 12 | 1 13 | 1 14 | 0 15 | 0 16 | 0 17 | 1 18 | 0 19 | 1 20 | 0 21 | 1 22 | 0 23 | 1 24 | 1 25 | 1 26 | 0 27 | 1 28 | 0 29 | 0 30 | 1 31 | 0 32 | 0 33 | 1 34 | 1 35 | 0 36 | 0 37 | 0 38 | 1 39 | 0 40 | 0 41 | 1 42 | 0 43 | 0 44 | 0 45 | 1 46 | 1 47 | 0 48 | 0 49 | 1 50 | 0 51 | 0 52 | 0 53 | 0 54 | 1 55 | 1 56 | 0 57 | 1 58 | 1 59 | 0 60 | 1 61 | 0 62 | 0 63 | 1 64 | 0 65 | 0 66 | 0 67 | 1 68 | 1 69 | 0 70 | 1 71 | 0 72 | 0 73 | 0 74 | 0 75 | 0 76 | 1 77 | 0 78 | 0 79 | 0 80 | 1 81 | 1 82 | 0 83 | 1 84 | 1 85 | 0 86 | 1 87 | 1 88 | 0 89 | 0 90 | 1 91 | 0 92 | 0 93 | 0 94 | 0 95 | 0 96 | 0 97 | 0 98 | 0 99 | 1 100 | 1 101 | 0 102 | 0 103 | 0 104 | 0 105 | 0 106 | 0 107 | 0 108 | 1 109 | 1 110 | 0 111 | 1 112 | 0 113 | 0 114 | 0 115 | 0 116 | 0 117 | 0 118 | 0 119 | 0 120 | 0 121 | 0 122 | 0 123 | 0 124 | 0 125 | 1 126 | 0 127 | 1 128 | 0 129 | 1 130 | 1 131 | 0 132 | 0 133 | 0 134 | 0 135 | 1 136 | 0 137 | 0 138 | 1 139 | 0 140 | 0 141 | 0 142 | 0 143 | 1 144 | 1 145 | 0 146 | 0 147 | 0 148 | 1 149 | 0 150 | 0 151 | 0 152 | 0 153 | 1 154 | 0 155 | 0 156 | 0 157 | 0 158 | 1 159 | 0 160 | 0 161 | 0 162 | 0 163 | 1 164 | 0 165 | 0 166 | 0 167 | 1 168 | 1 169 | 0 170 | 0 171 | 0 172 | 0 173 | 0 174 | 1 175 | 0 176 | 0 177 | 0 178 | 0 179 | 0 180 | 0 181 | 0 182 | 0 183 | 0 184 | 0 185 | 1 186 | 1 187 | 0 188 | 1 189 | 1 190 | 0 191 | 0 192 | 1 193 | 0 194 | 1 195 | 1 196 | 1 197 | 1 198 | 0 199 | 0 200 | 1 201 | 0 202 | 0 203 | 0 204 | 0 205 | 0 206 | 1 207 | 0 208 | 0 209 | 1 210 | 1 211 | 1 212 | 0 213 | 1 214 | 0 215 | 0 216 | 0 217 | 1 218 | 1 219 | 0 220 | 1 221 | 0 222 | 1 223 | 0 224 | 0 225 | 0 226 | 1 227 | 0 228 | 1 229 | 0 230 | 0 231 | 0 232 | 1 233 | 0 234 | 0 235 | 1 236 | 0 237 | 0 238 | 0 239 | 1 240 | 0 241 | 0 242 | 0 243 | 1 244 | 0 245 | 0 246 | 0 247 | 0 248 | 0 249 | 1 250 | 1 251 | 0 252 | 0 253 | 0 254 | 0 255 | 0 256 | 0 257 | 1 258 | 1 259 | 1 260 | 1 261 | 1 262 | 0 263 | 1 264 | 0 265 | 0 266 | 0 267 | 0 268 | 0 269 | 1 270 | 1 271 | 1 272 | 0 273 | 1 274 | 1 275 | 0 276 | 1 277 | 1 278 | 0 279 | 0 280 | 0 281 | 1 282 | 0 283 | 0 284 | 0 285 | 1 286 | 0 287 | 0 288 | 1 289 | 0 290 | 1 291 | 1 292 | 1 293 | 1 294 | 0 295 | 0 296 | 0 297 | 0 298 | 0 299 | 0 300 | 1 301 | 1 302 | 1 303 | 1 304 | 0 305 | 1 306 | 0 307 | 1 308 | 1 309 | 1 310 | 0 311 | 1 312 | 1 313 | 1 314 | 0 315 | 0 316 | 0 317 | 1 318 | 1 319 | 0 320 | 1 321 | 1 322 | 0 323 | 0 324 | 1 325 | 1 326 | 0 327 | 1 328 | 0 329 | 1 330 | 1 331 | 1 332 | 1 333 | 0 334 | 0 335 | 0 336 | 1 337 | 0 338 | 0 339 | 1 340 | 1 341 | 0 342 | 1 343 | 1 344 | 0 345 | 0 346 | 0 347 | 1 348 | 1 349 | 1 350 | 1 351 | 0 352 | 0 353 | 0 354 | 0 355 | 0 356 | 0 357 | 0 358 | 1 359 | 0 360 | 1 361 | 1 362 | 0 363 | 0 364 | 0 365 | 0 366 | 0 367 | 0 368 | 1 369 | 1 370 | 1 371 | 1 372 | 1 373 | 0 374 | 0 375 | 0 376 | 0 377 | 1 378 | 1 379 | 0 380 | 0 381 | 0 382 | 1 383 | 1 384 | 0 385 | 1 386 | 0 387 | 0 388 | 0 389 | 1 390 | 0 391 | 1 392 | 1 393 | 1 394 | 0 395 | 1 396 | 1 397 | 0 398 | 0 399 | 0 400 | 0 401 | 1 402 | 1 403 | 0 404 | 0 405 | 0 
406 | 0 407 | 0 408 | 0 409 | 1 410 | 0 411 | 0 412 | 0 413 | 0 414 | 1 415 | 0 416 | 1 417 | 0 418 | 1 419 | 1 420 | 0 421 | 0 422 | 0 423 | 0 424 | 0 425 | 0 426 | 0 427 | 0 428 | 1 429 | 1 430 | 0 431 | 1 432 | 1 433 | 1 434 | 1 435 | 0 436 | 0 437 | 1 438 | 0 439 | 1 440 | 0 441 | 0 442 | 1 443 | 0 444 | 0 445 | 1 446 | 1 447 | 1 448 | 1 449 | 1 450 | 1 451 | 1 452 | 0 453 | 0 454 | 0 455 | 1 456 | 0 457 | 1 458 | 0 459 | 1 460 | 1 461 | 0 462 | 1 463 | 0 464 | 0 465 | 0 466 | 0 467 | 0 468 | 0 469 | 0 470 | 0 471 | 1 472 | 0 473 | 0 474 | 1 475 | 1 476 | 0 477 | 0 478 | 0 479 | 0 480 | 0 481 | 1 482 | 0 483 | 0 484 | 0 485 | 1 486 | 1 487 | 0 488 | 1 489 | 0 490 | 0 491 | 1 492 | 0 493 | 0 494 | 0 495 | 0 496 | 0 497 | 0 498 | 1 499 | 0 500 | 0 501 | 0 502 | 0 503 | 0 504 | 0 505 | 0 506 | 1 507 | 0 508 | 1 509 | 1 510 | 0 511 | 1 512 | 1 513 | 0 514 | 1 515 | 1 516 | 0 517 | 0 518 | 1 519 | 0 520 | 1 521 | 0 522 | 1 523 | 0 524 | 0 525 | 1 526 | 0 527 | 0 528 | 1 529 | 0 530 | 0 531 | 0 532 | 1 533 | 0 534 | 0 535 | 1 536 | 0 537 | 1 538 | 0 539 | 1 540 | 0 541 | 1 542 | 1 543 | 0 544 | 0 545 | 1 546 | 0 547 | 0 548 | 1 549 | 1 550 | 0 551 | 1 552 | 1 553 | 0 554 | 0 555 | 1 556 | 1 557 | 0 558 | 1 559 | 0 560 | 1 561 | 1 562 | 0 563 | 0 564 | 0 565 | 0 566 | 0 567 | 0 568 | 0 569 | 0 570 | 0 571 | 1 572 | 1 573 | 1 574 | 1 575 | 1 576 | 0 577 | 0 578 | 1 579 | 1 580 | 0 581 | 1 582 | 1 583 | 1 584 | 0 585 | 0 586 | 0 587 | 1 588 | 0 589 | 1 590 | 0 591 | 0 592 | 0 593 | 1 594 | 0 595 | 0 596 | 0 597 | 0 598 | 1 599 | 0 600 | 0 601 | 1 602 | 1 603 | 0 604 | 0 605 | 0 606 | 1 607 | 0 608 | 0 609 | 1 610 | 1 611 | 1 612 | 0 613 | 0 614 | 1 615 | 0 616 | 0 617 | 1 618 | 0 619 | 0 620 | 1 621 | 0 622 | 0 623 | 1 624 | 1 625 | 0 626 | 0 627 | 0 628 | 0 629 | 1 630 | 0 631 | 0 632 | 1 633 | 0 634 | 1 635 | 0 636 | 0 637 | 1 638 | 0 639 | 0 640 | 0 641 | 0 642 | 0 643 | 1 644 | 0 645 | 1 646 | 1 647 | 1 648 | 0 649 | 1 650 | 0 651 | 1 652 | 0 653 | 1 654 | 0 655 | 1 656 | 0 657 | 0 658 | 0 659 | 0 660 | 0 661 | 0 662 | 1 663 | 0 664 | 0 665 | 0 666 | 1 667 | 0 668 | 0 669 | 0 670 | 0 671 | 1 672 | 1 673 | 0 674 | 0 675 | 1 676 | 0 677 | 0 678 | 0 679 | 1 680 | 0 681 | 1 682 | 0 683 | 1 684 | 0 685 | 0 686 | 0 687 | 0 688 | 0 689 | 0 690 | 0 691 | 1 692 | 1 693 | 1 694 | 1 695 | 0 696 | 0 697 | 0 698 | 0 699 | 1 700 | 0 701 | 0 702 | 1 703 | 1 704 | 0 705 | 0 706 | 0 707 | 0 708 | 1 709 | 1 710 | 1 711 | 1 712 | 1 713 | 0 714 | 1 715 | 0 716 | 0 717 | 0 718 | 1 719 | 1 720 | 0 721 | 0 722 | 1 723 | 0 724 | 0 725 | 0 726 | 1 727 | 0 728 | 1 729 | 1 730 | 0 731 | 0 732 | 1 733 | 0 734 | 0 735 | 0 736 | 0 737 | 0 738 | 0 739 | 1 740 | 0 741 | 0 742 | 1 743 | 0 744 | 1 745 | 0 746 | 1 747 | 0 748 | 0 749 | 1 750 | 0 751 | 0 752 | 1 753 | 1 754 | 0 755 | 0 756 | 1 757 | 1 758 | 0 759 | 0 760 | 0 761 | 1 762 | 0 763 | 0 764 | 1 765 | 1 766 | 0 767 | 1 768 | 0 769 | 0 770 | 0 771 | 0 772 | 0 773 | 0 774 | 0 775 | 0 776 | 1 777 | 0 778 | 0 779 | 1 780 | 0 781 | 1 782 | 1 783 | 1 784 | 0 785 | 0 786 | 0 787 | 0 788 | 1 789 | 0 790 | 1 791 | 0 792 | 0 793 | 0 794 | 0 795 | 0 796 | 0 797 | 0 798 | 1 799 | 1 800 | 0 801 | 0 802 | 0 803 | 1 804 | 1 805 | 1 806 | 1 807 | 0 808 | 0 809 | 0 810 | 0 811 | 1 812 | 0 813 | 0 814 | 0 815 | 0 816 | 0 817 | 0 818 | 0 819 | 0 820 | 0 821 | 0 822 | 1 823 | 1 824 | 0 825 | 1 826 | 0 827 | 0 828 | 0 829 | 1 830 | 1 831 | 1 832 | 1 833 | 1 834 | 0 835 | 0 836 | 0 837 | 1 838 | 0 839 | 0 840 | 1 841 | 1 842 | 0 843 | 0 844 | 1 845 | 0 846 | 0 847 | 0 848 | 0 849 | 0 
850 | 0 851 | 1 852 | 0 853 | 0 854 | 0 855 | 1 856 | 0 857 | 1 858 | 1 859 | 1 860 | 1 861 | 0 862 | 0 863 | 0 864 | 1 865 | 0 866 | 0 867 | 1 868 | 1 869 | 0 870 | 0 871 | 1 872 | 0 873 | 1 874 | 0 875 | 0 876 | 1 877 | 1 878 | 0 879 | 0 880 | 0 881 | 1 882 | 1 883 | 0 884 | 0 885 | 0 886 | 0 887 | 0 888 | 0 889 | 1 890 | 0 891 | 1 892 | 0 893 | -------------------------------------------------------------------------------- /tests/data_for_tests/clean_test.csv: -------------------------------------------------------------------------------- 1 | ,Age,Fare,Parch,Pclass,SibSp 2 | 0,34.5,7.8292,0,3,0 3 | 1,47,7,0,3,1 4 | 2,62,9.6875,0,2,0 5 | 3,27,8.6625,0,3,0 6 | 4,22,12.2875,1,3,1 7 | 5,14,9.225,0,3,0 8 | 6,30,7.6292,0,3,0 9 | 7,26,29,1,2,1 10 | 8,18,7.2292,0,3,0 11 | 9,21,24.15,0,3,2 12 | 10,24,7.8958,0,3,0 13 | 11,46,26,0,1,0 14 | 12,23,82.2667,0,1,1 15 | 13,63,26,0,2,1 16 | 14,47,61.175,0,1,1 17 | 15,24,27.7208,0,2,1 18 | 16,35,12.35,0,2,0 19 | 17,21,7.225,0,3,0 20 | 18,27,7.925,0,3,1 21 | 19,45,7.225,0,3,0 22 | 20,55,59.4,0,1,1 23 | 21,9,3.1708,1,3,0 24 | 22,24,31.6833,0,1,0 25 | 23,21,61.3792,1,1,0 26 | 24,48,262.375,3,1,1 27 | 25,50,14.5,0,3,1 28 | 26,22,61.9792,1,1,0 29 | 27,22.5,7.225,0,3,0 30 | 28,41,30.5,0,1,0 31 | 29,24,21.6792,0,3,2 32 | 30,50,26,0,2,1 33 | 31,24,31.5,0,2,2 34 | 32,33,20.575,2,3,1 35 | 33,24,23.45,2,3,1 36 | 34,30,57.75,0,1,1 37 | 35,18.5,7.2292,0,3,0 38 | 36,24,8.05,0,3,0 39 | 37,21,8.6625,0,3,0 40 | 38,25,9.5,0,3,0 41 | 39,24,56.4958,0,3,0 42 | 40,39,13.4167,1,3,0 43 | 41,24,26.55,0,1,0 44 | 42,41,7.85,0,3,0 45 | 43,30,13,0,2,0 46 | 44,45,52.5542,0,1,1 47 | 45,25,7.925,0,3,0 48 | 46,45,29.7,0,1,0 49 | 47,24,7.75,0,3,0 50 | 48,60,76.2917,0,1,0 51 | 49,36,15.9,2,3,0 52 | 50,24,60,0,1,1 53 | 51,27,15.0333,0,2,0 54 | 52,20,23,1,2,2 55 | 53,28,263,2,1,3 56 | 54,24,15.5792,0,2,0 57 | 55,10,29.125,1,3,4 58 | 56,35,7.8958,0,3,0 59 | 57,25,7.65,0,3,0 60 | 58,24,16.1,0,3,1 61 | 59,36,262.375,0,1,0 62 | 60,17,7.8958,0,3,0 63 | 61,32,13.5,0,2,0 64 | 62,18,7.75,0,3,0 65 | 63,22,7.725,0,3,0 66 | 64,13,262.375,2,1,2 67 | 65,24,21,0,2,0 68 | 66,18,7.8792,0,3,0 69 | 67,47,42.4,0,1,0 70 | 68,31,28.5375,0,1,0 71 | 69,60,263,4,1,1 72 | 70,24,7.75,0,3,0 73 | 71,21,7.8958,0,3,0 74 | 72,29,7.925,0,3,0 75 | 73,28.5,27.7208,0,1,0 76 | 74,35,211.5,0,1,0 77 | 75,32.5,211.5,0,1,0 78 | 76,24,8.05,0,3,0 79 | 77,55,25.7,0,1,2 80 | 78,30,13,0,2,0 81 | 79,24,7.75,0,3,0 82 | 80,6,15.2458,1,3,1 83 | 81,67,221.7792,0,1,1 84 | 82,49,26,0,1,0 85 | 83,24,7.8958,0,3,0 86 | 84,24,10.7083,0,2,0 87 | 85,24,14.4542,0,3,1 88 | 86,27,7.8792,0,3,0 89 | 87,18,8.05,0,3,0 90 | 88,24,7.75,0,3,0 91 | 89,2,23,1,2,1 92 | 90,22,13.9,0,3,1 93 | 91,24,7.775,0,3,0 94 | 92,27,52,2,1,1 95 | 93,24,8.05,0,3,0 96 | 94,25,26,0,1,0 97 | 95,25,7.7958,0,3,0 98 | 96,76,78.85,0,1,1 99 | 97,29,7.925,0,3,0 100 | 98,20,7.8542,0,3,0 101 | 99,33,8.05,0,3,0 102 | 100,43,55.4417,0,1,1 103 | 101,27,26,0,2,1 104 | 102,24,7.75,0,3,0 105 | 103,26,7.775,0,3,0 106 | 104,16,8.5167,1,3,1 107 | 105,28,22.525,0,3,0 108 | 106,21,7.8208,0,3,0 109 | 107,24,7.75,0,3,0 110 | 108,24,8.7125,0,3,0 111 | 109,18.5,13,0,2,0 112 | 110,41,15.0458,0,2,0 113 | 111,24,7.7792,0,3,0 114 | 112,36,31.6792,0,1,0 115 | 113,18.5,7.2833,0,3,0 116 | 114,63,221.7792,0,1,1 117 | 115,18,14.4542,0,3,1 118 | 116,24,6.4375,0,3,0 119 | 117,1,16.7,1,3,1 120 | 118,36,75.2417,0,1,0 121 | 119,29,26,0,2,1 122 | 120,12,15.75,0,2,0 123 | 121,24,7.75,0,3,1 124 | 122,35,57.75,0,1,1 125 | 123,28,7.25,0,3,0 126 | 124,24,7.75,0,3,0 127 | 125,17,16.1,1,3,0 128 | 126,22,7.7958,0,3,0 129 
| 127,24,23.25,0,3,2 130 | 128,42,13,0,2,0 131 | 129,24,8.05,0,3,0 132 | 130,32,8.05,0,3,0 133 | 131,53,28.5,0,1,0 134 | 132,24,25.4667,4,3,0 135 | 133,24,6.4375,0,3,1 136 | 134,43,7.8958,0,3,0 137 | 135,24,7.8542,0,3,0 138 | 136,26.5,7.225,0,3,0 139 | 137,26,13,0,2,0 140 | 138,23,8.05,0,3,0 141 | 139,40,46.9,6,3,1 142 | 140,10,46.9,2,3,5 143 | 141,33,151.55,0,1,0 144 | 142,61,262.375,3,1,1 145 | 143,28,26,0,2,0 146 | 144,42,26.55,0,1,0 147 | 145,31,18,0,3,3 148 | 146,24,51.8625,0,1,0 149 | 147,22,8.05,0,3,0 150 | 148,24,26.55,0,1,0 151 | 149,30,26,1,2,1 152 | 150,23,83.1583,1,1,0 153 | 151,24,7.8958,0,3,0 154 | 152,60.5,24,0,3,0 155 | 153,36,12.1833,2,3,0 156 | 154,13,31.3875,2,3,4 157 | 155,24,7.55,0,3,0 158 | 156,29,221.7792,0,1,0 159 | 157,23,7.8542,0,3,0 160 | 158,42,26.55,0,1,0 161 | 159,26,13.775,2,3,0 162 | 160,24,7.7333,0,3,0 163 | 161,7,15.2458,1,3,1 164 | 162,26,13.5,0,2,0 165 | 163,24,7,0,3,0 166 | 164,41,13,0,2,0 167 | 165,26,22.025,1,3,1 168 | 166,48,50.4958,0,1,0 169 | 167,18,34.375,2,3,2 170 | 168,24,27.7208,0,1,0 171 | 169,22,8.9625,0,3,0 172 | 170,24,7.55,0,3,0 173 | 171,27,7.225,0,3,0 174 | 172,23,13.9,0,3,1 175 | 173,24,7.2292,0,3,0 176 | 174,40,31.3875,5,3,1 177 | 175,15,39,2,2,0 178 | 176,20,36.75,0,2,0 179 | 177,54,55.4417,0,1,1 180 | 178,36,39,3,2,0 181 | 179,64,83.1583,2,1,0 182 | 180,30,13,0,2,0 183 | 181,37,83.1583,1,1,1 184 | 182,18,53.1,0,1,1 185 | 183,24,7.75,0,3,0 186 | 184,27,247.5208,1,1,1 187 | 185,40,16,0,2,0 188 | 186,21,21,1,2,0 189 | 187,17,8.05,0,3,2 190 | 188,24,69.55,2,3,8 191 | 189,40,13,0,2,0 192 | 190,34,26,0,2,1 193 | 191,24,26,0,1,0 194 | 192,11.5,14.5,1,3,1 195 | 193,61,12.35,0,2,0 196 | 194,8,32.5,2,2,0 197 | 195,33,7.8542,0,3,0 198 | 196,6,134.5,2,1,0 199 | 197,18,7.775,0,3,0 200 | 198,23,10.5,0,2,0 201 | 199,24,8.1125,0,3,0 202 | 200,24,15.5,0,3,0 203 | 201,0.33,14.4,2,3,0 204 | 202,47,227.525,0,1,1 205 | 203,8,26,1,2,1 206 | 204,25,10.5,0,2,0 207 | 205,24,25.7417,0,1,0 208 | 206,35,7.75,0,3,0 209 | 207,24,10.5,0,2,0 210 | 208,33,27.7208,0,1,0 211 | 209,25,7.8958,0,3,0 212 | 210,32,22.525,0,3,0 213 | 211,24,7.05,0,3,0 214 | 212,17,73.5,0,2,0 215 | 213,60,26,0,2,1 216 | 214,38,7.775,2,3,4 217 | 215,42,42.5,0,1,0 218 | 216,24,7.8792,0,3,0 219 | 217,57,164.8667,1,1,1 220 | 218,50,211.5,1,1,1 221 | 219,24,8.05,0,3,0 222 | 220,30,13.8583,0,2,1 223 | 221,21,8.05,0,3,0 224 | 222,22,10.5,0,2,0 225 | 223,21,7.7958,0,3,0 226 | 224,53,27.4458,0,1,0 227 | 225,24,15.2458,2,3,0 228 | 226,23,7.7958,0,3,0 229 | 227,24,7.75,0,3,0 230 | 228,40.5,15.1,0,3,0 231 | 229,36,13,0,2,0 232 | 230,14,65,0,2,0 233 | 231,21,26.55,0,1,0 234 | 232,21,6.4958,0,3,1 235 | 233,24,7.8792,0,3,0 236 | 234,39,71.2833,0,1,1 237 | 235,20,7.8542,0,3,0 238 | 236,64,75.25,0,1,1 239 | 237,20,7.225,0,3,0 240 | 238,18,13,1,2,1 241 | 239,48,106.425,0,1,1 242 | 240,55,27.7208,0,1,0 243 | 241,45,30,2,2,0 244 | 242,45,134.5,1,1,1 245 | 243,24,7.8875,0,3,0 246 | 244,24,23.45,2,3,1 247 | 245,41,51.8625,0,1,1 248 | 246,22,21,0,2,0 249 | 247,42,32.5,1,2,1 250 | 248,29,26,0,2,1 251 | 249,24,14.4542,0,3,1 252 | 250,0.92,27.75,2,2,1 253 | 251,20,7.925,0,3,0 254 | 252,27,136.7792,0,1,1 255 | 253,24,9.325,0,3,0 256 | 254,32.5,9.5,0,3,0 257 | 255,24,7.55,0,3,0 258 | 256,24,7.75,0,3,0 259 | 257,28,8.05,0,3,0 260 | 258,19,13,0,2,0 261 | 259,21,7.775,0,3,0 262 | 260,36.5,17.4,0,3,1 263 | 261,21,7.8542,0,3,0 264 | 262,29,23,2,2,0 265 | 263,1,12.1833,1,3,1 266 | 264,30,12.7375,0,2,0 267 | 265,24,7.8958,0,3,0 268 | 266,24,0,0,1,0 269 | 267,24,7.55,0,3,0 270 | 268,24,8.05,0,3,0 271 | 269,17,8.6625,0,3,0 272 
| 270,46,75.2417,0,1,0 273 | 271,24,7.75,0,3,0 274 | 272,26,136.7792,0,1,1 275 | 273,24,15.5,0,3,1 276 | 274,24,7.225,0,3,0 277 | 275,20,26,0,2,1 278 | 276,28,10.5,0,2,0 279 | 277,40,26,0,2,1 280 | 278,30,21,0,2,1 281 | 279,22,10.5,0,2,0 282 | 280,23,8.6625,0,3,0 283 | 281,0.75,13.775,1,3,1 284 | 282,24,7.75,0,3,0 285 | 283,9,15.2458,1,3,1 286 | 284,2,20.2125,1,3,1 287 | 285,36,7.25,0,3,0 288 | 286,24,7.25,0,3,0 289 | 287,24,82.2667,0,1,1 290 | 288,24,7.2292,0,3,0 291 | 289,24,8.05,0,3,0 292 | 290,24,39.6,0,1,0 293 | 291,30,6.95,0,3,0 294 | 292,24,7.2292,0,3,0 295 | 293,53,81.8583,1,1,1 296 | 294,36,9.5,0,3,0 297 | 295,26,7.8958,0,3,0 298 | 296,1,41.5792,2,2,1 299 | 297,24,21.6792,0,3,2 300 | 298,30,45.5,0,1,0 301 | 299,29,7.8542,0,3,0 302 | 300,32,7.775,0,3,0 303 | 301,24,15.0458,0,2,0 304 | 302,43,21,1,2,0 305 | 303,24,8.6625,0,3,0 306 | 304,24,7.75,0,3,0 307 | 305,64,26.55,1,1,1 308 | 306,30,151.55,2,1,1 309 | 307,0.83,9.35,1,3,0 310 | 308,55,93.5,1,1,1 311 | 309,45,14.1083,0,3,1 312 | 310,18,8.6625,0,3,0 313 | 311,22,7.225,0,3,0 314 | 312,24,7.575,0,3,0 315 | 313,37,7.75,0,3,0 316 | 314,55,135.6333,0,1,0 317 | 315,17,7.7333,0,3,0 318 | 316,57,146.5208,0,1,1 319 | 317,19,10.5,0,2,0 320 | 318,27,7.8542,0,3,0 321 | 319,22,31.5,0,2,2 322 | 320,26,7.775,0,3,0 323 | 321,25,7.2292,0,3,0 324 | 322,26,13,0,2,0 325 | 323,33,26.55,0,1,0 326 | 324,39,211.3375,0,1,0 327 | 325,23,7.05,0,3,0 328 | 326,12,39,1,2,2 329 | 327,46,79.2,0,1,0 330 | 328,29,26,0,2,1 331 | 329,21,13,0,2,0 332 | 330,48,36.75,2,2,0 333 | 331,39,29.7,0,1,0 334 | 332,24,7.225,0,3,0 335 | 333,19,15.7417,1,3,1 336 | 334,27,7.8958,0,3,0 337 | 335,30,26,0,1,0 338 | 336,32,13,0,2,0 339 | 337,39,7.2292,2,3,0 340 | 338,25,31.5,0,2,0 341 | 339,24,7.2292,0,3,0 342 | 340,18,10.5,0,2,0 343 | 341,32,7.5792,0,3,0 344 | 342,24,69.55,9,3,1 345 | 343,58,512.3292,1,1,0 346 | 344,24,14.5,1,3,1 347 | 345,16,7.65,0,3,0 348 | 346,26,13,0,2,0 349 | 347,38,7.2292,0,3,0 350 | 348,24,13.5,0,2,0 351 | 349,31,21,0,2,0 352 | 350,45,63.3583,1,1,0 353 | 351,25,10.5,0,2,0 354 | 352,18,73.5,0,2,0 355 | 353,49,65,2,2,1 356 | 354,0.17,20.575,2,3,1 357 | 355,50,26,0,1,0 358 | 356,59,51.4792,0,1,2 359 | 357,24,7.8792,0,3,0 360 | 358,24,7.75,0,3,0 361 | 359,30,15.55,0,3,1 362 | 360,14.5,69.55,2,3,8 363 | 361,24,37.0042,1,2,1 364 | 362,31,21,0,2,0 365 | 363,27,8.6625,0,3,0 366 | 364,25,55.4417,0,1,1 367 | 365,24,69.55,9,3,1 368 | 366,24,14.4583,0,3,1 369 | 367,22,39.6875,0,3,0 370 | 368,45,59.4,1,1,0 371 | 369,29,13.8583,0,2,0 372 | 370,21,11.5,0,2,1 373 | 371,31,134.5,0,1,0 374 | 372,49,0,0,1,0 375 | 373,44,13,0,2,0 376 | 374,54,81.8583,1,1,1 377 | 375,45,262.375,0,1,0 378 | 376,22,8.6625,0,3,2 379 | 377,21,11.5,0,2,0 380 | 378,55,50,0,1,0 381 | 379,5,31.3875,2,3,4 382 | 380,24,7.75,0,3,0 383 | 381,26,7.8792,0,3,0 384 | 382,24,14.5,0,3,0 385 | 383,19,16.1,0,3,1 386 | 384,24,12.875,0,2,0 387 | 385,24,65,2,2,1 388 | 386,24,7.775,0,3,0 389 | 387,57,13,0,2,0 390 | 388,21,7.75,0,3,0 391 | 389,6,21.075,1,3,3 392 | 390,23,93.5,0,1,0 393 | 391,51,39.4,1,1,0 394 | 392,13,20.25,2,3,0 395 | 393,47,10.5,0,2,0 396 | 394,29,22.025,1,3,3 397 | 395,18,60,0,1,1 398 | 396,24,7.25,0,3,0 399 | 397,48,79.2,1,1,1 400 | 398,22,7.775,0,3,0 401 | 399,31,7.7333,0,3,0 402 | 400,30,164.8667,0,1,0 403 | 401,38,21,0,2,1 404 | 402,22,59.4,1,1,0 405 | 403,17,47.1,0,1,0 406 | 404,43,27.7208,0,1,1 407 | 405,20,13.8625,0,2,0 408 | 406,23,10.5,0,2,1 409 | 407,50,211.5,1,1,1 410 | 408,24,7.7208,0,3,0 411 | 409,3,13.775,1,3,1 412 | 410,24,7.75,0,3,0 413 | 411,37,90,0,1,1 414 | 412,28,7.775,0,3,0 415 | 
413,24,8.05,0,3,0 416 | 414,39,108.9,0,1,0 417 | 415,38.5,7.25,0,3,0 418 | 416,24,8.05,0,3,0 419 | 417,24,22.3583,1,3,1 420 | -------------------------------------------------------------------------------- /tests/data_for_tests/train.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/data_for_tests/train.h5 -------------------------------------------------------------------------------- /tests/data_for_tests/train.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxeldeRomblay/MLBox/4973443bf019f6770691cf2ab23d75671a331d42/tests/data_for_tests/train.xls -------------------------------------------------------------------------------- /tests/test_categorical_encoder.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.encoding.categorical_encoder module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.encoding.categorical_encoder import Categorical_encoder 11 | 12 | 13 | def test_init_encoder(): 14 | """Test init method of Categorical_encoder class.""" 15 | encoder = Categorical_encoder() 16 | assert encoder.strategy == "label_encoding" 17 | assert not (encoder.verbose) 18 | assert encoder._Categorical_encoder__Lcat == [] 19 | assert encoder._Categorical_encoder__Lnum == [] 20 | assert encoder._Categorical_encoder__Enc == dict() 21 | assert encoder._Categorical_encoder__K == dict() 22 | assert not encoder._Categorical_encoder__weights 23 | assert not encoder._Categorical_encoder__fitOK 24 | 25 | 26 | def test_get_params_encoder(): 27 | """Test get_params method of Categorical_encoder class.""" 28 | encoder = Categorical_encoder() 29 | dict = {'strategy': "label_encoding", 30 | 'verbose': False} 31 | assert encoder.get_params() == dict 32 | 33 | 34 | def test_set_params_encoder(): 35 | """Test set_params method of Categorical_encoder class.""" 36 | encoder = Categorical_encoder() 37 | encoder.set_params(strategy="label_encoding") 38 | assert encoder.strategy == "label_encoding" 39 | encoder.set_params(strategy="dummification") 40 | assert encoder.strategy == "dummification" 41 | encoder.set_params(strategy="random_projection") 42 | assert encoder.strategy == "random_projection" 43 | encoder.set_params(strategy="entity_embedding") 44 | assert encoder.strategy == "entity_embedding" 45 | encoder.set_params(verbose=True) 46 | assert encoder.verbose 47 | encoder.set_params(verbose=False) 48 | assert not encoder.verbose 49 | with pytest.warns(UserWarning) as record: 50 | encoder.set_params(_Categorical_encoder__Lcat=[]) 51 | assert len(record) == 1 52 | 53 | 54 | def test_fit_encoder(): 55 | """Test method fit of Categorical_encoder class.""" 56 | df = pd.read_csv("data_for_tests/train.csv") 57 | encoder = Categorical_encoder(strategy="wrong_strategy") 58 | with pytest.raises(ValueError): 59 | encoder.fit(df, df["Survived"]) 60 | encoder.set_params(strategy="label_encoding") 61 | encoder.fit(df, df["Survived"]) 62 | assert encoder._Categorical_encoder__fitOK 63 | encoder.set_params(strategy="dummification") 64 | encoder.fit(df, df["Survived"]) 65 | assert encoder._Categorical_encoder__fitOK 66 | encoder.set_params(strategy="random_projection") 67 | encoder.fit(df, df["Survived"]) 68 | assert 
encoder._Categorical_encoder__fitOK 69 | encoder.set_params(strategy="entity_embedding") 70 | encoder.fit(df, df["Survived"]) 71 | assert encoder._Categorical_encoder__fitOK 72 | 73 | 74 | def test_transform_encoder(): 75 | """Test transform method of Categorical_encoder class.""" 76 | df = pd.read_csv("data_for_tests/train.csv") 77 | encoder = Categorical_encoder() 78 | with pytest.raises(ValueError): 79 | encoder.transform(df) 80 | encoder.fit(df, df["Survived"]) 81 | df_encoded = encoder.transform(df) 82 | assert (df.columns == df_encoded.columns).all() 83 | encoder.set_params(strategy="dummification") 84 | encoder.fit(df, df["Survived"]) 85 | df_encoded = encoder.transform(df) 86 | assert (type(df_encoded) == pd.SparseDataFrame) | (type(df_encoded) == pd.DataFrame) 87 | encoder.set_params(strategy="random_projection") 88 | encoder.fit(df, df["Survived"]) 89 | df_encoded = encoder.transform(df) 90 | assert type(df_encoded) == pd.DataFrame 91 | encoder.set_params(strategy="entity_embedding") 92 | encoder.fit(df, df["Survived"]) 93 | df_encoded = encoder.transform(df) 94 | assert type(df_encoded) == pd.DataFrame 95 | 96 | -------------------------------------------------------------------------------- /tests/test_classification_feature_selector.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.classification.feature_selector module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.model.classification.feature_selector import Clf_feature_selector 11 | 12 | 13 | def test_init_Clf_feature_selector(): 14 | """Test init method of Clf_feature_selector class.""" 15 | feature_selector = Clf_feature_selector() 16 | assert feature_selector.strategy == "l1" 17 | assert feature_selector.threshold == 0.3 18 | assert not feature_selector._Clf_feature_selector__fitOK 19 | assert feature_selector._Clf_feature_selector__to_discard == [] 20 | 21 | 22 | def test_get_params_Clf_feature_selector(): 23 | """Test get_params method of Clf_feature_selector class.""" 24 | feature_selector = Clf_feature_selector() 25 | dict = {'strategy': "l1", 26 | 'threshold': 0.3} 27 | assert feature_selector.get_params() == dict 28 | 29 | 30 | def test_set_params_Clf_feature_selector(): 31 | """Test set_params method of Clf_feature_selector class.""" 32 | feature_selector = Clf_feature_selector() 33 | feature_selector.set_params(strategy="variance") 34 | assert feature_selector.strategy == "variance" 35 | feature_selector.set_params(threshold=0.2) 36 | assert feature_selector.threshold == 0.2 37 | with pytest.warns(UserWarning) as record: 38 | feature_selector.set_params(wrong_strategy="wrong_strategy") 39 | assert len(record) == 1 40 | 41 | 42 | def test_fit_Clf_feature_selector(): 43 | """Test fit method of Clf_feature_selector class.""" 44 | feature_selector = Clf_feature_selector() 45 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 46 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 47 | with pytest.raises(ValueError): 48 | feature_selector.fit(None, y_train) 49 | with pytest.raises(ValueError): 50 | feature_selector.fit(df_train, None) 51 | feature_selector.fit(df_train, y_train) 52 | assert feature_selector._Clf_feature_selector__fitOK 53 | feature_selector.set_params(strategy="variance") 54 | feature_selector.fit(df_train, y_train) 55 | assert 
feature_selector._Clf_feature_selector__fitOK 56 | feature_selector.set_params(strategy="rf_feature_importance") 57 | feature_selector.fit(df_train, y_train) 58 | assert feature_selector._Clf_feature_selector__fitOK 59 | feature_selector.set_params(strategy="wrong_strategy") 60 | with pytest.raises(ValueError): 61 | feature_selector.fit(df_train, y_train) 62 | 63 | 64 | def test_transform_Clf_feature_selector(): 65 | """Test transform method of Clf_feature_selector class.""" 66 | feature_selector = Clf_feature_selector(threshold=0) 67 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 68 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 69 | with pytest.raises(ValueError): 70 | feature_selector.transform(df_train) 71 | feature_selector.fit(df_train, y_train) 72 | with pytest.raises(ValueError): 73 | feature_selector.transform(None) 74 | df_transformed = feature_selector.transform(df_train) 75 | assert (df_transformed.columns == df_train.columns).all() 76 | 77 | 78 | def test_fit_transform_Clf_feature_selector(): 79 | """Test fit_transform method of Clf_feature_selector class.""" 80 | feature_selector = Clf_feature_selector(threshold=0) 81 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 82 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 83 | df_transformed = feature_selector.fit_transform(df_train, y_train) 84 | assert (df_transformed.columns == df_train.columns).all() 85 | -------------------------------------------------------------------------------- /tests/test_classifier.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.classification.classifier module.""" 7 | import pytest 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from mlbox.model.classification.classifier import Classifier 12 | from lightgbm import LGBMClassifier 13 | 14 | 15 | def test_init_classifier(): 16 | """Test init method of Classifier class.""" 17 | classifier = Classifier() 18 | assert classifier._Classifier__strategy == "LightGBM" 19 | assert classifier._Classifier__classif_params == {} 20 | assert classifier._Classifier__classifier 21 | assert not classifier._Classifier__col 22 | assert not classifier._Classifier__fitOK 23 | 24 | 25 | def test_get_params_classifier(): 26 | """Test get_params method of Classifier class.""" 27 | classifier = Classifier() 28 | params = classifier.get_params() 29 | assert params == {'strategy': "LightGBM"} 30 | assert not classifier._Classifier__classif_params 31 | 32 | 33 | def test_set_params_classifier(): 34 | """Test set_params method of Classifier class.""" 35 | classifier = Classifier() 36 | classifier.set_params(strategy="LightGBM") 37 | assert classifier._Classifier__strategy == "LightGBM" 38 | classifier.set_params(strategy="RandomForest") 39 | assert classifier._Classifier__strategy == "RandomForest" 40 | classifier.set_params(strategy="ExtraTrees") 41 | assert classifier._Classifier__strategy == "ExtraTrees" 42 | classifier.set_params(strategy="RandomForest") 43 | assert classifier._Classifier__strategy == "RandomForest" 44 | classifier.set_params(strategy="Tree") 45 | assert classifier._Classifier__strategy == "Tree" 46 | classifier.set_params(strategy="AdaBoost") 47 | assert classifier._Classifier__strategy == "AdaBoost" 48 | classifier.set_params(strategy="Linear") 49 | assert
classifier._Classifier__strategy == "Linear" 50 | with pytest.warns(UserWarning) as record: 51 | classifier.set_params(wrong_strategy="wrong_strategy") 52 | assert len(record) == 1 53 | 54 | 55 | def test_set_classifier(): 56 | """Test set method of Classifier class.""" 57 | classifier = Classifier() 58 | with pytest.raises(ValueError): 59 | classifier._Classifier__set_classifier("wrong_strategy") 60 | 61 | 62 | def test_fit_classifier(): 63 | """Test fit method of Classifier class.""" 64 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 65 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 66 | classifier = Classifier() 67 | classifier.fit(df_train, y_train) 68 | assert np.all(classifier._Classifier__col == df_train.columns) 69 | assert classifier._Classifier__fitOK 70 | 71 | 72 | def test_feature_importances_classifier(): 73 | """Test feature_importances method of Classifier class.""" 74 | classifier = Classifier() 75 | with pytest.raises(ValueError): 76 | classifier.feature_importances() 77 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 78 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 79 | classifier.set_params(strategy="LightGBM") 80 | classifier.fit(df_train, y_train) 81 | importance = classifier.feature_importances() 82 | assert importance != {} 83 | classifier.set_params(strategy="Linear") 84 | classifier.fit(df_train, y_train) 85 | importance = classifier.feature_importances() 86 | assert importance != {} 87 | classifier.set_params(strategy="RandomForest") 88 | classifier.fit(df_train, y_train) 89 | importance = classifier.feature_importances() 90 | assert importance != {} 91 | classifier.set_params(strategy="AdaBoost") 92 | classifier.fit(df_train, y_train) 93 | importance = classifier.feature_importances() 94 | assert importance != {} 95 | classifier.set_params(strategy="Bagging") 96 | classifier.fit(df_train, y_train) 97 | importance = classifier.feature_importances() 98 | assert importance != {} 99 | 100 | 101 | def test_predict_classifier(): 102 | """Test predict method of Classifier class.""" 103 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 104 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 105 | classifier = Classifier() 106 | with pytest.raises(ValueError): 107 | classifier.predict(df_train) 108 | classifier.fit(df_train, y_train) 109 | with pytest.raises(ValueError): 110 | classifier.predict(None) 111 | assert len(classifier.predict(df_train)) > 0 112 | 113 | 114 | def test_predict_log_proba_classifier(): 115 | """Test predict_log_proba method of Classifier class.""" 116 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 117 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 118 | classifier = Classifier(strategy="Linear") 119 | with pytest.raises(ValueError): 120 | classifier.predict_log_proba(df_train) 121 | classifier.fit(df_train, y_train) 122 | with pytest.raises(ValueError): 123 | classifier.predict_log_proba(None) 124 | assert len(classifier.predict_log_proba(df_train)) > 0 125 | 126 | 127 | def test_predict_proba_classifier(): 128 | """Test predict_proba method of Classifier class.""" 129 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 130 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 131 | classifier = Classifier() 132 | with pytest.raises(ValueError): 133 | classifier.predict_proba(df_train) 134 | classifier.fit(df_train, y_train) 135 | with pytest.raises(ValueError): 136 | 
classifier.predict_proba(None) 137 | assert len(classifier.predict_proba(df_train)) > 0 138 | 139 | 140 | def test_score_classifier(): 141 | """Test score method of Classifier class.""" 142 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 143 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 144 | classifier = Classifier() 145 | with pytest.raises(ValueError): 146 | classifier.score(df_train, y_train) 147 | classifier.fit(df_train, y_train) 148 | with pytest.raises(ValueError): 149 | classifier.score(None, y_train) 150 | with pytest.raises(ValueError): 151 | classifier.score(df_train, None) 152 | assert classifier.score(df_train, y_train) > 0 153 | 154 | 155 | def test_get_estimator_classifier(): 156 | """Test get_estimator method of Classifier class.""" 157 | classifier = Classifier() 158 | estimator = classifier.get_estimator() 159 | assert isinstance(estimator, type(LGBMClassifier())) 160 | -------------------------------------------------------------------------------- /tests/test_drift_estimator.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.drift.drift_estimator module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.preprocessing.drift.drift_estimator import DriftEstimator 11 | 12 | 13 | def test_init_drift_estimator(): 14 | """Test init method of DriftEstimator class.""" 15 | drift_estimator = DriftEstimator() 16 | assert drift_estimator.n_folds == 2 17 | assert drift_estimator.stratify 18 | assert drift_estimator.random_state == 1 19 | assert not drift_estimator._DriftEstimator__cv 20 | assert not drift_estimator._DriftEstimator__pred 21 | assert not drift_estimator._DriftEstimator__target 22 | assert not drift_estimator._DriftEstimator__fitOK 23 | 24 | 25 | def test_get_params_drift_estimator(): 26 | """Test get_params method of DriftEstimator class.""" 27 | drift_estimator = DriftEstimator() 28 | dict = {'estimator': drift_estimator.estimator, 29 | 'n_folds': 2, 30 | 'stratify': True, 31 | 'random_state': 1} 32 | assert drift_estimator.get_params() == dict 33 | 34 | 35 | def test_set_params_drift_estimator(): 36 | """Test set_params method of DriftEstimator class.""" 37 | drift_estimator = DriftEstimator() 38 | dict = {'estimator': drift_estimator.estimator, 39 | 'n_folds': 3, 40 | 'stratify': False, 41 | 'random_state': 2} 42 | drift_estimator.set_params(**dict) 43 | assert drift_estimator.get_params() == dict 44 | 45 | 46 | def test_fit_drift_estimator(): 47 | """Test fit method of DriftEstimator class.""" 48 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 49 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 50 | drift_estimator = DriftEstimator() 51 | drift_estimator.fit(df_train, df_test) 52 | assert drift_estimator._DriftEstimator__fitOK 53 | 54 | 55 | def test_score_drift_estimator(): 56 | """Test score method of DriftEstimator class.""" 57 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 58 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 59 | drift_estimator = DriftEstimator() 60 | with pytest.raises(ValueError): 61 | drift_estimator.score() 62 | drift_estimator.fit(df_train, df_test) 63 | assert drift_estimator.score() > 0 64 | 65 | 66 | def test_predict_drift_estimator(): 67 | """Test predict method of DriftEstimator class.""" 68 | df_train = 
pd.read_csv("data_for_tests/clean_train.csv") 69 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 70 | drift_estimator = DriftEstimator() 71 | with pytest.raises(ValueError): 72 | drift_estimator.predict() 73 | drift_estimator.fit(df_train, df_test) 74 | results = drift_estimator.predict() 75 | assert len(results) == 1309 76 | -------------------------------------------------------------------------------- /tests/test_drift_threshold.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.drift module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.preprocessing.drift import DriftThreshold 11 | from mlbox.preprocessing.drift import sync_fit 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | 15 | 16 | def test_init_drift_threshold(): 17 | """Test init method of DriftThreshold class.""" 18 | drift_threshold = DriftThreshold() 19 | assert drift_threshold.threshold == 0.6 20 | assert drift_threshold.subsample == 1. 21 | assert isinstance(drift_threshold.estimator, 22 | type(DecisionTreeClassifier())) 23 | assert drift_threshold.n_folds == 2 24 | assert drift_threshold.stratify 25 | assert drift_threshold.random_state == 1 26 | assert drift_threshold.n_jobs == -1 27 | assert not drift_threshold._DriftThreshold__fitOK 28 | 29 | 30 | def test_get_params_drift_threshold(): 31 | """Test get_params method of DriftThreshold class.""" 32 | drift_threshold = DriftThreshold() 33 | dict = {'threshold': 0.6, 34 | 'subsample': 1., 35 | 'n_folds': 2, 36 | 'stratify': True, 37 | 'random_state': 1, 38 | 'n_jobs': -1} 39 | dict_get_params = drift_threshold.get_params() 40 | assert dict_get_params["threshold"] == dict["threshold"] 41 | assert dict_get_params["subsample"] == dict["subsample"] 42 | assert dict_get_params["n_folds"] == dict["n_folds"] 43 | assert dict_get_params["stratify"] == dict["stratify"] 44 | assert dict_get_params["random_state"] == dict["random_state"] 45 | assert dict_get_params["n_jobs"] == dict["n_jobs"] 46 | 47 | 48 | def test_set_params_drift_threshold(): 49 | """Test set_params method of DriftThreshold class.""" 50 | drift_threshold = DriftThreshold() 51 | dict = {'threshold': 0.6, 52 | 'subsample': 1., 53 | 'estimator': DecisionTreeClassifier(max_depth=6), 54 | 'n_folds': 2, 55 | 'stratify': True, 56 | 'random_state': 1, 57 | 'n_jobs': -1} 58 | drift_threshold.set_params(**dict) 59 | dict_get_params = drift_threshold.get_params() 60 | assert dict_get_params["threshold"] == dict["threshold"] 61 | assert dict_get_params["subsample"] == dict["subsample"] 62 | assert dict_get_params["n_folds"] == dict["n_folds"] 63 | assert dict_get_params["stratify"] == dict["stratify"] 64 | assert dict_get_params["random_state"] == dict["random_state"] 65 | assert dict_get_params["n_jobs"] == dict["n_jobs"] 66 | 67 | 68 | def test_fit_drift_threshold(): 69 | """Test fit method of DriftThreshold class.""" 70 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 71 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 72 | drift_threshold = DriftThreshold() 73 | drift_threshold.fit(df_train, df_test) 74 | assert drift_threshold._DriftThreshold__fitOK 75 | 76 | 77 | def test_transform_drift_threshold(): 78 | """Test transform method of DriftThreshold class.""" 79 | df_train = 
pd.read_csv("data_for_tests/clean_train.csv") 80 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 81 | drift_threshold = DriftThreshold() 82 | with pytest.raises(ValueError): 83 | drift_threshold.transform(df_train) 84 | drift_threshold.fit(df_train, df_test) 85 | df_transformed = drift_threshold.transform(df_train) 86 | assert (df_train.columns == df_transformed.columns).all() 87 | 88 | 89 | def test_get_support_drift_threshold(): 90 | """Test get_support method of DriftThreshold class.""" 91 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 92 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 93 | drift_threshold = DriftThreshold() 94 | with pytest.raises(ValueError): 95 | drift_threshold.get_support() 96 | drift_threshold.fit(df_train, df_test) 97 | keep_list = drift_threshold.get_support() 98 | drop_list = drift_threshold.get_support(complement=True) 99 | for name in ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']: 100 | assert (name in keep_list) 101 | assert not drop_list 102 | 103 | 104 | def test_drifts_drift_threshold(): 105 | """Test drifts method of DriftThreshold class.""" 106 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 107 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 108 | drift_threshold = DriftThreshold() 109 | with pytest.raises(ValueError): 110 | drift_threshold.drifts() 111 | drift_threshold.fit(df_train, df_test) 112 | drifts = drift_threshold.drifts() 113 | for name in ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']: 114 | assert (name in list(drifts.keys())) 115 | 116 | 117 | def test_sync_fit_drift_threshold(): 118 | """Test method sync_fit of drift_threshold module.""" 119 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 120 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 121 | estimator = RandomForestClassifier(n_estimators=50, 122 | n_jobs=-1, 123 | max_features=1., 124 | min_samples_leaf=5, 125 | max_depth=5) 126 | 127 | score = sync_fit(df_train, df_test, estimator) 128 | assert 0 <= score 129 | -------------------------------------------------------------------------------- /tests/test_drift_thresholder.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.drift_thresholder module.""" 7 | import pytest 8 | 9 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder 10 | from mlbox.preprocessing.reader import Reader 11 | 12 | 13 | def test_init_drift_thresholder(): 14 | """Test init method of Drift_thresholder class.""" 15 | drift_thresholder = Drift_thresholder() 16 | assert drift_thresholder.threshold == 0.6 17 | assert not drift_thresholder.inplace 18 | assert drift_thresholder.verbose 19 | assert drift_thresholder.to_path == "save" 20 | assert drift_thresholder._Drift_thresholder__Ddrifts == {} 21 | assert not drift_thresholder._Drift_thresholder__fitOK 22 | 23 | 24 | def test_fit_transform(): 25 | """Test fit transform method of Drift_thresholder class.""" 26 | drift_thresholder = Drift_thresholder() 27 | reader = Reader(sep=",") 28 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"], 29 | target_name="Survived") 30 | drift_thresholder.fit_transform(dict) 31 | assert not drift_thresholder._Drift_thresholder__fitOK 32 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 33 | "data_for_tests/test.csv"], 34 | target_name="Survived") 35 | 
drift_thresholder.fit_transform(dict) 36 | assert drift_thresholder._Drift_thresholder__fitOK 37 | dict = reader.train_test_split(Lpath=["data_for_tests/inplace_train.csv", 38 | "data_for_tests/inplace_test.csv"], 39 | target_name="Survived") 40 | drift_thresholder.inplace = True 41 | drift_thresholder.fit_transform(dict) 42 | assert drift_thresholder._Drift_thresholder__fitOK 43 | 44 | 45 | def test_drifts(): 46 | """Test drifts method of Drift_thresholder class.""" 47 | drift_thresholder = Drift_thresholder() 48 | with pytest.raises(ValueError): 49 | drift_thresholder.drifts() 50 | reader = Reader(sep=",") 51 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 52 | "data_for_tests/test.csv"], 53 | target_name="Survived") 54 | drift_thresholder.fit_transform(dict) 55 | drifts = drift_thresholder.drifts() 56 | assert drifts != {} 57 | -------------------------------------------------------------------------------- /tests/test_na_encoder.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.encoding.na_encoder module.""" 7 | import pytest 8 | import pandas as pd 9 | 10 | from mlbox.encoding.na_encoder import NA_encoder 11 | 12 | 13 | def test_init_NA_encoder(): 14 | """Test init method of NA_encoder class.""" 15 | encoder = NA_encoder() 16 | assert encoder.numerical_strategy == "mean" 17 | assert encoder.categorical_strategy == "" 18 | assert encoder._NA_encoder__Lcat == [] 19 | assert encoder._NA_encoder__Lnum == [] 20 | assert not encoder._NA_encoder__imp 21 | assert encoder._NA_encoder__mode == dict() 22 | assert not encoder._NA_encoder__fitOK 23 | 24 | 25 | def test_get_params_NA_encoder(): 26 | """Test get_params method of NA_encoder class.""" 27 | encoder = NA_encoder() 28 | dict = {'numerical_strategy': "mean", 29 | 'categorical_strategy': ""} 30 | assert encoder.get_params() == dict 31 | 32 | 33 | def test_set_params_NA_encoder(): 34 | """Test set_params method of NA_encoder class.""" 35 | encoder = NA_encoder() 36 | 37 | encoder.set_params(numerical_strategy="mean") 38 | assert encoder.numerical_strategy == "mean" 39 | encoder.set_params(numerical_strategy="median") 40 | assert encoder.numerical_strategy == "median" 41 | encoder.set_params(numerical_strategy="most_frequent") 42 | assert encoder.numerical_strategy == "most_frequent" 43 | encoder.set_params(numerical_strategy=3.0) 44 | assert encoder.numerical_strategy == 3.0 45 | 46 | encoder.set_params(categorical_strategy="") 47 | assert encoder.categorical_strategy == "" 48 | encoder.set_params(categorical_strategy="most_frequent") 49 | assert encoder.categorical_strategy == "most_frequent" 50 | encoder.set_params(categorical_strategy="string_test") 51 | assert encoder.categorical_strategy == "string_test" 52 | 53 | with pytest.warns(UserWarning) as record: 54 | encoder.set_params(_Categorical_encoder__Lcat=[]) 55 | assert len(record) == 1 56 | 57 | 58 | def test_fit_NA_encoder(): 59 | """Test fit method of NA_encoder class.""" 60 | df = pd.read_csv("data_for_tests/train.csv") 61 | 62 | encoder = NA_encoder(numerical_strategy="wrong_strategy") 63 | with pytest.raises(ValueError): 64 | encoder.fit(df, df["Survived"]) 65 | encoder.set_params(numerical_strategy="mean") 66 | encoder.fit(df, df["Survived"]) 67 | assert encoder._NA_encoder__fitOK 68 | encoder.set_params(numerical_strategy="median") 69 | encoder.fit(df, df["Survived"]) 
70 | assert encoder._NA_encoder__fitOK 71 | encoder.set_params(numerical_strategy="most_frequent") 72 | encoder.fit(df, df["Survived"]) 73 | assert encoder._NA_encoder__fitOK 74 | encoder.set_params(numerical_strategy=3.0) 75 | encoder.fit(df, df["Survived"]) 76 | assert encoder._NA_encoder__fitOK 77 | 78 | encoder = NA_encoder(categorical_strategy=2) 79 | with pytest.raises(ValueError): 80 | encoder.fit(df, df["Survived"]) 81 | encoder.set_params(categorical_strategy="") 82 | encoder.fit(df, df["Survived"]) 83 | assert encoder._NA_encoder__fitOK 84 | encoder.set_params(categorical_strategy="most_frequent") 85 | encoder.fit(df, df["Survived"]) 86 | 87 | 88 | def test_transform_NA_encoder(): 89 | """Test transform method of NA_encoder class.""" 90 | df = pd.read_csv("data_for_tests/train.csv") 91 | encoder = NA_encoder() 92 | with pytest.raises(ValueError): 93 | encoder.transform(df) 94 | encoder.fit(df, df["Survived"]) 95 | df_encoded = encoder.transform(df) 96 | assert (df.columns == df_encoded.columns).all() 97 | -------------------------------------------------------------------------------- /tests/test_optimiser.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.optimisation.optimiser module.""" 7 | import pytest 8 | import numpy as np 9 | 10 | from mlbox.optimisation.optimiser import Optimiser 11 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder 12 | from mlbox.preprocessing.reader import Reader 13 | from mlbox.optimisation import make_scorer 14 | 15 | 16 | def test_init_optimiser(): 17 | """Test init method of Optimiser class.""" 18 | with pytest.warns(UserWarning) as record: 19 | optimiser = Optimiser() 20 | assert len(record) == 1 21 | assert not optimiser.scoring 22 | assert optimiser.n_folds == 2 23 | assert optimiser.random_state == 1 24 | assert optimiser.to_path == "save" 25 | assert optimiser.verbose 26 | 27 | 28 | def test_get_params_optimiser(): 29 | """Test get_params method of Optimiser class.""" 30 | with pytest.warns(UserWarning) as record: 31 | optimiser = Optimiser() 32 | assert len(record) == 1 33 | dict = {'scoring': None, 34 | 'n_folds': 2, 35 | 'random_state': 1, 36 | 'to_path': "save", 37 | 'verbose': True} 38 | assert optimiser.get_params() == dict 39 | 40 | 41 | def test_set_params_optimiser(): 42 | """Test set_params method of Optimiser class.""" 43 | with pytest.warns(UserWarning) as record: 44 | optimiser = Optimiser() 45 | assert len(record) == 1 46 | optimiser.set_params(scoring='accuracy') 47 | assert optimiser.scoring == 'accuracy' 48 | optimiser.set_params(n_folds=3) 49 | assert optimiser.n_folds == 3 50 | optimiser.set_params(random_state=2) 51 | assert optimiser.random_state == 2 52 | optimiser.set_params(to_path="name") 53 | assert optimiser.to_path == "name" 54 | optimiser.set_params(verbose=False) 55 | assert not optimiser.verbose 56 | with pytest.warns(UserWarning) as record: 57 | optimiser.set_params(wrong_key=3) 58 | assert len(record) == 1 59 | 60 | 61 | def test_evaluate_classification_optimiser(): 62 | """Test evaluate method of Optimiser class for classification.""" 63 | reader = Reader(sep=",") 64 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 65 | "data_for_tests/test.csv"], 66 | target_name="Survived") 67 | drift_thresholder = Drift_thresholder() 68 | drift_thresholder = drift_thresholder.fit_transform(dict) 69 | 
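    # Optimiser() emits a single UserWarning at construction (captured via
    # `record` below); evaluate(None, dict) then cross-validates the default
    # pipeline, so the score only has to be a real number, or to lie within
    # [0, 1] when scoring="roc_auc".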
70 | with pytest.warns(UserWarning) as record: 71 | opt = Optimiser(scoring=None, n_folds=3) 72 | assert len(record) == 1 73 | score = opt.evaluate(None, dict) 74 | assert -np.Inf <= score 75 | 76 | with pytest.warns(UserWarning) as record: 77 | opt = Optimiser(scoring="roc_auc", n_folds=3) 78 | assert len(record) == 1 79 | score = opt.evaluate(None, dict) 80 | assert 0. <= score <= 1. 81 | 82 | with pytest.warns(UserWarning) as record: 83 | opt = Optimiser(scoring="wrong_scoring", n_folds=3) 84 | assert len(record) == 1 85 | with pytest.warns(UserWarning) as record: 86 | score = opt.evaluate(None, dict) 87 | assert opt.scoring == "neg_log_loss" 88 | 89 | 90 | def test_evaluate_regression_optimiser(): 91 | """Test evaluate method of Optimiser class for regression.""" 92 | reader = Reader(sep=",") 93 | dict = reader.train_test_split(Lpath=["data_for_tests/train_regression.csv", 94 | "data_for_tests/test_regression.csv"], 95 | target_name="SalePrice") 96 | drift_thresholder = Drift_thresholder() 97 | drift_thresholder = drift_thresholder.fit_transform(dict) 98 | 99 | mape = make_scorer(lambda y_true, 100 | y_pred: 100*np.sum( 101 | np.abs(y_true-y_pred)/y_true 102 | )/len(y_true), 103 | greater_is_better=False, 104 | needs_proba=False) 105 | with pytest.warns(UserWarning) as record: 106 | opt = Optimiser(scoring=mape, n_folds=3) 107 | assert len(record) == 1 108 | score = opt.evaluate(None, dict) 109 | assert -np.Inf <= score 110 | 111 | with pytest.warns(UserWarning) as record: 112 | opt = Optimiser(scoring=None, n_folds=3) 113 | assert len(record) == 1 114 | score = opt.evaluate(None, dict) 115 | assert -np.Inf <= score 116 | 117 | with pytest.warns(UserWarning) as record: 118 | opt = Optimiser(scoring="wrong_scoring", n_folds=3) 119 | assert len(record) == 1 120 | with pytest.warns(UserWarning) as record: 121 | score = opt.evaluate(None, dict) 122 | assert -np.Inf <= score 123 | 124 | 125 | def test_evaluate_and_optimise_classification(): 126 | """Test evaluate and optimise methods of Optimiser class.""" 127 | reader = Reader(sep=",") 128 | 129 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 130 | "data_for_tests/test.csv"], 131 | target_name="Survived") 132 | drift_thresholder = Drift_thresholder() 133 | drift_thresholder = drift_thresholder.fit_transform(dict) 134 | 135 | with pytest.warns(UserWarning) as record: 136 | opt = Optimiser(scoring='accuracy', n_folds=3) 137 | assert len(record) == 1 138 | dict_error = dict.copy() 139 | dict_error["target"] = dict_error["target"].astype(str) 140 | with pytest.raises(ValueError): 141 | score = opt.evaluate(None, dict_error) 142 | 143 | with pytest.warns(UserWarning) as record: 144 | opt = Optimiser(scoring='accuracy', n_folds=3) 145 | assert len(record) == 1 146 | score = opt.evaluate(None, dict) 147 | assert 0. <= score <= 1. 
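    # Search-space keys follow the "<step>__<parameter>" convention seen in
    # these tests: ne = NA encoder, ce = categorical encoder, fs = feature
    # selector, est = estimator; "search" names the sampling rule ("choice"
    # or "uniform") applied over the candidate values in "space".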
148 | 149 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 150 | 'ce__strategy': {"search": "choice", 151 | "space": ["label_encoding"]}, 152 | 'fs__threshold': {"search": "uniform", 153 | "space": [0.01, 0.3]}, 154 | 'est__max_depth': {"search": "choice", 155 | "space": [3, 4, 5, 6, 7]} 156 | 157 | } 158 | 159 | best = opt.optimise(space, dict, 1) 160 | assert type(best) == type(dict) 161 | -------------------------------------------------------------------------------- /tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.prediction.predictor module.""" 7 | import sys 8 | import pytest 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from mlbox.prediction.predictor import Predictor 13 | from mlbox.optimisation.optimiser import Optimiser 14 | from mlbox.preprocessing.drift_thresholder import Drift_thresholder 15 | from mlbox.preprocessing.reader import Reader 16 | from mlbox.optimisation import make_scorer 17 | 18 | if sys.version_info[0] >= 3: 19 | from unittest.mock import patch 20 | 21 | 22 | set_backend = "import matplotlib\nmatplotlib.use('Agg')\n" 23 | 24 | 25 | def test_init_predictor(): 26 | """Test init method of Predictor class.""" 27 | predictor = Predictor() 28 | assert predictor.to_path == "save" 29 | assert predictor.verbose 30 | 31 | 32 | def test_get_params_predictor(): 33 | """Test get_params method of Predictor class.""" 34 | predictor = Predictor() 35 | dict = {'to_path': "save", 36 | 'verbose': True} 37 | assert predictor.get_params() == dict 38 | 39 | 40 | def test_set_params_predictor(): 41 | """Test set_params method of Predictor class.""" 42 | predictor = Predictor() 43 | predictor.set_params(to_path="name") 44 | assert predictor.to_path == "name" 45 | predictor.set_params(verbose=False) 46 | assert not predictor.verbose 47 | with pytest.warns(UserWarning) as record: 48 | predictor.set_params(wrong_key=3) 49 | assert len(record) == 1 50 | 51 | 52 | def test_fit_predict_predictor_classification(): 53 | """Test fit_predict method of Predictor class for classification.""" 54 | reader = Reader(sep=",") 55 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv", 56 | "data_for_tests/test.csv"], 57 | target_name="Survived") 58 | drift_thresholder = Drift_thresholder() 59 | drift_thresholder = drift_thresholder.fit_transform(dict) 60 | 61 | with pytest.warns(UserWarning) as record: 62 | opt = Optimiser(scoring='accuracy', n_folds=3) 63 | assert len(record) == 1 64 | 65 | space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 66 | 'ce__strategy': {"search": "choice", 67 | "space": ["entity_embedding"]}, 68 | 'fs__threshold': {"search": "uniform", 69 | "space": [0.01, 0.3]}, 70 | 'est__max_depth': {"search": "choice", 71 | "space": [3, 4, 5, 6, 7]} 72 | 73 | } 74 | 75 | optimal_hyper_parameters = opt.optimise(space, dict, 1) 76 | 77 | predictor = Predictor(verbose=False) 78 | predictor.fit_predict(optimal_hyper_parameters, dict) 79 | pred_df = pd.read_csv("save/Survived_predictions.csv") 80 | assert np.all(list(pred_df.columns) == ['Unnamed: 0', 81 | '0.0', 82 | '1.0', 83 | 'Survived_predicted']) 84 | assert np.shape(pred_df) == (418, 4) 85 | 86 | 87 | if sys.version_info[0] >= 3: 88 | @patch('matplotlib.pyplot.show') 89 | def test_fit_predict_predictor_regression(mock_show): 90 | """Test fit_predict method 
of Predictor class for regression.""" 91 | rd = Reader(sep=',') 92 | dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv", 93 | "data_for_tests/test_regression.csv"], 94 | target_name="SalePrice") 95 | 96 | drift_thresholder = Drift_thresholder() 97 | df = drift_thresholder.fit_transform(dt) 98 | 99 | mape = make_scorer(lambda y_true, 100 | y_pred: 100*np.sum( 101 | np.abs(y_true-y_pred)/y_true 102 | )/len(y_true), 103 | greater_is_better=False, 104 | needs_proba=False) 105 | opt = Optimiser(scoring=mape, n_folds=3) 106 | 107 | opt.evaluate(None, df) 108 | 109 | space = { 110 | 'ne__numerical_strategy': {"search": "choice", 111 | "space": [0]}, 112 | 'ce__strategy': {"search": "choice", 113 | "space": ["random_projection"]}, 114 | 'fs__threshold': {"search": "uniform", 115 | "space": [0.01, 0.3]}, 116 | 'est__max_depth': {"search": "choice", 117 | "space": [3, 4, 5, 6, 7]} 118 | 119 | } 120 | 121 | best = opt.optimise(space, df, 1) 122 | 123 | prd = Predictor(verbose=True) 124 | prd.fit_predict(best, df) 125 | pred_df = pd.read_csv("save/SalePrice_predictions.csv") 126 | assert np.all(list(pred_df.columns) == ['Unnamed: 0', 127 | 'SalePrice_predicted']) 128 | assert np.shape(pred_df) == (1459, 2) 129 | 130 | else: 131 | def test_fit_predict_predictor_regression(): 132 | """Test fit_predict method of Predictor class for regression.""" 133 | rd = Reader(sep=',') 134 | dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv", 135 | "data_for_tests/test_regression.csv"], 136 | target_name="SalePrice") 137 | 138 | drift_thresholder = Drift_thresholder() 139 | df = drift_thresholder.fit_transform(dt) 140 | 141 | mape = make_scorer(lambda y_true, 142 | y_pred: 100*np.sum( 143 | np.abs(y_true-y_pred)/y_true 144 | )/len(y_true), 145 | greater_is_better=False, 146 | needs_proba=False) 147 | opt = Optimiser(scoring=mape, n_folds=3) 148 | 149 | opt.evaluate(None, df) 150 | 151 | space = { 152 | 'ne__numerical_strategy': {"search": "choice", 153 | "space": [0]}, 154 | 'ce__strategy': {"search": "choice", 155 | "space": ["label_encoding", 156 | "random_projection", 157 | "entity_embedding"]}, 158 | 'fs__threshold': {"search": "uniform", 159 | "space": [0.01, 0.3]}, 160 | 'est__max_depth': {"search": "choice", 161 | "space": [3, 4, 5, 6, 7]} 162 | 163 | } 164 | 165 | best = opt.optimise(space, df, 1) 166 | 167 | prd = Predictor(verbose=False) 168 | prd.fit_predict(best, df) 169 | pred_df = pd.read_csv("save/SalePrice_predictions.csv") 170 | assert np.all(list(pred_df.columns) == ['Unnamed: 0', 171 | 'SalePrice_predicted']) 172 | assert np.shape(pred_df) == (1459, 2) 173 | -------------------------------------------------------------------------------- /tests/test_reader.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.preprocessing.reader module.""" 7 | import sys 8 | 9 | import pytest 10 | import pandas as pd 11 | import numpy as np 12 | 13 | 14 | from mlbox.preprocessing.reader import convert_list 15 | from mlbox.preprocessing.reader import convert_float_and_dates 16 | from mlbox.preprocessing.reader import Reader 17 | 18 | 19 | def test_init_reader(): 20 | """Test init method of Reader class.""" 21 | reader = Reader() 22 | assert not reader.sep 23 | assert reader.header == 0 24 | assert not reader.to_hdf5 25 | assert reader.to_path == "save" 26 | assert reader.verbose 27 | 28 | 29 | 
def test_clean_reader(): 30 | """Test clean method of Reader class.""" 31 | reader = Reader() 32 | with pytest.raises(ValueError): 33 | reader.clean(path=None, drop_duplicate=False) 34 | with pytest.raises(ValueError): 35 | reader.clean(path="data_for_tests/train.csv") 36 | reader = Reader(sep=",") 37 | df = reader.clean(path="data_for_tests/train.csv") 38 | assert np.shape(df) == (891, 12) 39 | with pytest.raises(ValueError): 40 | reader.clean(path="data_for_tests/train.wrong_extension") 41 | df_drop = reader.clean(path="data_for_tests/train.csv", 42 | drop_duplicate=True) 43 | assert np.shape(df_drop) == (891, 12) 44 | assert np.all(df["Name"] == df_drop["Name"]) 45 | reader = Reader() 46 | df_excel = reader.clean(path="data_for_tests/train.xls") 47 | assert np.shape(df_excel) == (891, 12) 48 | assert np.all(df["Name"] == df_excel["Name"]) 49 | if (sys.platform == "win32" and sys.version_info[0] <= 3 and sys.version_info[1] <= 5): 50 | pass 51 | else: 52 | if sys.version_info[0] >= 3: 53 | df_hdf = reader.clean(path="data_for_tests/train.h5") 54 | assert np.shape(df_hdf) == (891, 12) 55 | assert np.all(df["Name"] == df_hdf["Name"]) 56 | df_json = reader.clean(path="data_for_tests/train.json") 57 | assert np.shape(df_json) == (891, 12) 58 | 59 | 60 | def test_train_test_split_reader(): 61 | """Test train_test_split method of Reader class.""" 62 | reader = Reader(sep=",") 63 | with pytest.raises(ValueError): 64 | reader.train_test_split(Lpath=None, target_name="target") 65 | with pytest.raises(ValueError): 66 | reader.train_test_split(Lpath=["data_for_tests/train.csv"], 67 | target_name=None) 68 | with pytest.raises(ValueError): 69 | reader = Reader(to_path=None) 70 | reader.train_test_split(Lpath=["data_for_tests/train.csv"], 71 | target_name="Survived") 72 | reader = Reader(sep=",") 73 | dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"], 74 | target_name="Survived") 75 | assert len(dict) == 3 76 | assert "train" in list(dict.keys()) 77 | assert "test" in list(dict.keys()) 78 | assert "target" in list(dict.keys()) 79 | assert np.all(dict["train"].columns == dict["test"].columns) 80 | if (sys.version_info[0] >= 3 and sys.platform != "win32"): 81 | reader = Reader(to_hdf5=True) 82 | dict = reader.train_test_split(Lpath=["data_for_tests/train.h5"], 83 | target_name="Survived") 84 | assert len(dict) == 3 85 | assert "train" in list(dict.keys()) 86 | assert "test" in list(dict.keys()) 87 | assert "target" in list(dict.keys()) 88 | assert np.all(dict["train"].columns == dict["test"].columns) 89 | 90 | 91 | def test_convert_list_reader(): 92 | """Test convert_list function of reader module.""" 93 | data_list = list() 94 | data_list.append([1, 2]) 95 | data_list.append([3, 4]) 96 | index = ['a', 'b'] 97 | serie = pd.Series(data=data_list, index=index, name="test") 98 | df = convert_list(serie) 99 | assert np.all(df.index == serie.index) 100 | assert np.all(df.columns.values == ['test_item1', 'test_item2']) 101 | 102 | 103 | def test_convert_float_and_dates_reader(): 104 | """Test convert_float_and_dates function of reader module.""" 105 | index = ['a', 'b', 'c'] 106 | values = [1, 2, 3] 107 | serie = pd.Series(data=values, index=index) 108 | serie = convert_float_and_dates(serie) 109 | assert serie.dtype == 'float64' 110 | 111 | index = ['a', 'b', 'c'] 112 | values = np.array(['2007-07-13', '2006-01-13', '2010-08-13'], 113 | dtype='datetime64') 114 | serie = pd.Series(data=values, 115 | index=index, 116 | dtype='datetime64[ns]', 117 | name="test") 118 | df = 
convert_float_and_dates(serie) 119 | assert np.all(df.index == serie.index) 120 | assert np.all(df.columns.values == ['test_TIMESTAMP', 121 | 'test_YEAR', 122 | 'test_MONTH', 123 | 'test_DAY', 124 | 'test_DAYOFWEEK', 125 | 'test_HOUR']) 126 | 127 | index = ['a', 'b', 'c'] 128 | values = np.array(['2007-07-13', '2006-01-13', '2010-08-13']) 129 | serie = pd.Series(data=values, index=index, name="test") 130 | df = convert_float_and_dates(serie) 131 | assert np.all(df.index == serie.index) 132 | assert np.all(df.columns.values == ['test_TIMESTAMP', 133 | 'test_YEAR', 134 | 'test_MONTH', 135 | 'test_DAY', 136 | 'test_DAYOFWEEK', 137 | 'test_HOUR']) 138 | -------------------------------------------------------------------------------- /tests/test_regression_feature_selector.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | # import pytest 7 | """Test mlbox.model.regression.feature_selector module.""" 8 | import pytest 9 | import pandas as pd 10 | 11 | from mlbox.model.regression.feature_selector import Reg_feature_selector 12 | 13 | 14 | def test_init_Reg_feature_selector(): 15 | """Test init method of Reg_feature_selector class.""" 16 | feature_selector = Reg_feature_selector() 17 | assert feature_selector.strategy == "l1" 18 | assert feature_selector.threshold == 0.3 19 | assert not feature_selector._Reg_feature_selector__fitOK 20 | assert feature_selector._Reg_feature_selector__to_discard == [] 21 | 22 | 23 | def test_get_params_Reg_feature_selector(): 24 | """Test get_params method of Reg_feature_selector class.""" 25 | feature_selector = Reg_feature_selector() 26 | dict = {'strategy': "l1", 27 | 'threshold': 0.3} 28 | assert feature_selector.get_params() == dict 29 | 30 | 31 | def test_set_params_Reg_feature_selector(): 32 | """Test set_params method of Reg_feature_selector class.""" 33 | feature_selector = Reg_feature_selector() 34 | feature_selector.set_params(strategy="variance") 35 | assert feature_selector.strategy == "variance" 36 | feature_selector.set_params(threshold=0.2) 37 | assert feature_selector.threshold == 0.2 38 | with pytest.warns(UserWarning) as record: 39 | feature_selector.set_params(wrong_strategy="wrong_strategy") 40 | assert len(record) == 1 41 | 42 | 43 | def test_fit_Reg_feature_selector(): 44 | """Test fit method of Reg_feature_selector class.""" 45 | feature_selector = Reg_feature_selector() 46 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 47 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 48 | with pytest.raises(ValueError): 49 | feature_selector.fit(None, y_train) 50 | with pytest.raises(ValueError): 51 | feature_selector.fit(df_train, None) 52 | feature_selector.fit(df_train, y_train) 53 | assert feature_selector._Reg_feature_selector__fitOK 54 | feature_selector.set_params(strategy="variance") 55 | feature_selector.fit(df_train, y_train) 56 | assert feature_selector._Reg_feature_selector__fitOK 57 | feature_selector.set_params(strategy="rf_feature_importance") 58 | feature_selector.fit(df_train, y_train) 59 | assert feature_selector._Reg_feature_selector__fitOK 60 | feature_selector.set_params(strategy="wrong_strategy") 61 | with pytest.raises(ValueError): 62 | feature_selector.fit(df_train, y_train) 63 | 64 | 65 | def test_transform_Reg_feature_selector(): 66 | """Test transform method of Reg_feature_selector class.""" 67 | feature_selector = 
Reg_feature_selector(threshold=0) 68 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 69 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 70 | with pytest.raises(ValueError): 71 | feature_selector.transform(df_train) 72 | feature_selector.fit(df_train, y_train) 73 | with pytest.raises(ValueError): 74 | feature_selector.transform(None) 75 | df_transformed = feature_selector.transform(df_train) 76 | assert (df_transformed.columns == df_train.columns).all() 77 | 78 | 79 | def test_fit_transform_Reg_feature_selector(): 80 | """Test fit_transform method of Reg_feature_selector class.""" 81 | feature_selector = Reg_feature_selector(threshold=0) 82 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 83 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 84 | df_transformed = feature_selector.fit_transform(df_train, y_train) 85 | assert (df_transformed.columns == df_train.columns).all() 86 | -------------------------------------------------------------------------------- /tests/test_regressor.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | # import pytest 7 | """Test mlbox.model.regression.regressor module.""" 8 | import pytest 9 | import pandas as pd 10 | import numpy as np 11 | 12 | from mlbox.model.regression.regressor import Regressor 13 | from lightgbm import LGBMRegressor 14 | 15 | 16 | def test_init_regressor(): 17 | """Test init method of Regressor class.""" 18 | regressor = Regressor() 19 | assert regressor._Regressor__strategy == "LightGBM" 20 | assert regressor._Regressor__regress_params == {} 21 | assert regressor._Regressor__regressor 22 | assert not regressor._Regressor__col 23 | assert not regressor._Regressor__fitOK 24 | 25 | 26 | def test_get_params_regressor(): 27 | """Test get_params method of Regressor class.""" 28 | regressor = Regressor() 29 | params = regressor.get_params() 30 | assert params == {'strategy': "LightGBM"} 31 | assert not regressor._Regressor__regress_params 32 | 33 | 34 | def test_set_params_regressor(): 35 | """Test set_params method of Regressor class.""" 36 | regressor = Regressor() 37 | regressor.set_params(strategy="LightGBM") 38 | assert regressor._Regressor__strategy == "LightGBM" 39 | regressor.set_params(strategy="RandomForest") 40 | assert regressor._Regressor__strategy == "RandomForest" 41 | regressor.set_params(strategy="ExtraTrees") 42 | assert regressor._Regressor__strategy == "ExtraTrees" 43 | regressor.set_params(strategy="RandomForest") 44 | assert regressor._Regressor__strategy == "RandomForest" 45 | regressor.set_params(strategy="Tree") 46 | assert regressor._Regressor__strategy == "Tree" 47 | regressor.set_params(strategy="AdaBoost") 48 | assert regressor._Regressor__strategy == "AdaBoost" 49 | regressor.set_params(strategy="Linear") 50 | assert regressor._Regressor__strategy == "Linear" 51 | regressor.set_params(strategy="Bagging") 52 | assert regressor._Regressor__strategy == "Bagging" 53 | with pytest.warns(UserWarning) as record: 54 | regressor.set_params(wrong_strategy="wrong_strategy") 55 | assert len(record) == 1 56 | 57 | 58 | def test_set_regressor(): 59 | """Test set method of Regressor class.""" 60 | regressor = Regressor() 61 | with pytest.raises(ValueError): 62 | regressor._Regressor__set_regressor("wrong_strategy") 63 | 64 | 65 | def test_fit_regressor(): 66 | """Test fit method of 
Regressor class.""" 67 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 68 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 69 | regressor = Regressor() 70 | regressor.fit(df_train, y_train) 71 | assert np.all(regressor._Regressor__col == df_train.columns) 72 | assert regressor._Regressor__fitOK 73 | 74 | 75 | def test_feature_importances_regressor(): 76 | """Test feature_importances of Regressor class.""" 77 | regressor = Regressor() 78 | with pytest.raises(ValueError): 79 | regressor.feature_importances() 80 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 81 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 82 | regressor.set_params(strategy="LightGBM") 83 | regressor.fit(df_train, y_train) 84 | importance = regressor.feature_importances() 85 | assert importance != {} 86 | regressor.set_params(strategy="Linear") 87 | regressor.fit(df_train, y_train) 88 | importance = regressor.feature_importances() 89 | assert importance != {} 90 | regressor.set_params(strategy="RandomForest") 91 | regressor.fit(df_train, y_train) 92 | importance = regressor.feature_importances() 93 | assert importance != {} 94 | regressor.set_params(strategy="AdaBoost") 95 | regressor.fit(df_train, y_train) 96 | importance = regressor.feature_importances() 97 | assert importance != {} 98 | regressor.set_params(strategy="Bagging") 99 | regressor.fit(df_train, y_train) 100 | importance = regressor.feature_importances() 101 | assert importance != {} 102 | 103 | 104 | def test_predict_regressor(): 105 | """Test predict method of Regressor class.""" 106 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 107 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 108 | regressor = Regressor() 109 | with pytest.raises(ValueError): 110 | regressor.predict(df_train) 111 | regressor.fit(df_train, y_train) 112 | with pytest.raises(ValueError): 113 | regressor.predict(None) 114 | assert len(regressor.predict(df_train)) > 0 115 | 116 | 117 | def test_score_regressor(): 118 | """Test_score method of Regressor class.""" 119 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 120 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 121 | regressor = Regressor(strategy="Linear") 122 | with pytest.raises(ValueError): 123 | regressor.score(df_train, y_train) 124 | regressor.fit(df_train, y_train) 125 | with pytest.raises(ValueError): 126 | regressor.score(None, y_train) 127 | with pytest.raises(ValueError): 128 | regressor.score(df_train, None) 129 | assert regressor.score(df_train, y_train) > 0 130 | 131 | 132 | def test_get_estimator_regressor(): 133 | """Test get_estimator of Regressor class.""" 134 | regressor = Regressor() 135 | estimator = regressor.get_estimator() 136 | assert isinstance(estimator, type(LGBMRegressor())) 137 | -------------------------------------------------------------------------------- /tests/test_stacking_classifer.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.classification.stacking_classifier module.""" 7 | import pytest 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.linear_model import LogisticRegression 12 | from mlbox.model.classification.stacking_classifier import StackingClassifier 13 | 14 | 15 | def test_init_stacking_classifier(): 16 | """Test init method of 
StackingClassifier class.""" 17 | with pytest.raises(ValueError): 18 | stacking_classifier = StackingClassifier(base_estimators=dict()) 19 | with pytest.raises(ValueError): 20 | stacking_classifier = StackingClassifier(n_folds=dict()) 21 | with pytest.raises(ValueError): 22 | stacking_classifier = StackingClassifier(copy="True") 23 | with pytest.raises(ValueError): 24 | stacking_classifier = StackingClassifier(drop_first="True") 25 | with pytest.raises(ValueError): 26 | stacking_classifier = StackingClassifier(random_state="1") 27 | with pytest.raises(ValueError): 28 | stacking_classifier = StackingClassifier(verbose="True") 29 | stacking_classifier = StackingClassifier() 30 | assert len(stacking_classifier.base_estimators) == 3 31 | assert isinstance(stacking_classifier.level_estimator, 32 | type(LogisticRegression())) 33 | assert stacking_classifier.n_folds == 5 34 | assert not stacking_classifier.copy 35 | assert stacking_classifier.drop_first 36 | assert stacking_classifier.random_state == 1 37 | assert stacking_classifier.verbose 38 | assert not stacking_classifier._StackingClassifier__fitOK 39 | assert not stacking_classifier._StackingClassifier__fittransformOK 40 | 41 | 42 | def test_get_params_stacking_classifier(): 43 | """Test get_params method StackingClassifier class.""" 44 | stacking_classifier = StackingClassifier() 45 | dict = stacking_classifier.get_params() 46 | assert len(dict["base_estimators"]) == 3 47 | assert isinstance(dict["level_estimator"], 48 | type(LogisticRegression())) 49 | assert dict["n_folds"] == 5 50 | assert not dict["copy"] 51 | assert dict["drop_first"] 52 | assert dict["random_state"] == 1 53 | assert dict["verbose"] 54 | 55 | 56 | def test_set_params_stacking_classifier(): 57 | """Test set_params method of StackingClassifier class.""" 58 | stacking_classifier = StackingClassifier() 59 | stacking_classifier.set_params(n_folds=6) 60 | assert stacking_classifier.n_folds == 6 61 | stacking_classifier.set_params(copy=True) 62 | assert stacking_classifier.copy 63 | stacking_classifier.set_params(drop_first=False) 64 | assert not stacking_classifier.drop_first 65 | stacking_classifier.set_params(random_state=2) 66 | assert stacking_classifier.random_state == 2 67 | stacking_classifier.set_params(verbose=False) 68 | assert not stacking_classifier.verbose 69 | with pytest.warns(UserWarning) as record: 70 | stacking_classifier.set_params(wrong_parameters=None) 71 | assert len(record) == 1 72 | 73 | 74 | def test_fit_transform_stacking_classifier(): 75 | """Test fit_transform method of StackingClassifier class.""" 76 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 77 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 78 | stacking_classifier = StackingClassifier() 79 | with pytest.raises(ValueError): 80 | stacking_classifier.fit_transform(None, y_train) 81 | with pytest.raises(ValueError): 82 | stacking_classifier.fit_transform(df_train, None) 83 | stacking_classifier.fit_transform(df_train, y_train) 84 | assert stacking_classifier._StackingClassifier__fittransformOK 85 | 86 | 87 | def test_transform_stacking_classifier(): 88 | """Test transform method of StackingClassifier class.""" 89 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 90 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 91 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 92 | stacking_classifier = StackingClassifier() 93 | with pytest.raises(ValueError): 94 | stacking_classifier.transform(None) 95 | with 
pytest.raises(ValueError): 96 | stacking_classifier.transform(df_test) 97 | stacking_classifier.fit_transform(df_train, y_train) 98 | results = stacking_classifier.transform(df_test) 99 | assert len(results.columns) == 3 100 | 101 | 102 | def test_fit_stacking_classifier(): 103 | """Test fit method of StackingClassifier class.""" 104 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 105 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 106 | stacking_classifier = StackingClassifier(verbose=True) 107 | stacking_classifier.fit(df_train, y_train) 108 | assert stacking_classifier._StackingClassifier__fitOK 109 | 110 | 111 | def test_predict_proba_stacking_classifier(): 112 | """Test predict_proba method of StackingClassifier class.""" 113 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 114 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 115 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 116 | stacking_classifier = StackingClassifier() 117 | with pytest.raises(ValueError): 118 | stacking_classifier.predict_proba(df_test) 119 | stacking_classifier.fit(df_train, y_train) 120 | results = stacking_classifier.predict_proba(df_test) 121 | assert np.shape(results) == (418, 2) 122 | 123 | 124 | def test_predict_stacking_classifier(): 125 | """Test predict method of StackingClassifier class.""" 126 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 127 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 128 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 129 | stacking_classifier = StackingClassifier() 130 | with pytest.raises(ValueError): 131 | stacking_classifier.predict(df_test) 132 | stacking_classifier.fit(df_train, y_train) 133 | results = stacking_classifier.predict(df_test) 134 | assert np.shape(results) == (418,) 135 | -------------------------------------------------------------------------------- /tests/test_stacking_regressor.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | # Author: Axel ARONIO DE ROMBLAY 4 | # Author: Henri GERARD 5 | # License: BSD 3 clause 6 | """Test mlbox.model.regression.stacking_regressor module.""" 7 | import pytest 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.linear_model import LinearRegression 12 | from mlbox.model.regression.stacking_regressor import StackingRegressor 13 | 14 | 15 | def test_init_stacking_regressor(): 16 | """Test init method of StackingRegressor class.""" 17 | with pytest.raises(ValueError): 18 | stacking_regressor = StackingRegressor(base_estimators=dict()) 19 | with pytest.raises(ValueError): 20 | stacking_regressor = StackingRegressor(n_folds=dict()) 21 | with pytest.raises(ValueError): 22 | stacking_regressor = StackingRegressor(copy="True") 23 | with pytest.raises(ValueError): 24 | stacking_regressor = StackingRegressor(random_state="1") 25 | with pytest.raises(ValueError): 26 | stacking_regressor = StackingRegressor(verbose="True") 27 | stacking_regressor = StackingRegressor() 28 | assert len(stacking_regressor.base_estimators) == 3 29 | assert isinstance(stacking_regressor.level_estimator, 30 | type(LinearRegression())) 31 | assert stacking_regressor.n_folds == 5 32 | assert not stacking_regressor.copy 33 | assert stacking_regressor.random_state == 1 34 | assert stacking_regressor.verbose 35 | assert not stacking_regressor._StackingRegressor__fitOK 36 | assert not 
stacking_regressor._StackingRegressor__fittransformOK 37 | 38 | 39 | def test_get_params_stacking_regressor(): 40 | """Test get_params method of StackingRegressor class.""" 41 | stacking_regressor = StackingRegressor() 42 | dict = stacking_regressor.get_params() 43 | assert len(dict["base_estimators"]) == 3 44 | assert isinstance(dict["level_estimator"], 45 | type(LinearRegression())) 46 | assert dict["n_folds"] == 5 47 | assert not dict["copy"] 48 | assert dict["random_state"] == 1 49 | assert dict["verbose"] 50 | 51 | 52 | def test_set_params_stacking_regressor(): 53 | """Test set_params method of StackingRegressor class.""" 54 | stacking_regressor = StackingRegressor() 55 | stacking_regressor.set_params(n_folds=6) 56 | assert stacking_regressor.n_folds == 6 57 | stacking_regressor.set_params(copy=True) 58 | assert stacking_regressor.copy 59 | stacking_regressor.set_params(random_state=2) 60 | assert stacking_regressor.random_state == 2 61 | stacking_regressor.set_params(verbose=False) 62 | assert not stacking_regressor.verbose 63 | with pytest.warns(UserWarning) as record: 64 | stacking_regressor.set_params(wrong_parameters=None) 65 | assert len(record) == 1 66 | 67 | 68 | def test_fit_transform_stacking_regressor(): 69 | """Test fit_transform method of StackingRegressor class.""" 70 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 71 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 72 | stacking_regressor = StackingRegressor() 73 | with pytest.raises(ValueError): 74 | stacking_regressor.fit_transform(None, y_train) 75 | with pytest.raises(ValueError): 76 | stacking_regressor.fit_transform(df_train, None) 77 | stacking_regressor.fit_transform(df_train, y_train) 78 | assert stacking_regressor._StackingRegressor__fittransformOK 79 | 80 | 81 | def test_transform_stacking_regressor(): 82 | """Test transform method of StackingRegressor class.""" 83 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 84 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 85 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 86 | stacking_regressor = StackingRegressor() 87 | with pytest.raises(ValueError): 88 | stacking_regressor.transform(None) 89 | with pytest.raises(ValueError): 90 | stacking_regressor.transform(df_test) 91 | stacking_regressor.fit_transform(df_train, y_train) 92 | results = stacking_regressor.transform(df_test) 93 | assert len(results.columns) == 3 94 | 95 | 96 | def test_fit_stacking_regressor(): 97 | """Test fit method of StackingRegressor class.""" 98 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 99 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 100 | stacking_regressor = StackingRegressor(verbose=True) 101 | stacking_regressor.fit(df_train, y_train) 102 | assert stacking_regressor._StackingRegressor__fitOK 103 | 104 | 105 | def test_predict_stacking_regressor(): 106 | """Test predict method of StackingRegressor class.""" 107 | df_train = pd.read_csv("data_for_tests/clean_train.csv") 108 | y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) 109 | df_test = pd.read_csv("data_for_tests/clean_test.csv") 110 | stacking_regressor = StackingRegressor() 111 | with pytest.raises(ValueError): 112 | stacking_regressor.predict(df_test) 113 | stacking_regressor.fit(df_train, y_train) 114 | results = stacking_regressor.predict(df_test) 115 | assert np.shape(results) == (418,) 116 | --------------------------------------------------------------------------------
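# A minimal end-to-end sketch of the workflow these tests exercise, assuming
# the same Titanic-style CSVs used above; the search space here is illustrative
# and not part of the test suite.
from mlbox.preprocessing.reader import Reader
from mlbox.preprocessing.drift_thresholder import Drift_thresholder
from mlbox.optimisation.optimiser import Optimiser
from mlbox.prediction.predictor import Predictor

data = Reader(sep=",").train_test_split(
    Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
    target_name="Survived")                      # dict with "train", "test", "target"
data = Drift_thresholder().fit_transform(data)   # drop features that drift between sets
opt = Optimiser(scoring="accuracy", n_folds=3)   # warns once, as asserted in the tests
space = {'est__max_depth': {"search": "choice", "space": [3, 5, 7]}}
best = opt.optimise(space, data, 1)              # returns the best hyper-parameters
Predictor(verbose=False).fit_predict(best, data) # writes save/Survived_predictions.csv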