├── .idea
│   ├── .name
│   ├── .gitignore
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── PyRapidML.iml
├── PyRapidML
│   ├── __init__.py
│   ├── .DS_Store
│   ├── datasets.py
│   ├── eda.py
│   ├── utils.py
│   ├── regression.py
│   └── classification.py
├── .DS_Store
├── docs
│   ├── .DS_Store
│   ├── source
│   │   ├── .DS_Store
│   │   ├── api
│   │   │   ├── .DS_Store
│   │   │   ├── datasets.rst
│   │   │   ├── regression.rst
│   │   │   ├── classification.rst
│   │   │   ├── eda.rst
│   │   │   └── natural_language_processing.rst
│   │   ├── _static
│   │   │   ├── .DS_Store
│   │   │   └── css
│   │   │       └── custom.css
│   │   ├── requirements.txt
│   │   ├── index.rst
│   │   └── conf.py
│   ├── Makefile
│   └── make.bat
├── tests
│   └── .DS_Store
├── Tutorials
│   ├── .DS_Store
│   ├── Regression
│   │   ├── .DS_Store
│   │   └── Final ET Model 30May2021.pkl
│   └── Classification
│       ├── .DS_Store
│       └── Final RF Model 11Nov2020.pkl
├── .readthedocs.yaml.swp
├── .readthedocs.yaml
├── setup.py
├── LICENSE
├── .gitignore
└── README.md
/.idea/.name:
--------------------------------------------------------------------------------
1 | index.rst
--------------------------------------------------------------------------------
/PyRapidML/__init__.py:
--------------------------------------------------------------------------------
1 | from PyRapidML.utils import __version__
2 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/.DS_Store
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/docs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/.DS_Store
--------------------------------------------------------------------------------
/tests/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/tests/.DS_Store
--------------------------------------------------------------------------------
/PyRapidML/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/PyRapidML/.DS_Store
--------------------------------------------------------------------------------
/Tutorials/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/.DS_Store
--------------------------------------------------------------------------------
/.readthedocs.yaml.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/.readthedocs.yaml.swp
--------------------------------------------------------------------------------
/docs/source/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/.DS_Store
--------------------------------------------------------------------------------
/docs/source/api/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/api/.DS_Store
--------------------------------------------------------------------------------
/docs/source/_static/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/_static/.DS_Store
--------------------------------------------------------------------------------
/Tutorials/Regression/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Regression/.DS_Store
--------------------------------------------------------------------------------
/docs/source/api/datasets.rst:
--------------------------------------------------------------------------------
1 | Datasets
2 | ===================
3 | .. automodule:: PyRapidML.datasets
4 | :members:
--------------------------------------------------------------------------------
/Tutorials/Classification/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Classification/.DS_Store
--------------------------------------------------------------------------------
/docs/source/api/regression.rst:
--------------------------------------------------------------------------------
1 | Regression
2 | ===================
3 | .. automodule:: PyRapidML.regression
4 | :members:
--------------------------------------------------------------------------------
/docs/source/api/classification.rst:
--------------------------------------------------------------------------------
1 | Classification
2 | ===================
3 | .. automodule:: PyRapidML.classification
4 | :members:
--------------------------------------------------------------------------------
/docs/source/api/eda.rst:
--------------------------------------------------------------------------------
1 | Exploratory Data Analysis
2 | =====================================
3 | .. automodule:: PyRapidML.eda
4 | :members:
--------------------------------------------------------------------------------
/Tutorials/Regression/Final ET Model 30May2021.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Regression/Final ET Model 30May2021.pkl
--------------------------------------------------------------------------------
/docs/source/api/natural_language_processing.rst:
--------------------------------------------------------------------------------
1 | NLP
2 | ===================
3 | .. automodule:: PyRapidML.natural_language_processing
4 | :members:
--------------------------------------------------------------------------------
/Tutorials/Classification/Final RF Model 11Nov2020.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Classification/Final RF Model 11Nov2020.pkl
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/PyRapidML.iml" filepath="$PROJECT_DIR$/.idea/PyRapidML.iml" />
6 |     </modules>
7 |   </component>
8 | </project>
--------------------------------------------------------------------------------
/.idea/PyRapidML.iml:
--------------------------------------------------------------------------------
1 | <!-- module configuration not recoverable: the XML markup was stripped from this dump -->
--------------------------------------------------------------------------------
/docs/source/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=3.0.0
2 | sphinx-rtd-theme>=0.5.0
3 | pandas
4 | scipy<=1.5.4
5 | numpy==1.19.5
6 | seaborn
7 | matplotlib
8 | IPython
9 | joblib
10 | scikit-learn==0.23.2
11 | ipywidgets
12 | yellowbrick>=1.0.1
13 | lightgbm>=2.3.1
14 | plotly>=4.4.1
15 | wordcloud
16 | textblob
17 | cufflinks>=0.17.0
18 | umap-learn
19 | pyLDAvis
20 | gensim<4.0.0
21 | spacy<2.4.0
22 | nltk
23 | mlxtend>=0.17.0
24 | pyod
25 | pandas-profiling>=2.8.0
26 | kmodes>=0.10.1
27 | mlflow
28 | imbalanced-learn==0.7.0
29 | scikit-plot #for lift and gain charts
30 | Boruta
31 | pycaret
32 | typing
33 |
34 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/source/conf.py
11 |
12 | # Optionally build your docs in additional formats such as PDF
13 | formats:
14 | - pdf
15 |
16 | # Optionally set the version of Python and requirements required to build your docs
17 | python:
18 | version: 3.7
19 | install:
20 | - requirements: docs/source/requirements.txt
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r") as f:
4 | long_description = f.read()
5 |
6 | setup(
7 | name="PyRapidML",
8 | version="1.0.13",
9 | author="Zain Ali",
10 | author_email="zainbalouch3@gmail.com",
11 | description="An open source and low code machine learning library for quick and robust analysis",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/Zainali5/PyRapidML",
15 | packages=find_packages(),
16 | classifiers=[
17 | "Programming Language :: Python :: 3",
18 | "License :: OSI Approved :: MIT License",
19 | "Operating System :: OS Independent",
20 | ]
21 | )
22 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Zainali5
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/docs/source/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | .rst-content dl:not(.docutils) dt:first-child {
2 | margin-top: 0;
3 | }
4 |
5 | .rst-content dl:not(.docutils) dl dt {
6 | margin-bottom: 4px;
7 | border: none;
8 | border-left: solid 3px #ccc;
9 | background: #f0f0f0;
10 | color: #555;
11 | }
12 |
13 | .rst-content dl table,
14 | .rst-content dl ul,
15 | .rst-content dl ol,
16 | .rst-content dl p {
17 | margin-bottom: 8px !important;
18 | }
19 |
20 | .rst-content dl:not(.docutils) dt {
21 | display: table;
22 | margin: 6px 0;
23 | font-size: 90%;
24 | line-height: normal;
25 | background: #e7f2fa;
26 | color: #2980b9;
27 | border-top: solid 3px #6ab0de;
28 | padding: 6px;
29 | position: relative;
30 | }
31 |
32 | html.writer-html5 .rst-content dl.field-list {
33 | display: initial;
34 | }
35 |
36 | html.writer-html5 .rst-content dl.field-list > dd,
37 | html.writer-html5 .rst-content dl.field-list > dt {
38 | margin-bottom: 4px;
39 | padding-left: 6px;
40 | }
41 |
42 | p {
43 | line-height: 20px;
44 | font-size: 14px;
45 | }
46 |
47 | html.writer-html5 .rst-content dl.field-list > dt:after {
48 | content: initial;
49 | }
50 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. PyRapidML documentation master file, created by
2 | sphinx-quickstart on Tue Jun 8 22:08:54 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | PyRapidML Homepage
7 | =====================================
8 |
9 | PyRapidML is an open source Python library which not only helps in automating machine learning workflows but also helps in building end-to-end ML solutions.
10 | PyRapidML is essentially a Python wrapper around several machine learning libraries and frameworks such as PyCaret, scikit-learn, XGBoost, LightGBM, CatBoost, spaCy, Optuna, Hyperopt, Ray, and many more.
11 |
12 | PyRapidML is a low-code library: with only a few basic lines of code, one can achieve high accuracy in machine learning models.
13 | There is no need to write hefty lines of code, as PyRapidML compares all applicable machine learning algorithms for your problem in a single line of code.
14 | Once PyRapidML identifies the best algorithm, you can tune the model further in just one more line of code.
15 |
16 | - Are you tired of writing hefty lines of code for your data science problems?
17 | - Are you having difficulty figuring out which algorithm performs best?
18 | - Is it hard for you to compare multiple algorithms and see which one has the best accuracy?
19 | - Do you face issues in hyperparameter tuning?
20 | - Do you want easy model deployments?
21 | - Do you dream of AutoML?
22 | - Are you facing problems in exploratory data analysis?
23 | - Do you want a library that can automatically perform all steps of the data science lifecycle?
24 | - Do you want a library that can do exploratory data analysis, feature engineering, feature selection, compare multiple machine learning algorithms, tune hyperparameters, and deploy models?
25 |
26 | If the answer is yes to the above questions, then PyRapidML is the library for you.
27 |
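A minimal example (a sketch only: the dataset name ``boston`` and target ``medv`` come from the API docstrings, and ``extract_data`` requires an internet connection)::

    >>> from PyRapidML.datasets import extract_data
    >>> from PyRapidML.regression import initializer
    >>> boston = extract_data('boston')
    >>> exp = initializer(data = boston, target = 'medv')
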
28 |
29 | .. toctree::
30 | :maxdepth: 2
31 | :hidden:
32 | :caption: Getting Started
33 |
34 | self
35 |
36 | .. toctree::
37 | :maxdepth: 2
38 | :hidden:
39 | :caption: Documentation
40 |
41 | api/classification
42 | api/regression
43 | api/natural_language_processing
44 | api/datasets
45 | api/eda
--------------------------------------------------------------------------------
/PyRapidML/datasets.py:
--------------------------------------------------------------------------------
1 | # Module: Datasets
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 30/05/2021
6 |
7 |
8 | def extract_data(dataset="index", save_copy=False, profile=False, verbose=True):
9 |
10 | """
11 | This function loads sample datasets from a git repository. The list of available
12 | datasets can be checked using ``extract_data('index')``.
13 |
14 |
15 | Example
16 | -------
17 | >>> from PyRapidML.datasets import extract_data
18 | >>> all_datasets = extract_data('index')
19 | >>> juice = extract_data('juice')
20 |
21 |
22 | dataset: str, default = 'index'
23 | Index value of dataset.
24 |
25 |
26 | save_copy: bool, default = False
27 | When set to true, it saves a copy in current working directory.
28 |
29 |
30 | profile: bool, default = False
31 | When set to true, an interactive EDA report is displayed.
32 |
33 |
34 | verbose: bool, default = True
35 | When set to False, head of data is not displayed.
36 |
37 |
38 | Returns:
39 | pandas.DataFrame
40 |
41 |
42 | Warnings
43 | --------
44 | - Use of ``extract_data`` requires internet connection.
45 |
46 | """
47 |
48 | import pandas as pd
49 | import os.path
50 | from IPython.display import display, HTML, clear_output, update_display
51 |
52 | address = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/"
53 | extension = ".csv"
54 | filename = str(dataset) + extension
55 |
56 | complete_address = address + filename
57 |
58 | if os.path.isfile(filename):
59 | data = pd.read_csv(filename)
60 | else:
61 | data = pd.read_csv(complete_address)
62 |
63 | # create a copy for pandas profiler
64 | data_for_profiling = data.copy()
65 |
66 | if save_copy:
67 | save_name = filename
68 | data.to_csv(save_name, index=False)
69 |
70 | if dataset == "index":
71 | display(data)
72 |
73 | else:
74 | if profile:
75 | import pandas_profiling
76 |
77 | pf = pandas_profiling.ProfileReport(data_for_profiling)
78 | display(pf)
79 |
80 | else:
81 | if verbose:
82 | display(data.head())
83 |
84 | return data
85 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.abspath("../.."))
17 |
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = "PyRapidML"
22 | copyright = "2021, Zain Ali"
23 | author = "Zain Ali"
24 |
25 | # The full version, including alpha/beta/rc tags
26 | release = "1.0.13"
27 |
28 |
29 | # -- General configuration ---------------------------------------------------
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 |
35 | extensions = [
36 | "sphinx_rtd_theme",
37 | "sphinx.ext.autodoc",
38 | "sphinx.ext.napoleon",
39 | ]
40 |
41 | napoleon_google_docstring = True
42 | napoleon_numpy_docstring = True
43 |
44 | autodoc_mock_imports = ["setup"]
45 | # Add any paths that contain templates here, relative to this directory.
46 | templates_path = ["_templates"]
47 |
48 | # List of patterns, relative to source directory, that match files and
49 | # directories to ignore when looking for source files.
50 | # This pattern also affects html_static_path and html_extra_path.
51 | exclude_patterns = []
52 |
53 | # Sort methods by the order they are found in the source files
54 | autodoc_member_order = "bysource"
55 |
56 |
57 | # -- Options for HTML output -------------------------------------------------
58 |
59 | # The theme to use for HTML and HTML Help pages. See the documentation for
60 | # a list of builtin themes.
61 | #
62 | html_theme = "sphinx_rtd_theme"
63 |
64 | # Add any paths that contain custom static files (such as style sheets) here,
65 | # relative to this directory. They are copied after the builtin static files,
66 | # so a file named "default.css" will overwrite the builtin "default.css".
67 | html_static_path = ["_static"]
68 |
69 | html_css_files = ["css/custom.css"]
70 |
71 | master_doc = "index"
72 |
--------------------------------------------------------------------------------
/PyRapidML/eda.py:
--------------------------------------------------------------------------------
1 |
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | def check_na(dataset):
11 | """
12 |
13 | This function checks missing values and gives the % of missing values in each feature.
14 |
15 |
16 |
17 | Example
18 | -------
19 | >>> from PyRapidML.eda import check_na
20 | >>> check_na(df)
21 |
22 | dataset: pandas.DataFrame
23 |
24 |
25 | """
26 |     # check the percentage of NaN values present in each feature
27 |     # step 1: make the list of features which have missing values
28 |     features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]
29 |     # step 2: print each feature name with its percentage of missing values
30 |     if len(features_with_na) > 0:
31 |         for feature in features_with_na:
32 |             print(feature, np.round(dataset[feature].isnull().mean() * 100, 4), '% missing values')
33 |     else:
34 |         print("No Missing Values")
35 |
36 |
37 |
38 |
39 |
40 | def numerical_features(dataset):
41 | # list of numerical variables
42 | """
43 |
44 | This function counts the total numerical features and further tells how many of them are discrete and how many are continuous.
45 |
46 |
47 | Example
48 | -------
49 | >>> from PyRapidML.eda import numerical_features
50 | >>> numerical_features(df)
51 |
52 | dataset: pandas.DataFrame
53 |
54 |
55 | """
56 |     numeric_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
57 |
58 |     print('Number of numerical variables: ', len(numeric_features))
59 |
60 |     # visualise the numerical variables
61 |     # print(dataset[numeric_features].head())
62 |     # numerical variables are usually of 2 types:
63 |     # 1. continuous variables and 2. discrete variables
64 |
65 |     discrete_feature = [feature for feature in numeric_features if len(dataset[feature].unique()) < 25]
66 |     print("Discrete Variables Count: {}".format(len(discrete_feature)))
67 |
68 |     continuous_feature = [feature for feature in numeric_features if feature not in discrete_feature]
69 |     print("Continuous feature Count: {}".format(len(continuous_feature)))
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyRapidML
2 | ## Introduction
3 | - Are you tired of writing hefty lines of code for your data science problems?
4 | - Are you having difficulty figuring out which algorithm performs best?
5 | - Is it hard for you to compare multiple algorithms and see which one has the best accuracy?
6 | - Do you face issues in hyperparameter tuning?
7 | - Do you want easy model deployments?
8 | - Do you dream of AutoML?
9 | - Are you facing problems in exploratory data analysis?
10 | - Do you want a library that can automatically perform all steps of the data science lifecycle?
11 | - Do you want a library that can do exploratory data analysis, feature engineering, feature selection, compare multiple machine learning algorithms, tune hyperparameters, and deploy models?
12 |
13 | If the answer is yes to the above questions, then PyRapidML is the library for you.
14 |
15 | PyRapidML is an open source Python machine learning library.
16 | PyRapidML is essentially a Python wrapper around several machine learning libraries and frameworks such as PyCaret, scikit-learn, XGBoost, LightGBM, CatBoost, spaCy, Optuna, Hyperopt, Ray, and many more.
17 |
18 | PyRapidML not only helps in automating machine learning workflows but also helps in building end-to-end ML solutions.
19 |
20 | PyRapidML is a low-code library: with only a few basic lines of code, one can achieve high accuracy in machine learning models.
21 | There is no need to write hefty lines of code, as PyRapidML compares all applicable machine learning algorithms for your problem in a single line of code.
22 | Once PyRapidML identifies the best algorithm, you can tune the model further in just one more line of code.
23 |
24 | The initial idea of PyRapidML was inspired by the PyCaret library in Python.
25 |
26 | ## What data science problems can PyRapidML cater to?
27 | - Regression
28 | - Classification
29 | - Natural Language Processing
30 |
31 | ## What does PyRapidML have to offer currently?
32 | - Data preparation
33 | - Exploratory data analysis (see the sketch below)
34 | - Model training
35 | - Finding the best ML model
36 | - Hyperparameter tuning
37 | - Model deployment
38 | - Analysis and interpretability
39 |
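For instance, the exploratory data analysis helpers run directly on a pandas DataFrame. A minimal sketch, assuming a hypothetical input file `your_data.csv` (function names as defined in `PyRapidML.eda`):

```python
import pandas as pd
from PyRapidML.eda import check_na, numerical_features

df = pd.read_csv('your_data.csv')  # hypothetical input file
check_na(df)              # prints the % of missing values per feature
numerical_features(df)    # prints discrete vs. continuous feature counts
```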
40 |
41 | ## Who is this library for?
42 | This library is for:
43 | - Data scientists
44 | - Citizen data scientists
45 | - Data science students
46 | - Data analysts
47 | - Data professionals who want to build end-to-end data science solutions
48 |
49 | ## How to install this library?
50 | `pip install PyRapidML`
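
A minimal end-to-end sketch (the dataset name `boston` and target column `medv` are taken from the docstring examples; `extract_data` needs an internet connection to fetch the sample data):

```python
from PyRapidML.datasets import extract_data
from PyRapidML.regression import initializer

boston = extract_data('boston')                # fetch a sample dataset by name
exp = initializer(data=boston, target='medv')  # set up the transformation pipeline and training environment
```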
51 |
52 | ## Important Links
53 | - Docs: https://pyrapidml.readthedocs.io/en/latest/
54 | - GitHub: https://github.com/Zainali5/PyRapidML
55 | - PyPI: https://pypi.org/project/PyRapidML/1.0.13/
56 | ## Current Release
57 | PyRapidML 1.0.13 is now available. The easiest way to install PyRapidML is using pip.
58 |
--------------------------------------------------------------------------------
/PyRapidML/utils.py:
--------------------------------------------------------------------------------
1 | # Module: Utility
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 |
9 | version_ = "1.0.13"
10 | nightly_version_ = "1.0.13"
11 |
12 | __version__ = version_
13 |
14 |
15 | def version():
16 | return version_
17 |
18 |
19 | def nightly_version():
20 | return nightly_version_
21 |
22 |
23 | def check_metric(actual: pd.Series, prediction: pd.Series, metric: str, round: int = 4):
24 |
25 | """
26 | Function to evaluate classification and regression metrics.
27 |
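Example
-------
>>> from PyRapidML.utils import check_metric
>>> check_metric(actual, prediction, metric = 'R2')

``actual`` and ``prediction`` above are pandas Series; valid metric names are
pycaret's display names (for example 'MAE', 'MSE' and 'R2' for regression, or
'Accuracy', 'AUC' and 'F1' for classification).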
28 |
29 | actual : pandas.Series
30 | Actual values of the target variable.
31 |
32 |
33 | prediction : pandas.Series
34 | Predicted values of the target variable.
35 |
36 |
37 | metric : str
38 | Metric to use.
39 |
40 |
41 | round: integer, default = 4
42 | Number of decimal places the metrics will be rounded to.
43 |
44 |
45 | Returns:
46 | float
47 |
48 | """
49 |
50 | # general dependencies
51 | import pycaret.containers.metrics.classification
52 | import pycaret.containers.metrics.regression
53 |
54 | globals_dict = {"y": prediction}
55 | metric_containers = {
56 | **pycaret.containers.metrics.classification.get_all_metric_containers(
57 | globals_dict
58 | ),
59 | **pycaret.containers.metrics.regression.get_all_metric_containers(globals_dict),
60 | }
61 | metrics = {v.name: v.score_func for k, v in metric_containers.items()}
62 |
63 | # metric calculation starts here
64 |
65 | if metric in metrics:
66 | try:
67 | result = metrics[metric](actual, prediction)
68 | except Exception:
69 | from sklearn.preprocessing import LabelEncoder
70 |
71 | le = LabelEncoder()
72 | actual = le.fit_transform(actual)
73 | prediction = le.transform(prediction)
74 | result = metrics[metric](actual, prediction)
75 | result = result.round(round)
76 | return float(result)
77 | else:
78 | raise ValueError(
79 | f"Couldn't find metric '{metric}' Possible metrics are: {', '.join(metrics.keys())}."
80 | )
81 |
82 |
83 | def enable_colab():
84 |     """
85 |     Function to render plotly visuals in colab.
86 |     """
87 |
88 |     from IPython.display import display, HTML, clear_output, update_display
89 |
90 | def configure_plotly_browser_state():
91 |
92 | import IPython
93 |
94 | display(
95 | IPython.core.display.HTML(
96 | """
97 |
98 |
106 | """
107 | )
108 | )
109 |
110 | import IPython
111 |
112 | IPython.get_ipython().events.register(
113 | "pre_run_cell", configure_plotly_browser_state
114 | )
115 | print("Colab mode enabled.")
116 |
117 |
118 | def get_system_logs():
119 |
120 | """
121 | Read and print 'logs.log' file from current active directory
122 | """
123 |
124 | with open("logs.log", "r") as file:
125 | lines = file.read().splitlines()
126 |
127 | for line in lines:
128 | if not line:
129 | continue
130 |
131 | columns = [col.strip() for col in line.split(":") if col]
132 | print(columns)
133 |
--------------------------------------------------------------------------------
/PyRapidML/regression.py:
--------------------------------------------------------------------------------
1 | # Module: Regression
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | import pycaret.internal.tabular
11 | from pycaret.internal.Display import Display, is_in_colab, enable_colab
12 | from typing import List, Tuple, Any, Union, Optional, Dict
13 | import warnings
14 | from IPython.utils import io
15 |
16 | from pycaret.internal.tabular import MLUsecase
17 |
18 | warnings.filterwarnings("ignore")
19 |
20 |
21 | def initializer(
22 | data: pd.DataFrame,
23 | target: str,
24 | train_size: float = 0.7,
25 | test_data: Optional[pd.DataFrame] = None,
26 | preprocess: bool = True,
27 | imputation_type: str = "simple",
28 | iterative_imputation_iters: int = 5,
29 | categorical_features: Optional[List[str]] = None,
30 | categorical_imputation: str = "constant",
31 | categorical_iterative_imputer: Union[str, Any] = "lightgbm",
32 | ordinal_features: Optional[Dict[str, list]] = None,
33 | high_cardinality_features: Optional[List[str]] = None,
34 | high_cardinality_method: str = "frequency",
35 | numeric_features: Optional[List[str]] = None,
36 | numeric_imputation: str = "mean",
37 | numeric_iterative_imputer: Union[str, Any] = "lightgbm",
38 | date_features: Optional[List[str]] = None,
39 | ignore_features: Optional[List[str]] = None,
40 | normalize: bool = False,
41 | normalize_method: str = "zscore",
42 | transformation: bool = False,
43 | transformation_method: str = "yeo-johnson",
44 | handle_unknown_categorical: bool = True,
45 | unknown_categorical_method: str = "least_frequent",
46 | pca: bool = False,
47 | pca_method: str = "linear",
48 | pca_components: Optional[float] = None,
49 | ignore_low_variance: bool = False,
50 | combine_rare_levels: bool = False,
51 | rare_level_threshold: float = 0.10,
52 | bin_numeric_features: Optional[List[str]] = None,
53 | remove_outliers: bool = False,
54 | outliers_threshold: float = 0.05,
55 | remove_multicollinearity: bool = False,
56 | multicollinearity_threshold: float = 0.9,
57 | remove_perfect_collinearity: bool = True,
58 | create_clusters: bool = False,
59 | cluster_iter: int = 20,
60 | polynomial_features: bool = False,
61 | polynomial_degree: int = 2,
62 | trigonometry_features: bool = False,
63 | polynomial_threshold: float = 0.1,
64 | group_features: Optional[List[str]] = None,
65 | group_names: Optional[List[str]] = None,
66 | feature_selection: bool = False,
67 | feature_selection_threshold: float = 0.8,
68 | feature_selection_method: str = "classic",
69 | feature_interaction: bool = False,
70 | feature_ratio: bool = False,
71 | interaction_threshold: float = 0.01,
72 | transform_target: bool = False,
73 | transform_target_method: str = "box-cox",
74 | data_split_shuffle: bool = True,
75 | data_split_stratify: Union[bool, List[str]] = False,
76 | fold_strategy: Union[str, Any] = "kfold",
77 | fold: int = 10,
78 | fold_shuffle: bool = False,
79 | fold_groups: Optional[Union[str, pd.DataFrame]] = None,
80 | n_jobs: Optional[int] = -1,
81 | use_gpu: bool = False,
82 | custom_pipeline: Union[
83 | Any, Tuple[str, Any], List[Any], List[Tuple[str, Any]]
84 | ] = None,
85 | html: bool = True,
86 | session_id: Optional[int] = None,
87 | log_experiment: bool = False,
88 | experiment_name: Optional[str] = None,
89 | log_plots: Union[bool, list] = False,
90 | log_profile: bool = False,
91 | log_data: bool = False,
92 | silent: bool = False,
93 | verbose: bool = True,
94 | profile: bool = False,
95 | profile_kwargs: Dict[str, Any] = None,
96 | ):
97 | """
98 | This function initializes the training environment and creates the transformation
99 | pipeline. The initializer function must be called before executing any other function.
100 | It takes two mandatory parameters: ``data`` and ``target``. All the other parameters
101 | are optional.
102 |
103 | Example
104 | -------
105 | >>> from PyRapidML.datasets import extract_data
106 | >>> boston = extract_data('boston')
107 | >>> from PyRapidML.regression import *
108 | >>> exp_name = initializer(data = boston, target = 'medv')
109 |
110 |
111 | data : pandas.DataFrame
112 | Shape (n_samples, n_features), where n_samples is the number of samples and
113 | n_features is the number of features.
114 |
115 |
116 | target: str
117 | Name of the target column to be passed in as a string. For regression, the
118 | target variable must be continuous.
119 |
120 |
121 | train_size: float, default = 0.7
122 | Proportion of the dataset to be used for training and validation. Should be
123 | between 0.0 and 1.0.
124 |
125 |
126 | test_data: pandas.DataFrame, default = None
127 | If not None, test_data is used as a hold-out set and ``train_size`` parameter is
128 | ignored. test_data must be labelled and the shape of data and test_data must
129 | match.
130 |
131 |
132 | preprocess: bool, default = True
133 | When set to False, no transformations are applied except for train_test_split
134 | and custom transformations passed in ``custom_pipeline`` param. Data must be
135 | ready for modeling (no missing values, no dates, categorical data encoding),
136 | when preprocess is set to False.
137 |
138 |
139 | imputation_type: str, default = 'simple'
140 | The type of imputation to use. Can be either 'simple' or 'iterative'.
141 |
142 |
143 | iterative_imputation_iters: int, default = 5
144 | Number of iterations. Ignored when ``imputation_type`` is not 'iterative'.
145 |
146 |
147 | categorical_features: list of str, default = None
148 | If the inferred data types are not correct or the silent param is set to True,
149 | categorical_features param can be used to overwrite or define the data types.
150 | It takes a list of strings with column names that are categorical.
151 |
152 |
153 | categorical_imputation: str, default = 'constant'
154 | Missing values in categorical features are imputed with a constant 'not_available'
155 | value. The other available option is 'mode'.
156 |
157 |
158 | categorical_iterative_imputer: str, default = 'lightgbm'
159 | Estimator for iterative imputation of missing values in categorical features.
160 | Ignored when ``imputation_type`` is not 'iterative'.
161 |
162 |
163 | ordinal_features: dict, default = None
164 | Encode categorical features as ordinal. For example, a categorical feature with
165 | 'low', 'medium', 'high' values where low < medium < high can be passed as
166 | ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
167 |
168 |
169 | high_cardinality_features: list of str, default = None
170 | When categorical features contain many levels, they can be compressed into fewer
171 | levels using this parameter. It takes a list of strings with column names that
172 | are categorical.
173 |
174 |
175 | high_cardinality_method: str, default = 'frequency'
176 | Categorical features with high cardinality are replaced with the frequency of
177 | values in each level occurring in the training dataset. Other available method
178 | is 'clustering' which trains the K-Means clustering algorithm on the statistical
179 | attribute of the training data and replaces the original value of feature with the
180 | cluster label. The number of clusters is determined by optimizing Calinski-Harabasz
181 | and Silhouette criterion.
182 |
183 |
184 | numeric_features: list of str, default = None
185 | If the inferred data types are not correct or the silent param is set to True,
186 | numeric_features param can be used to overwrite or define the data types.
187 | It takes a list of strings with column names that are numeric.
188 |
189 |
190 | numeric_imputation: str, default = 'mean'
191 | Missing values in numeric features are imputed with 'mean' value of the feature
192 | in the training dataset. Other available options are 'median' and 'zero'.
193 |
194 |
195 | numeric_iterative_imputer: str, default = 'lightgbm'
196 | Estimator for iterative imputation of missing values in numeric features.
197 | Ignored when ``imputation_type`` is set to 'simple'.
198 |
199 |
200 | date_features: list of str, default = None
201 | If the inferred data types are not correct or the silent param is set to True,
202 | date_features param can be used to overwrite or define the data types. It takes
203 | a list of strings with column names that are DateTime.
204 |
205 |
206 | ignore_features: list of str, default = None
207 | ignore_features param can be used to ignore features during model training.
208 | It takes a list of strings with column names that are to be ignored.
209 |
210 |
211 | normalize: bool, default = False
212 | When set to True, it transforms the numeric features by scaling them to a given
213 | range. Type of scaling is defined by the ``normalize_method`` parameter.
214 |
215 |
216 | normalize_method: str, default = 'zscore'
217 | Defines the method for scaling. By default, normalize method is set to 'zscore'
218 | The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize``
219 | is not True. The other options are:
220 |
221 | - minmax: scales and translates each feature individually such that it is in
222 | the range of 0 - 1.
223 | - maxabs: scales and translates each feature individually such that the
224 | maximal absolute value of each feature will be 1.0. It does not
225 | shift/center the data, and thus does not destroy any sparsity.
226 | - robust: scales and translates each feature according to the Interquartile
227 | range. When the dataset contains outliers, robust scaler often gives
228 | better results.
229 |
230 |
231 | transformation: bool, default = False
232 | When set to True, it applies the power transform to make data more Gaussian-like.
233 | Type of transformation is defined by the ``transformation_method`` parameter.
234 |
235 |
236 | transformation_method: str, default = 'yeo-johnson'
237 | Defines the method for transformation. By default, the transformation method is
238 | set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
239 | Ignored when ``transformation`` is not True.
240 |
241 |
242 | handle_unknown_categorical: bool, default = True
243 | When set to True, unknown categorical levels in unseen data are replaced by the
244 | most or least frequent level as learned in the training dataset.
245 |
246 |
247 | unknown_categorical_method: str, default = 'least_frequent'
248 | Method used to replace unknown categorical levels in unseen data. Method can be
249 | set to 'least_frequent' or 'most_frequent'.
250 |
251 |
252 | pca: bool, default = False
253 | When set to True, dimensionality reduction is applied to project the data into
254 | a lower dimensional space using the method defined in ``pca_method`` parameter.
255 |
256 |
257 | pca_method: str, default = 'linear'
258 | The 'linear' method performs linear dimensionality reduction using Singular Value Decomposition. Other options are:
259 |
260 | - kernel: dimensionality reduction through the use of RBF kernel.
261 | - incremental: replacement for 'linear' pca when the dataset is too large.
262 |
263 |
264 | pca_components: int or float, default = None
265 | Number of components to keep. If pca_components is a float, it is treated as a
266 | target percentage for information retention. When pca_components is an integer
267 | it is treated as the number of features to be kept. pca_components must be less
268 | than the original number of features. Ignored when ``pca`` is not True.
269 |
270 |
271 | ignore_low_variance: bool, default = False
272 | When set to True, all categorical features with insignificant variances are
273 | removed from the data. The variance is calculated using the ratio of unique
274 | values to the number of samples, and the ratio of the most common value to the
275 | frequency of the second most common value.
276 |
277 |
278 | combine_rare_levels: bool, default = False
279 | When set to True, levels in categorical features whose frequency percentile is
280 | below a certain threshold are combined into a single level.
281 |
282 |
283 | rare_level_threshold: float, default = 0.1
284 | Percentile distribution below which rare categories are combined. Ignored when
285 | ``combine_rare_levels`` is not True.
286 |
287 |
288 | bin_numeric_features: list of str, default = None
289 | To convert numeric features into categorical, bin_numeric_features parameter can
290 | be used. It takes a list of strings with column names to be discretized. It does
291 | so by using the 'sturges' rule to determine the number of clusters and then applying
292 | the KMeans algorithm. Original values of the feature are then replaced by the
293 | cluster label.
294 |
295 |
296 | remove_outliers: bool, default = False
297 | When set to True, outliers from the training data are removed using the Singular
298 | Value Decomposition.
299 |
300 |
301 | outliers_threshold: float, default = 0.05
302 | The percentage outliers to be removed from the training dataset. Ignored when
303 | ``remove_outliers`` is not True.
304 |
305 |
306 | remove_multicollinearity: bool, default = False
307 | When set to True, features with the inter-correlations higher than the defined
308 | threshold are removed. When two features are highly correlated with each other,
309 | the feature that is less correlated with the target variable is removed. Only
310 | considers numeric features.
311 |
312 |
313 | multicollinearity_threshold: float, default = 0.9
314 | Threshold for correlated features. Ignored when ``remove_multicollinearity``
315 | is not True.
316 |
317 |
318 | remove_perfect_collinearity: bool, default = True
319 | When set to True, perfect collinearity (features with correlation = 1) is removed
320 | from the dataset. When two features are 100% correlated, one of them is randomly
321 | removed from the dataset.
322 |
323 |
324 | create_clusters: bool, default = False
325 | When set to True, an additional feature is created in training dataset where each
326 | instance is assigned to a cluster. The number of clusters is determined by
327 | optimizing Calinski-Harabasz and Silhouette criterion.
328 |
329 |
330 | cluster_iter: int, default = 20
331 | Number of iterations for creating clusters. Each iteration represents cluster
332 | size. Ignored when ``create_clusters`` is not True.
333 |
334 |
335 | polynomial_features: bool, default = False
336 | When set to True, new features are derived using existing numeric features.
337 |
338 |
339 | polynomial_degree: int, default = 2
340 | Degree of polynomial features. For example, if an input sample is two dimensional
341 | and of the form [a, b], the polynomial features with degree = 2 are:
342 | [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True.
343 |
344 |
345 | trigonometry_features: bool, default = False
346 | When set to True, new features are derived using existing numeric features.
347 |
348 |
349 | polynomial_threshold: float, default = 0.1
350 | When ``polynomial_features`` or ``trigonometry_features`` is True, new features
351 | are derived from the existing numeric features. This may sometimes result in too
352 | large a feature space. The polynomial_threshold parameter can be used to deal with
353 | this problem. It does so by using a combination of Random Forest, AdaBoost and Linear
354 | correlation. All derived features that fall within the percentile distribution
355 | are kept and the rest of the features are removed.
356 |
357 |
358 | group_features: list or list of list, default = None
359 | When the dataset contains features with related characteristics, group_features
360 | parameter can be used for feature extraction. It takes a list of strings with
361 | column names that are related.
362 |
363 |
364 | group_names: list, default = None
365 | Group names to be used in naming new features. When the length of group_names
366 | does not match with the length of ``group_features``, new features are named
367 | sequentially group_1, group_2, etc. It is ignored when ``group_features`` is
368 | None.
369 |
370 |
371 | feature_selection: bool, default = False
372 | When set to True, a subset of features are selected using a combination of
373 | various permutation importance techniques including Random Forest, Adaboost
374 | and Linear correlation with target variable. The size of the subset is
375 | dependent on the ``feature_selection_threshold`` parameter.
376 |
377 |
378 | feature_selection_threshold: float, default = 0.8
379 | Threshold value used for feature selection. When ``polynomial_features`` or
380 | ``feature_interaction`` is True, it is recommended to keep the threshold low
381 | to avoid large feature spaces. Setting a very low value may be efficient but
382 | could result in under-fitting.
383 |
384 |
385 | feature_selection_method: str, default = 'classic'
386 | Algorithm for feature selection. 'classic' method uses permutation feature
387 | importance techniques. Other possible value is 'boruta' which uses boruta
388 | algorithm for feature selection.
389 |
390 |
391 | feature_interaction: bool, default = False
392 | When set to True, new features are created by interacting (a * b) all the
393 | numeric variables in the dataset. This feature is not scalable and may not
394 | work as expected on datasets with large feature space.
395 |
396 |
397 | feature_ratio: bool, default = False
398 | When set to True, new features are created by calculating the ratios (a / b)
399 | between all numeric variables in the dataset. This feature is not scalable and
400 | may not work as expected on datasets with large feature space.
401 |
402 |
403 | interaction_threshold: float, default = 0.01
404 | Similar to polynomial_threshold, it is used to compress a sparse matrix of newly
405 | created features through interaction. Features whose importance based on the
406 | combination of Random Forest, AdaBoost and Linear correlation falls within the
407 | percentile of the defined threshold are kept in the dataset. Remaining features
408 | are dropped before further processing.
409 |
410 |
411 | transform_target: bool, default = False
412 | When set to True, target variable is transformed using the method defined in
413 | ``transform_target_method`` param. Target transformation is applied separately
414 | from feature transformations.
415 |
416 |
417 | transform_target_method: str, default = 'box-cox'
418 | 'Box-cox' and 'yeo-johnson' methods are supported. Box-Cox requires input data to
419 | be strictly positive, while Yeo-Johnson supports both positive or negative data.
420 | When transform_target_method is 'box-cox' and target variable contains negative
421 | values, method is internally forced to 'yeo-johnson' to avoid exceptions.
422 |
423 |
424 | data_split_shuffle: bool, default = True
425 | When set to False, prevents shuffling of rows during 'train_test_split'.
426 |
427 |
428 | data_split_stratify: bool or list, default = False
429 | Controls stratification during 'train_test_split'. When set to True, will
430 | stratify by target column. To stratify on any other columns, pass a list of
431 | column names. Ignored when ``data_split_shuffle`` is False.
432 |
433 |
434 | fold_strategy: str or sklearn CV generator object, default = 'kfold'
435 | Choice of cross validation strategy. Possible values are:
436 |
437 | * 'kfold'
438 | * 'stratifiedkfold'
439 | * 'groupkfold'
440 | * 'timeseries'
441 | * a custom CV generator object compatible with scikit-learn.
442 |
443 |
444 | fold: int, default = 10
445 | Number of folds to be used in cross validation. Must be at least 2. This is
446 | a global setting that can be over-written at function level by using ``fold``
447 | parameter. Ignored when ``fold_strategy`` is a custom object.
448 |
449 |
450 | fold_shuffle: bool, default = False
451 | Controls the shuffle parameter of CV. Only applicable when ``fold_strategy``
452 | is 'kfold' or 'stratifiedkfold'. Ignored when ``fold_strategy`` is a custom
453 | object.
454 |
455 |
456 | fold_groups: str or array-like, with shape (n_samples,), default = None
457 | Optional group labels when 'GroupKFold' is used for the cross validation.
458 | It takes an array with shape (n_samples, ) where n_samples is the number
459 | of rows in the training dataset. When a string is passed, it is interpreted
460 | as the column name in the dataset containing group labels.
461 |
462 |
463 | n_jobs: int, default = -1
464 | The number of jobs to run in parallel (for functions that support parallel
465 | processing). -1 means using all processors. To run all functions on a single
466 | processor, set n_jobs to None.
467 |
468 |
469 | use_gpu: bool or str, default = False
470 | When set to True, it will use GPU for training with algorithms that support it,
471 | and fall back to CPU if they are unavailable. When set to 'force', it will only
472 | use GPU-enabled algorithms and raise exceptions when they are unavailable. When
473 | False, all algorithms are trained using CPU only.
474 |
475 | GPU enabled algorithms:
476 |
477 | - Extreme Gradient Boosting, requires no further installation
478 |
479 | - CatBoost Regressor, requires no further installation
480 | (GPU is only enabled when data > 50,000 rows)
481 |
482 | - Light Gradient Boosting Machine, requires GPU installation
483 | https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html
484 |
485 | - Linear Regression, Lasso Regression, Ridge Regression, K Neighbors Regressor,
486 | Random Forest, Support Vector Regression, Elastic Net requires cuML >= 0.15
487 | https://github.com/rapidsai/cuml
488 |
489 |
490 | custom_pipeline: (str, transformer) or list of (str, transformer), default = None
491 | When passed, the custom transformers are appended to the preprocessing pipeline
492 | and applied on each CV fold separately and on the final fit. All the custom
493 | transformations are applied after 'train_test_split' and before PyRapidML's internal
494 | transformations.
495 |
496 |
497 | html: bool, default = True
498 | When set to False, prevents runtime display of monitor. This must be set to False
499 | when the environment does not support IPython. For example, command line terminal,
500 | Databricks Notebook, Spyder and other similar IDEs.
501 |
502 |
503 | session_id: int, default = None
504 | Controls the randomness of experiment. It is equivalent to 'random_state' in
505 | scikit-learn. When None, a pseudo random number is generated. This can be used
506 | for later reproducibility of the entire experiment.
507 |
508 |
509 | log_experiment: bool, default = False
510 | When set to True, all metrics and parameters are logged on the ``MLFlow`` server.
511 |
512 |
513 | experiment_name: str, default = None
514 | Name of the experiment for logging. Ignored when ``log_experiment`` is not True.
515 |
516 |
517 | log_plots: bool or list, default = False
518 | When set to True, certain plots are logged automatically in the ``MLFlow`` server.
519 | To change the type of plots to be logged, pass a list containing plot IDs. Refer
520 | to documentation of ``plot_model``. Ignored when ``log_experiment`` is not True.
521 |
522 |
523 | log_profile: bool, default = False
524 | When set to True, data profile is logged on the ``MLflow`` server as a html file.
525 | Ignored when ``log_experiment`` is not True.
526 |
527 |
528 | log_data: bool, default = False
529 | When set to True, dataset is logged on the ``MLflow`` server as a csv file.
530 | Ignored when ``log_experiment`` is not True.
531 |
532 |
533 | silent: bool, default = False
534 | Controls the confirmation input of data types when ``setup`` is executed. When
535 | executing in completely automated mode or on a remote kernel, this must be True.
536 |
537 |
538 | verbose: bool, default = True
539 | When set to False, the information grid is not printed.
540 |
541 |
542 | profile: bool, default = False
543 | When set to True, an interactive EDA report is displayed.
544 |
545 |
546 | profile_kwargs: dict, default = {} (empty dict)
547 | Dictionary of arguments passed to the ProfileReport class used
548 | to create the EDA report. Ignored if ``profile`` is False.
549 |
550 |
551 | Returns:
552 | Global variables that can be changed using the ``set_config`` function.
553 |
554 | """
555 | available_plots = {
556 | "parameter": "Hyperparameters",
557 | "residuals": "Residuals",
558 | "error": "Prediction Error",
559 | "cooks": "Cooks Distance",
560 | "rfe": "Feature Selection",
561 | "learning": "Learning Curve",
562 | "manifold": "Manifold Learning",
563 | "vc": "Validation Curve",
564 | "feature": "Feature Importance",
565 | "feature_all": "Feature Importance (All)",
566 | "tree": "Decision Tree",
567 | "residuals_interactive": "Interactive Residuals",
568 | }
569 |
570 | if log_plots == True:
571 | log_plots = ["residuals", "error", "feature"]
572 |
573 | return pycaret.internal.tabular.setup(
574 | ml_usecase="regression",
575 | available_plots=available_plots,
576 | data=data,
577 | target=target,
578 | train_size=train_size,
579 | test_data=test_data,
580 | preprocess=preprocess,
581 | imputation_type=imputation_type,
582 | iterative_imputation_iters=iterative_imputation_iters,
583 | categorical_features=categorical_features,
584 | categorical_imputation=categorical_imputation,
585 | categorical_iterative_imputer=categorical_iterative_imputer,
586 | ordinal_features=ordinal_features,
587 | high_cardinality_features=high_cardinality_features,
588 | high_cardinality_method=high_cardinality_method,
589 | numeric_features=numeric_features,
590 | numeric_imputation=numeric_imputation,
591 | numeric_iterative_imputer=numeric_iterative_imputer,
592 | date_features=date_features,
593 | ignore_features=ignore_features,
594 | normalize=normalize,
595 | normalize_method=normalize_method,
596 | transformation=transformation,
597 | transformation_method=transformation_method,
598 | handle_unknown_categorical=handle_unknown_categorical,
599 | unknown_categorical_method=unknown_categorical_method,
600 | pca=pca,
601 | pca_method=pca_method,
602 | pca_components=pca_components,
603 | ignore_low_variance=ignore_low_variance,
604 | combine_rare_levels=combine_rare_levels,
605 | rare_level_threshold=rare_level_threshold,
606 | bin_numeric_features=bin_numeric_features,
607 | remove_outliers=remove_outliers,
608 | outliers_threshold=outliers_threshold,
609 | remove_multicollinearity=remove_multicollinearity,
610 | multicollinearity_threshold=multicollinearity_threshold,
611 | remove_perfect_collinearity=remove_perfect_collinearity,
612 | create_clusters=create_clusters,
613 | cluster_iter=cluster_iter,
614 | polynomial_features=polynomial_features,
615 | polynomial_degree=polynomial_degree,
616 | trigonometry_features=trigonometry_features,
617 | polynomial_threshold=polynomial_threshold,
618 | group_features=group_features,
619 | group_names=group_names,
620 | feature_selection=feature_selection,
621 | feature_selection_threshold=feature_selection_threshold,
622 | feature_selection_method=feature_selection_method,
623 | feature_interaction=feature_interaction,
624 | feature_ratio=feature_ratio,
625 | interaction_threshold=interaction_threshold,
626 | transform_target=transform_target,
627 | transform_target_method=transform_target_method,
628 | data_split_shuffle=data_split_shuffle,
629 | data_split_stratify=data_split_stratify,
630 | fold_strategy=fold_strategy,
631 | fold=fold,
632 | fold_shuffle=fold_shuffle,
633 | fold_groups=fold_groups,
634 | n_jobs=n_jobs,
635 | use_gpu=use_gpu,
636 | custom_pipeline=custom_pipeline,
637 | html=html,
638 | session_id=session_id,
639 | log_experiment=log_experiment,
640 | experiment_name=experiment_name,
641 | log_plots=log_plots,
642 | log_profile=log_profile,
643 | log_data=log_data,
644 | silent=silent,
645 | verbose=verbose,
646 | profile=profile,
647 | profile_kwargs=profile_kwargs,
648 | )
649 |
650 |
651 | def comparing_models(
652 | include: Optional[List[Union[str, Any]]] = None,
653 | exclude: Optional[List[str]] = None,
654 | fold: Optional[Union[int, Any]] = None,
655 | round: int = 4,
656 | cross_validation: bool = True,
657 | sort: str = "R2",
658 | n_select: int = 1,
659 | budget_time: Optional[float] = None,
660 | turbo: bool = True,
661 | errors: str = "ignore",
662 | fit_kwargs: Optional[dict] = None,
663 | groups: Optional[Union[str, Any]] = None,
664 | verbose: bool = True,
665 | ):
666 |
667 | """
668 | This function trains and evaluates the performance of all estimators available in
669 | the model library using cross validation. The output of this function is a score
670 | grid with average cross-validated scores. Metrics evaluated during CV can be
671 | accessed using the ``get_metrics`` function. Custom metrics can be added or removed
672 | using the ``add_metric`` and ``remove_metric`` functions.
673 |
674 |
675 | Example
676 | --------
677 | >>> from PyRapidML.datasets import extract_data
678 | >>> boston = extract_data('boston')
679 | >>> from PyRapidML.regression import *
680 | >>> exp_name = initializer(data = boston, target = 'medv')
681 | >>> best_model = comparing_models()
682 |
683 |
684 | include: list of str or scikit-learn compatible object, default = None
685 | To train and evaluate select models, pass a list containing model IDs or
686 | scikit-learn compatible objects in the include param. To see a list of all
687 | models available in the model library, use the ``models`` function.
688 |
689 |
690 | exclude: list of str, default = None
691 | To omit certain models from training and evaluation, pass a list containing
692 | model IDs in the exclude parameter. To see a list of all models available
693 | in the model library, use the ``models`` function.
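|
| For illustration, a minimal sketch (model IDs as listed by ``models``):
|
| >>> best_linear = comparing_models(include = ['lr', 'lasso', 'ridge'])
| >>> best_of_rest = comparing_models(exclude = ['catboost'])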
694 |
695 |
696 | fold: int or scikit-learn compatible CV generator, default = None
697 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
698 | parameter of the ``setup`` function is used. When an integer is passed,
699 | it is interpreted as the 'n_splits' parameter of the CV generator in the
700 | ``setup`` function.
701 |
702 |
703 | round: int, default = 4
704 | Number of decimal places the metrics in the score grid will be rounded to.
705 |
706 |
707 | cross_validation: bool, default = True
708 | When set to False, metrics are evaluated on the holdout set. The ``fold``
709 | param is ignored when cross_validation is set to False.
710 |
711 |
712 | sort: str, default = 'R2'
713 | The sort order of the score grid. It also accepts custom metrics that are
714 | added through the ``add_metric`` function.
715 |
716 |
717 | n_select: int, default = 1
718 | Number of top_n models to return. For example, to select top 3 models use
719 | n_select = 3.
720 |
721 |
722 | budget_time: int or float, default = None
723 | If not None, will terminate execution of the function after budget_time
724 | minutes have passed and return results up to that point.
725 |
726 |
727 | turbo: bool, default = True
728 | When set to True, it excludes estimators with longer training times. To
729 | see which algorithms are excluded use the ``models`` function.
730 |
731 |
732 | errors: str, default = 'ignore'
733 | When set to 'ignore', will skip models that raise exceptions and continue.
734 | If 'raise', will break the function when exceptions are raised.
735 |
736 |
737 | fit_kwargs: dict, default = {} (empty dict)
738 | Dictionary of arguments passed to the fit method of the model.
739 |
740 |
741 | groups: str or array-like, with shape (n_samples,), default = None
742 | Optional group labels when 'GroupKFold' is used for the cross validation.
743 | It takes an array with shape (n_samples, ) where n_samples is the number
744 | of rows in the training dataset. When string is passed, it is interpreted
745 | as the column name in the dataset containing group labels.
746 |
747 |
748 | verbose: bool, default = True
749 | Score grid is not printed when verbose is set to False.
750 |
751 |
752 | Returns:
753 | Trained model or list of trained models, depending on the ``n_select`` param.
754 |
755 |
756 | Warnings
757 | --------
758 | - Changing the turbo parameter to False may result in very high training times
759 | with datasets exceeding 10,000 rows.
760 |
761 | - No models are logged on the ``MLflow`` server when the ``cross_validation`` parameter is False.
762 |
763 | """
764 |
765 | return pycaret.internal.tabular.compare_models(
766 | include=include,
767 | exclude=exclude,
768 | fold=fold,
769 | round=round,
770 | cross_validation=cross_validation,
771 | sort=sort,
772 | n_select=n_select,
773 | budget_time=budget_time,
774 | turbo=turbo,
775 | errors=errors,
776 | fit_kwargs=fit_kwargs,
777 | groups=groups,
778 | verbose=verbose,
779 | )
780 |
781 |
782 | def creating_model(
783 | estimator: Union[str, Any],
784 | fold: Optional[Union[int, Any]] = None,
785 | round: int = 4,
786 | cross_validation: bool = True,
787 | fit_kwargs: Optional[dict] = None,
788 | groups: Optional[Union[str, Any]] = None,
789 | verbose: bool = True,
790 | **kwargs,
791 | ):
792 |
793 | """
794 | This function trains and evaluates the performance of a given estimator
795 | using cross validation. The output of this function is a score grid with
796 | CV scores by fold. Metrics evaluated during CV can be accessed using the
797 | ``get_metrics`` function. Custom metrics can be added or removed using the
798 | ``add_metric`` and ``remove_metric`` functions. All the available models
799 | can be accessed using the ``models`` function.
800 |
801 |
802 | Example
803 | -------
804 | >>> from PyRapidML.datasets import extract_data
805 | >>> boston = extract_data('boston')
806 | >>> from PyRapidML.regression import *
807 | >>> exp_name = initializer(data = boston, target = 'medv')
808 | >>> lr = creating_model('lr')
809 |
810 |
811 |
812 | estimator: str or scikit-learn compatible object
813 | ID of an estimator available in the model library, or an untrained
814 | model object consistent with the scikit-learn API. Estimators available
815 | in the model library (ID - Name):
816 |
817 | * 'lr' - Linear Regression
818 | * 'lasso' - Lasso Regression
819 | * 'ridge' - Ridge Regression
820 | * 'en' - Elastic Net
821 | * 'lar' - Least Angle Regression
822 | * 'llar' - Lasso Least Angle Regression
823 | * 'omp' - Orthogonal Matching Pursuit
824 | * 'br' - Bayesian Ridge
825 | * 'ard' - Automatic Relevance Determination
826 | * 'par' - Passive Aggressive Regressor
827 | * 'ransac' - Random Sample Consensus
828 | * 'tr' - TheilSen Regressor
829 | * 'huber' - Huber Regressor
830 | * 'kr' - Kernel Ridge
831 | * 'svm' - Support Vector Regression
832 | * 'knn' - K Neighbors Regressor
833 | * 'dt' - Decision Tree Regressor
834 | * 'rf' - Random Forest Regressor
835 | * 'et' - Extra Trees Regressor
836 | * 'ada' - AdaBoost Regressor
837 | * 'gbr' - Gradient Boosting Regressor
838 | * 'mlp' - MLP Regressor
839 | * 'xgboost' - Extreme Gradient Boosting
840 | * 'lightgbm' - Light Gradient Boosting Machine
841 | * 'catboost' - CatBoost Regressor
842 |
843 |
844 | fold: int or scikit-learn compatible CV generator, default = None
845 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
846 | parameter of the ``setup`` function is used. When an integer is passed,
847 | it is interpreted as the 'n_splits' parameter of the CV generator in the
848 | ``setup`` function.
849 |
850 |
851 | round: int, default = 4
852 | Number of decimal places the metrics in the score grid will be rounded to.
853 |
854 |
855 | cross_validation: bool, default = True
856 | When set to False, metrics are evaluated on the holdout set. The ``fold``
857 | param is ignored when cross_validation is set to False.
858 |
859 |
860 | fit_kwargs: dict, default = {} (empty dict)
861 | Dictionary of arguments passed to the fit method of the model.
862 |
863 |
864 | groups: str or array-like, with shape (n_samples,), default = None
865 | Optional group labels when GroupKFold is used for the cross validation.
866 | It takes an array with shape (n_samples, ) where n_samples is the number
867 | of rows in training dataset. When string is passed, it is interpreted as
868 | the column name in the dataset containing group labels.
869 |
870 |
871 | verbose: bool, default = True
872 | Score grid is not printed when verbose is set to False.
873 |
874 |
875 | **kwargs:
876 | Additional keyword arguments to pass to the estimator.
877 |
878 |
879 | Returns:
880 | Trained Model
881 |
882 |
883 | Warnings
884 | --------
885 | - Models are not logged on the ``MLflow`` server when the ``cross_validation`` param
886 | is set to False.
887 |
888 | """
889 |
890 | return pycaret.internal.tabular.create_model_supervised(
891 | estimator=estimator,
892 | fold=fold,
893 | round=round,
894 | cross_validation=cross_validation,
895 | fit_kwargs=fit_kwargs,
896 | groups=groups,
897 | verbose=verbose,
898 | **kwargs,
899 | )
900 |
901 |
902 | def tuning_model(
903 | estimator,
904 | fold: Optional[Union[int, Any]] = None,
905 | round: int = 4,
906 | n_iter: int = 10,
907 | custom_grid: Optional[Union[Dict[str, list], Any]] = None,
908 | optimize: str = "R2",
909 | custom_scorer=None,
910 | search_library: str = "scikit-learn",
911 | search_algorithm: Optional[str] = None,
912 | early_stopping: Any = False,
913 | early_stopping_max_iters: int = 10,
914 | choose_better: bool = False,
915 | fit_kwargs: Optional[dict] = None,
916 | groups: Optional[Union[str, Any]] = None,
917 | return_tuner: bool = False,
918 | verbose: bool = True,
919 | tuner_verbose: Union[int, bool] = True,
920 | **kwargs,
921 | ):
922 |
923 | """
924 | This function tunes the hyperparameters of a given estimator. The output of
925 | this function is a score grid with CV scores by fold of the best selected
926 | model based on ``optimize`` parameter. Metrics evaluated during CV can be
927 | accessed using the ``get_metrics`` function. Custom metrics can be added
928 | or removed using the ``add_metric`` and ``remove_metric`` functions.
929 |
930 |
931 | Example
932 | -------
933 | >>> from PyRapidML.datasets import extract_data
934 | >>> boston = extract_data('boston')
935 | >>> from PyRapidML.regression import *
936 | >>> exp_name = initializer(data = boston, target = 'medv')
937 | >>> lr = creating_model('lr')
938 | >>> tuned_lr = tuning_model(lr)
939 |
940 |
941 | estimator: scikit-learn compatible object
942 | Trained model object
943 |
944 |
945 | fold: int or scikit-learn compatible CV generator, default = None
946 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
947 | parameter of the ``setup`` function is used. When an integer is passed,
948 | it is interpreted as the 'n_splits' parameter of the CV generator in the
949 | ``setup`` function.
950 |
951 |
952 | round: int, default = 4
953 | Number of decimal places the metrics in the score grid will be rounded to.
954 |
955 |
956 | n_iter: int, default = 10
957 | Number of iterations in the grid search. Increasing 'n_iter' may improve
958 | model performance but also increases the training time.
959 |
960 |
961 | custom_grid: dictionary, default = None
962 | To define custom search space for hyperparameters, pass a dictionary with
963 | parameter name and values to be iterated. Custom grids must be in a format
964 | supported by the defined ``search_library``.
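|
| For illustration, a minimal sketch tuning the 'alpha' of a Ridge model with
| the default scikit-learn search library:
|
| >>> ridge = creating_model('ridge')
| >>> tuned_ridge = tuning_model(ridge,
| ...     custom_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]})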
965 |
966 |
967 | optimize: str, default = 'R2'
968 | Metric name to be evaluated for hyperparameter tuning. It also accepts custom
969 | metrics that are added through the ``add_metric`` function.
970 |
971 |
972 | custom_scorer: object, default = None
973 | A custom scoring strategy can be passed to tune the hyperparameters of the model.
974 | It must be created using ``sklearn.make_scorer``. It is equivalent to adding a
975 | custom metric using the ``add_metric`` function and passing the name of the
976 | custom metric in the ``optimize`` parameter.
977 | Will be deprecated in a future release.
978 |
979 |
980 | search_library: str, default = 'scikit-learn'
981 | The search library used for tuning hyperparameters. Possible values:
982 |
983 | - 'scikit-learn' - default, requires no further installation
984 | https://github.com/scikit-learn/scikit-learn
985 |
986 | - 'scikit-optimize' - ``pip install scikit-optimize``
987 | https://scikit-optimize.github.io/stable/
988 |
989 | - 'tune-sklearn' - ``pip install tune-sklearn ray[tune]``
990 | https://github.com/ray-project/tune-sklearn
991 |
992 | - 'optuna' - ``pip install optuna``
993 | https://optuna.org/
994 |
995 |
996 | search_algorithm: str, default = None
997 | The search algorithm depends on the ``search_library`` parameter.
998 | Some search algorithms require additional libraries to be installed.
999 | If None, will use search library-specific default algorithm.
1000 |
1001 | - 'scikit-learn' possible values:
1002 | - 'random' : random grid search (default)
1003 | - 'grid' : grid search
1004 |
1005 | - 'scikit-optimize' possible values:
1006 | - 'bayesian' : Bayesian search (default)
1007 |
1008 | - 'tune-sklearn' possible values:
1009 | - 'random' : random grid search (default)
1010 | - 'grid' : grid search
1011 | - 'bayesian' : ``pip install scikit-optimize``
1012 | - 'hyperopt' : ``pip install hyperopt``
1013 | - 'optuna' : ``pip install optuna``
1014 | - 'bohb' : ``pip install hpbandster ConfigSpace``
1015 |
1016 | - 'optuna' possible values:
1017 | - 'random' : randomized search
1018 | - 'tpe' : Tree-structured Parzen Estimator search (default)
1019 |
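| For illustration, a minimal sketch using Optuna (assuming ``optuna`` is
| installed):
|
| >>> tuned_lr = tuning_model(lr, search_library = 'optuna',
| ...     search_algorithm = 'tpe', n_iter = 20)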
1020 |
1021 | early_stopping: bool or str or object, default = False
1022 | Use early stopping to stop fitting to a hyperparameter configuration
1023 | if it performs poorly. Ignored when ``search_library`` is scikit-learn,
1024 | or if the estimator does not have a 'partial_fit' attribute. If False or
1025 | None, early stopping will not be used. Can be either an object accepted
1026 | by the search library or one of the following:
1027 |
1028 | - 'asha' for Asynchronous Successive Halving Algorithm
1029 | - 'hyperband' for Hyperband
1030 | - 'median' for Median Stopping Rule
1032 |
1033 |
1034 | early_stopping_max_iters: int, default = 10
1035 | Maximum number of epochs to run for each sampled configuration.
1036 | Ignored if ``early_stopping`` is False or None.
1037 |
1038 |
1039 | choose_better: bool, default = False
1040 | When set to True, the better performing of the original and the tuned model
1041 | is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1042 |
1043 |
1044 | fit_kwargs: dict, default = {} (empty dict)
1045 | Dictionary of arguments passed to the fit method of the tuner.
1046 |
1047 |
1048 | groups: str or array-like, with shape (n_samples,), default = None
1049 | Optional group labels when GroupKFold is used for the cross validation.
1050 | It takes an array with shape (n_samples, ) where n_samples is the number
1051 | of rows in training dataset. When string is passed, it is interpreted as
1052 | the column name in the dataset containing group labels.
1053 |
1054 |
1055 | return_tuner: bool, default = False
1056 | When set to True, will return a tuple of (model, tuner_object).
1057 |
1058 |
1059 | verbose: bool, default = True
1060 | Score grid is not printed when verbose is set to False.
1061 |
1062 |
1063 | tuner_verbose: bool or int, default = True
1064 | If True or above 0, will print messages from the tuner. Higher values
1065 | print more messages. Ignored when ``verbose`` param is False.
1066 |
1067 |
1068 | **kwargs:
1069 | Additional keyword arguments to pass to the optimizer.
1070 |
1071 |
1072 | Returns:
1073 | Trained Model and Optional Tuner Object when ``return_tuner`` is True.
1074 |
1075 |
1076 | Warnings
1077 | --------
1078 | - Using 'grid' as ``search_algorithm`` may result in very long computation times.
1079 | It is only recommended with smaller search spaces that can be defined in the
1080 | ``custom_grid`` parameter.
1081 |
1082 | - ``search_library`` 'tune-sklearn' does not support GPU models.
1083 |
1084 | """
1085 |
1086 | return pycaret.internal.tabular.tune_model_supervised(
1087 | estimator=estimator,
1088 | fold=fold,
1089 | round=round,
1090 | n_iter=n_iter,
1091 | custom_grid=custom_grid,
1092 | optimize=optimize,
1093 | custom_scorer=custom_scorer,
1094 | search_library=search_library,
1095 | search_algorithm=search_algorithm,
1096 | early_stopping=early_stopping,
1097 | early_stopping_max_iters=early_stopping_max_iters,
1098 | choose_better=choose_better,
1099 | fit_kwargs=fit_kwargs,
1100 | groups=groups,
1101 | return_tuner=return_tuner,
1102 | verbose=verbose,
1103 | tuner_verbose=tuner_verbose,
1104 | **kwargs,
1105 | )
1106 |
1107 |
1108 | def ensemble_model(
1109 | estimator,
1110 | method: str = "Bagging",
1111 | fold: Optional[Union[int, Any]] = None,
1112 | n_estimators: int = 10,
1113 | round: int = 4,
1114 | choose_better: bool = False,
1115 | optimize: str = "R2",
1116 | fit_kwargs: Optional[dict] = None,
1117 | groups: Optional[Union[str, Any]] = None,
1118 | verbose: bool = True,
1119 | ) -> Any:
1120 |
1121 | """
1122 | This function ensembles a given estimator. The output of this function is
1123 | a score grid with CV scores by fold. Metrics evaluated during CV can be
1124 | accessed using the ``get_metrics`` function. Custom metrics can be added
1125 | or removed using ``add_metric`` and ``remove_metric`` function.
1126 |
1127 |
1128 | Example
1129 | --------
1130 | >>> from PyRapidML.datasets import extract_data
1131 | >>> boston = extract_data('boston')
1132 | >>> from PyRapidML.regression import *
1133 | >>> exp_name = initializer(data = boston, target = 'medv')
1134 | >>> dt = creating_model('dt')
1135 | >>> bagged_dt = ensemble_model(dt, method = 'Bagging')
1136 | >>> boosted_dt = ensemble_model(dt, method = 'Boosting')
1137 |
1138 |
1139 | estimator: scikit-learn compatible object
1140 | Trained model object
1141 |
1142 |
1143 | method: str, default = 'Bagging'
1144 | Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'.
1145 |
1146 |
1147 | fold: int or scikit-learn compatible CV generator, default = None
1148 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1149 | parameter of the ``setup`` function is used. When an integer is passed,
1150 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1151 | ``setup`` function.
1152 |
1153 |
1154 | n_estimators: int, default = 10
1155 | The number of base estimators in the ensemble. In case of perfect fit, the
1156 | learning procedure is stopped early.
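|
| For example, a minimal sketch increasing the ensemble size:
|
| >>> bagged_dt = ensemble_model(dt, method = 'Bagging', n_estimators = 25)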
1157 |
1158 |
1159 | round: int, default = 4
1160 | Number of decimal places the metrics in the score grid will be rounded to.
1161 |
1162 |
1163 | choose_better: bool, default = False
1164 | When set to True, the better performing of the original and the ensembled model
1165 | is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1166 |
1167 |
1168 | optimize: str, default = 'R2'
1169 | Metric to compare for model selection when ``choose_better`` is True.
1170 |
1171 |
1172 | fit_kwargs: dict, default = {} (empty dict)
1173 | Dictionary of arguments passed to the fit method of the model.
1174 |
1175 |
1176 | groups: str or array-like, with shape (n_samples,), default = None
1177 | Optional group labels when GroupKFold is used for the cross validation.
1178 | It takes an array with shape (n_samples, ) where n_samples is the number
1179 | of rows in training dataset. When string is passed, it is interpreted as
1180 | the column name in the dataset containing group labels.
1181 |
1182 |
1183 | verbose: bool, default = True
1184 | Score grid is not printed when verbose is set to False.
1185 |
1186 |
1187 | Returns:
1188 | Trained Model
1189 |
1190 | """
1191 |
1192 | return pycaret.internal.tabular.ensemble_model(
1193 | estimator=estimator,
1194 | method=method,
1195 | fold=fold,
1196 | n_estimators=n_estimators,
1197 | round=round,
1198 | choose_better=choose_better,
1199 | optimize=optimize,
1200 | fit_kwargs=fit_kwargs,
1201 | groups=groups,
1202 | verbose=verbose,
1203 | )
1204 |
1205 |
1206 | def blend_models(
1207 | estimator_list: list,
1208 | fold: Optional[Union[int, Any]] = None,
1209 | round: int = 4,
1210 | choose_better: bool = False,
1211 | optimize: str = "R2",
1212 | weights: Optional[List[float]] = None,
1213 | fit_kwargs: Optional[dict] = None,
1214 | groups: Optional[Union[str, Any]] = None,
1215 | verbose: bool = True,
1216 | ):
1217 |
1218 | """
1219 | This function trains a Voting Regressor for select models passed in the
1220 | ``estimator_list`` param. The output of this function is a score grid with
1221 | CV scores by fold. Metrics evaluated during CV can be accessed using the
1222 | ``get_metrics`` function. Custom metrics can be added or removed using the
1223 | ``add_metric`` and ``remove_metric`` functions.
1224 |
1225 |
1226 | Example
1227 | --------
1228 | >>> from PyRapidML.datasets import extract_data
1229 | >>> boston = extract_data('boston')
1230 | >>> from PyRapidML.regression import *
1231 | >>> exp_name = initializer(data = boston, target = 'medv')
1232 | >>> top3 = comparing_models(n_select = 3)
1233 | >>> blender = blend_models(top3)
1234 |
1235 |
1236 | estimator_list: list of scikit-learn compatible objects
1237 | List of trained model objects
1238 |
1239 |
1240 | fold: int or scikit-learn compatible CV generator, default = None
1241 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1242 | parameter of the ``setup`` function is used. When an integer is passed,
1243 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1244 | ``setup`` function.
1245 |
1246 |
1247 | round: int, default = 4
1248 | Number of decimal places the metrics in the score grid will be rounded to.
1249 |
1250 |
1251 | choose_better: bool, default = False
1252 | When set to True, the better performing of the individual models and the blended
1253 | model is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1254 |
1255 |
1256 | optimize: str, default = 'R2'
1257 | Metric to compare for model selection when ``choose_better`` is True.
1258 |
1259 |
1260 | weights: list, default = None
1261 | Sequence of weights (float or int) used to weight the predictions of the
1262 | individual models when averaging in the Voting Regressor. Uses uniform
1263 | weights when None.
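|
| For illustration, a minimal sketch weighting three blended models:
|
| >>> blender = blend_models(top3, weights = [0.5, 0.3, 0.2])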
1264 |
1265 |
1266 | fit_kwargs: dict, default = {} (empty dict)
1267 | Dictionary of arguments passed to the fit method of the model.
1268 |
1269 |
1270 | groups: str or array-like, with shape (n_samples,), default = None
1271 | Optional group labels when GroupKFold is used for the cross validation.
1272 | It takes an array with shape (n_samples, ) where n_samples is the number
1273 | of rows in training dataset. When string is passed, it is interpreted as
1274 | the column name in the dataset containing group labels.
1275 |
1276 |
1277 | verbose: bool, default = True
1278 | Score grid is not printed when verbose is set to False.
1279 |
1280 |
1281 | Returns:
1282 | Trained Model
1283 |
1284 |
1285 | """
1286 |
1287 | return pycaret.internal.tabular.blend_models(
1288 | estimator_list=estimator_list,
1289 | fold=fold,
1290 | round=round,
1291 | choose_better=choose_better,
1292 | optimize=optimize,
1293 | method="auto",
1294 | weights=weights,
1295 | fit_kwargs=fit_kwargs,
1296 | groups=groups,
1297 | verbose=verbose,
1298 | )
1299 |
1300 |
1301 | def stack_models(
1302 | estimator_list: list,
1303 | meta_model=None,
1304 | fold: Optional[Union[int, Any]] = None,
1305 | round: int = 4,
1306 | restack: bool = True,
1307 | choose_better: bool = False,
1308 | optimize: str = "R2",
1309 | fit_kwargs: Optional[dict] = None,
1310 | groups: Optional[Union[str, Any]] = None,
1311 | verbose: bool = True,
1312 | ):
1313 |
1314 | """
1315 | This function trains a meta model over select estimators passed in
1316 | the ``estimator_list`` parameter. The output of this function is a
1317 | score grid with CV scores by fold. Metrics evaluated during CV can
1318 | be accessed using the ``get_metrics`` function. Custom metrics
1319 | can be added or removed using the ``add_metric`` and ``remove_metric``
1320 | functions.
1321 |
1322 |
1323 | Example
1324 | --------
1325 | >>> from PyRapidML.datasets import extract_data
1326 | >>> boston = extract_data('boston')
1327 | >>> from PyRapidML.regression import *
1328 | >>> exp_name = initializer(data = boston, target = 'medv')
1329 | >>> top3 = comparing_models(n_select = 3)
1330 | >>> stacker = stack_models(top3)
1331 |
1332 |
1333 | estimator_list: list of scikit-learn compatible objects
1334 | List of trained model objects
1335 |
1336 |
1337 | meta_model: scikit-learn compatible object, default = None
1338 | When None, Linear Regression is trained as a meta model.
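|
| For illustration, a minimal sketch using a Ridge meta model:
|
| >>> ridge = creating_model('ridge')
| >>> stacker = stack_models(top3, meta_model = ridge)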
1339 |
1340 |
1341 | fold: int or scikit-learn compatible CV generator, default = None
1342 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1343 | parameter of the ``setup`` function is used. When an integer is passed,
1344 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1345 | ``setup`` function.
1346 |
1347 |
1348 | round: int, default = 4
1349 | Number of decimal places the metrics in the score grid will be rounded to.
1350 |
1351 |
1352 | restack: bool, default = True
1353 | When set to False, only the predictions of estimators will be used as
1354 | training data for the ``meta_model``.
1355 |
1356 |
1357 | choose_better: bool, default = False
1358 | When set to True, the better performing of the individual models and the stacked
1359 | model is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1360 |
1361 |
1362 | optimize: str, default = 'R2'
1363 | Metric to compare for model selection when ``choose_better`` is True.
1364 |
1365 |
1366 | fit_kwargs: dict, default = {} (empty dict)
1367 | Dictionary of arguments passed to the fit method of the model.
1368 |
1369 |
1370 | groups: str or array-like, with shape (n_samples,), default = None
1371 | Optional group labels when GroupKFold is used for the cross validation.
1372 | It takes an array with shape (n_samples, ) where n_samples is the number
1373 | of rows in training dataset. When string is passed, it is interpreted as
1374 | the column name in the dataset containing group labels.
1375 |
1376 |
1377 | verbose: bool, default = True
1378 | Score grid is not printed when verbose is set to False.
1379 |
1380 |
1381 | Returns:
1382 | Trained Model
1383 |
1384 | """
1385 |
1386 | return pycaret.internal.tabular.stack_models(
1387 | estimator_list=estimator_list,
1388 | meta_model=meta_model,
1389 | fold=fold,
1390 | round=round,
1391 | method="auto",
1392 | restack=restack,
1393 | choose_better=choose_better,
1394 | optimize=optimize,
1395 | fit_kwargs=fit_kwargs,
1396 | groups=groups,
1397 | verbose=verbose,
1398 | )
1399 |
1400 |
1401 | def plot_model(
1402 | estimator,
1403 | plot: str = "residuals",
1404 | scale: float = 1,
1405 | save: bool = False,
1406 | fold: Optional[Union[int, Any]] = None,
1407 | fit_kwargs: Optional[dict] = None,
1408 | groups: Optional[Union[str, Any]] = None,
1409 | use_train_data: bool = False,
1410 | verbose: bool = True,
1411 | display_format: Optional[str] = None,
1412 | ) -> str:
1413 |
1414 | """
1415 | This function analyzes the performance of a trained model on the holdout
1416 | set. It may require re-training the model in certain cases.
1417 |
1418 |
1419 | Example
1420 | --------
1421 | >>> from PyRapidML.datasets import extract_data
1422 | >>> boston = extract_data('boston')
1423 | >>> from PyRapidML.regression import *
1424 | >>> exp_name = initializer(data = boston, target = 'medv')
1425 | >>> lr = creating_model('lr')
1426 | >>> plot_model(lr, plot = 'residuals')
1427 |
1428 |
1429 | estimator: scikit-learn compatible object
1430 | Trained model object
1431 |
1432 |
1433 | plot: str, default = 'residuals'
1434 | List of available plots (ID - Name):
1435 |
1436 | * 'residuals_interactive' - Interactive Residual plots
1437 | * 'residuals' - Residuals Plot
1438 | * 'error' - Prediction Error Plot
1439 | * 'cooks' - Cooks Distance Plot
1440 | * 'rfe' - Recursive Feature Selection
1441 | * 'learning' - Learning Curve
1442 | * 'vc' - Validation Curve
1443 | * 'manifold' - Manifold Learning
1444 | * 'feature' - Feature Importance
1445 | * 'feature_all' - Feature Importance (All)
1446 | * 'parameter' - Model Hyperparameter
1447 | * 'tree' - Decision Tree
1448 |
1449 |
1450 | scale: float, default = 1
1451 | The resolution scale of the figure.
1452 |
1453 |
1454 | save: bool, default = False
1455 | When set to True, plot is saved in the current working directory.
1456 |
1457 |
1458 | fold: int or scikit-learn compatible CV generator, default = None
1459 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1460 | parameter of the ``setup`` function is used. When an integer is passed,
1461 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1462 | ``setup`` function.
1463 |
1464 |
1465 | fit_kwargs: dict, default = {} (empty dict)
1466 | Dictionary of arguments passed to the fit method of the model.
1467 |
1468 |
1469 | groups: str or array-like, with shape (n_samples,), default = None
1470 | Optional group labels when GroupKFold is used for the cross validation.
1471 | It takes an array with shape (n_samples, ) where n_samples is the number
1472 | of rows in training dataset. When string is passed, it is interpreted as
1473 | the column name in the dataset containing group labels.
1474 |
1475 |
1476 | use_train_data: bool, default = False
1477 | When set to True, train data will be used for plots, instead
1478 | of test data.
1479 |
1480 |
1481 | verbose: bool, default = True
1482 | When set to False, progress bar is not displayed.
1483 |
1484 |
1485 | display_format: str, default = None
1486 | To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'.
1487 | Currently, not all plots are supported.
1488 |
1489 |
1490 | Returns:
1491 | None
1492 |
1493 | """
1494 |
1495 | return pycaret.internal.tabular.plot_model(
1496 | estimator=estimator,
1497 | plot=plot,
1498 | scale=scale,
1499 | save=save,
1500 | fold=fold,
1501 | fit_kwargs=fit_kwargs,
1502 | groups=groups,
1503 | verbose=verbose,
1504 | use_train_data=use_train_data,
1505 | system=True,
1506 | display_format=display_format,
1507 | )
1508 |
1509 |
1510 | def evaluate_model(
1511 | estimator,
1512 | fold: Optional[Union[int, Any]] = None,
1513 | fit_kwargs: Optional[dict] = None,
1514 | groups: Optional[Union[str, Any]] = None,
1515 | use_train_data: bool = False,
1516 | ):
1517 |
1518 | """
1519 | This function displays a user interface for analyzing performance of a trained
1520 | model. It calls the ``plot_model`` function internally.
1521 |
1522 | Example
1523 | --------
1524 | >>> from PyRapidML.datasets import extract_data
1525 | >>> boston = extract_data('boston')
1526 | >>> from PyRapidML.regression import *
1527 | >>> exp_name = initializer(data = boston, target = 'medv')
1528 | >>> lr = creating_model('lr')
1529 | >>> evaluate_model(lr)
1530 |
1531 |
1532 | estimator: scikit-learn compatible object
1533 | Trained model object
1534 |
1535 |
1536 | fold: int or scikit-learn compatible CV generator, default = None
1537 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1538 | parameter of the ``setup`` function is used. When an integer is passed,
1539 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1540 | ``setup`` function.
1541 |
1542 |
1543 | fit_kwargs: dict, default = {} (empty dict)
1544 | Dictionary of arguments passed to the fit method of the model.
1545 |
1546 |
1547 | groups: str or array-like, with shape (n_samples,), default = None
1548 | Optional group labels when GroupKFold is used for the cross validation.
1549 | It takes an array with shape (n_samples, ) where n_samples is the number
1550 | of rows in training dataset. When string is passed, it is interpreted as
1551 | the column name in the dataset containing group labels.
1552 |
1553 |
1554 | use_train_data: bool, default = False
1555 | When set to True, train data will be used for plots, instead
1556 | of test data.
1557 |
1558 |
1559 | Returns:
1560 | None
1561 |
1562 |
1563 | Warnings
1564 | --------
1565 | - This function only works in IPython enabled Notebook.
1566 |
1567 | """
1568 |
1569 | return pycaret.internal.tabular.evaluate_model(
1570 | estimator=estimator,
1571 | fold=fold,
1572 | fit_kwargs=fit_kwargs,
1573 | groups=groups,
1574 | use_train_data=use_train_data,
1575 | )
1576 |
1577 |
1578 | def interpret_model(
1579 | estimator,
1580 | plot: str = "summary",
1581 | feature: Optional[str] = None,
1582 | observation: Optional[int] = None,
1583 | use_train_data: bool = False,
1584 | X_new_sample: Optional[pd.DataFrame] = None,
1585 | save: bool = False,
1586 | **kwargs,
1587 | ):
1588 |
1589 | """
1590 | This function analyzes the predictions generated from a tree-based model. It is
1591 | implemented based on SHAP (SHapley Additive exPlanations). For more info,
1592 | please see https://shap.readthedocs.io/en/latest/
1593 |
1594 |
1595 | Example
1596 | --------
1597 | >>> from PyRapidML.datasets import extract_data
1598 | >>> boston = extract_data('boston')
1599 | >>> from PyRapidML.regression import *
1600 | >>> exp_name = initializer(data = boston, target = 'medv')
1601 | >>> xgboost = creating_model('xgboost')
1602 | >>> interpret_model(xgboost)
1603 |
1604 |
1605 | estimator: scikit-learn compatible object
1606 | Trained model object
1607 |
1608 |
1609 | plot: str, default = 'summary'
1610 | Type of plot. Available options are: 'summary', 'correlation', and 'reason'.
1611 |
1612 |
1613 | feature: str, default = None
1614 | Feature to check correlation with. This parameter is only required when ``plot``
1615 | type is 'correlation'. When set to None, it uses the first column in the train
1616 | dataset.
1617 |
1618 |
1619 | observation: int, default = None
1620 | Observation index number in holdout set to explain. When ``plot`` is not
1621 | 'reason', this parameter is ignored.
1622 |
1623 |
1624 | use_train_data: bool, default = False
1625 | When set to True, train data will be used for plots, instead
1626 | of test data.
1627 |
1628 |
1629 | X_new_sample: pd.DataFrame, default = None
1630 | Row from an out-of-sample dataframe (neither train nor test data) to be plotted.
1631 | The sample must have the same columns as the raw input data, and it is transformed
1632 | by the preprocessing pipeline automatically before plotting.
1633 |
1634 |
1635 | save: bool, default = False
1636 | When set to True, the plot is saved as a 'png' file in the current working directory.
1637 |
1638 |
1639 | **kwargs:
1640 | Additional keyword arguments to pass to the plot.
1641 |
1642 |
1643 | Returns:
1644 | None
1645 |
1646 | """
1647 |
1648 | return pycaret.internal.tabular.interpret_model(
1649 | estimator=estimator,
1650 | plot=plot,
1651 | feature=feature,
1652 | observation=observation,
1653 | use_train_data=use_train_data,
1654 | X_new_sample=X_new_sample,
1655 | save=save,
1656 | **kwargs,
1657 | )
1658 |
1659 |
1660 | def predict_model(
1661 | estimator,
1662 | data: Optional[pd.DataFrame] = None,
1663 | round: int = 4,
1664 | verbose: bool = True,
1665 | ) -> pd.DataFrame:
1666 |
1667 | """
1668 | This function predicts ``Label`` using a trained model. When ``data`` is
1669 | None, it predicts label on the holdout set.
1670 |
1671 |
1672 | Example
1673 | -------
1674 | >>> from PyRapidML.datasets import extract_data
1675 | >>> boston = extract_data('boston')
1676 | >>> from PyRapidML.regression import *
1677 | >>> exp_name = initializer(data = boston, target = 'medv')
1678 | >>> lr = creating_model('lr')
1679 | >>> pred_holdout = predict_model(lr)
1680 | >>> pred_unseen = predict_model(lr, data = unseen_dataframe)
1681 |
1682 |
1683 | estimator: scikit-learn compatible object
1684 | Trained model object
1685 |
1686 |
1687 | data: pandas.DataFrame, default = None
1688 | Shape (n_samples, n_features). All features used during training must be
1689 | available in the unseen dataset. When None, predictions are made on the holdout set.
1690 |
1691 |
1692 | round: int, default = 4
1693 | Number of decimal places to round predictions to.
1694 |
1695 |
1696 | verbose: bool, default = True
1697 | When set to False, holdout score grid is not printed.
1698 |
1699 |
1700 | Returns:
1701 | pandas.DataFrame
1702 |
1703 |
1704 | Warnings
1705 | --------
1706 | - The behavior of ``predict_model`` changed in version 2.1 without backward
1707 | compatibility. As such, pipelines trained using versions <= 2.0 may not
1708 | work for inference with versions >= 2.1. You can either retrain your models with a
1709 | newer version or downgrade the version for inference.
1710 |
1711 |
1712 | """
1713 |
1714 | return pycaret.internal.tabular.predict_model(
1715 | estimator=estimator,
1716 | data=data,
1717 | probability_threshold=None,
1718 | encoded_labels=True,
1719 | round=round,
1720 | verbose=verbose,
1721 | ml_usecase=MLUsecase.REGRESSION,
1722 | )
1723 |
1724 |
1725 | def finalize_model(
1726 | estimator,
1727 | fit_kwargs: Optional[dict] = None,
1728 | groups: Optional[Union[str, Any]] = None,
1729 | model_only: bool = True,
1730 | ) -> Any:
1731 |
1732 | """
1733 | This function trains a given estimator on the entire dataset including the
1734 | holdout set.
1735 |
1736 |
1737 | Example
1738 | --------
1739 | >>> from PyRapidML.datasets import extract_data
1740 | >>> boston = extract_data('boston')
1741 | >>> from PyRapidML.regression import *
1742 | >>> exp_name = initializer(data = boston, target = 'medv')
1743 | >>> lr = creating_model('lr')
1744 | >>> final_lr = finalize_model(lr)
1745 |
1746 |
1747 | estimator: scikit-learn compatible object
1748 | Trained model object
1749 |
1750 |
1751 | fit_kwargs: dict, default = {} (empty dict)
1752 | Dictionary of arguments passed to the fit method of the model.
1753 |
1754 |
1755 | groups: str or array-like, with shape (n_samples,), default = None
1756 | Optional group labels when GroupKFold is used for the cross validation.
1757 | It takes an array with shape (n_samples, ) where n_samples is the number
1758 | of rows in training dataset. When string is passed, it is interpreted as
1759 | the column name in the dataset containing group labels.
1760 |
1761 |
1762 | model_only: bool, default = True
1763 | When set to True, only the re-trained model object is returned. When set to
1764 | False, the entire pipeline, including the transformations, is returned.
1765 |
1766 |
1767 | Returns:
1768 | Trained Model
1769 |
1770 |
1771 | """
1772 |
1773 | return pycaret.internal.tabular.finalize_model(
1774 | estimator=estimator,
1775 | fit_kwargs=fit_kwargs,
1776 | groups=groups,
1777 | model_only=model_only,
1778 | )
1779 |
1780 |
1781 | def deploy_model(
1782 | model, model_name: str, authentication: dict, platform: str = "aws",
1783 | ):
1784 |
1785 | """
1786 | This function deploys the transformation pipeline and trained model on cloud.
1787 |
1788 |
1789 | Example
1790 | -------
1791 | >>> from PyRapidML.datasets import extract_data
1792 | >>> boston = extract_data('boston')
1793 | >>> from PyRapidML.regression import *
1794 | >>> exp_name = initializer(data = boston, target = 'medv')
1795 | >>> lr = creating_model('lr')
1796 | >>> deploy_model(model = lr, model_name = 'lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})
1797 |
1798 |
1799 | Amazon Web Service (AWS) users:
1800 | To deploy a model on AWS S3 ('aws'), environment variables must be set in your
1801 | local environment. To configure AWS environment variables, type ``aws configure``
1802 | in the command line. The following information from the IAM portal of your Amazon
1803 | console account is required:
1804 |
1805 | - AWS Access Key ID
1806 | - AWS Secret Access Key
1807 | - Default Region Name (can be seen under Global settings on your AWS console)
1808 |
1809 | More info: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html
1810 |
1811 |
1812 | Google Cloud Platform (GCP) users:
1813 | To deploy a model on Google Cloud Platform ('gcp'), a project must be created
1814 | using the command line or the GCP console. Once the project is created, you must
1815 | create a service account and download the service account key as a JSON file to
1816 | set environment variables in your local environment.
1817 |
1818 | More info: https://cloud.google.com/docs/authentication/production
1819 |
1820 |
1821 | Microsoft Azure (Azure) users:
1822 | To deploy a model on Microsoft Azure ('azure'), the environment variable for the
1823 | connection string must be set in your local environment. Go to the settings of the
1824 | storage account on the Azure portal to access the required connection string.
1825 |
1826 | More info: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?toc=%2Fpython%2Fazure%2FTOC.json
1827 |
1828 |
1829 | model: scikit-learn compatible object
1830 | Trained model object
1831 |
1832 |
1833 | model_name: str
1834 | Name of model.
1835 |
1836 |
1837 | authentication: dict
1838 | Dictionary of applicable authentication tokens.
1839 |
1840 | When platform = 'aws':
1841 | {'bucket' : 'S3-bucket-name'}
1842 |
1843 | When platform = 'gcp':
1844 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
1845 |
1846 | When platform = 'azure':
1847 | {'container': 'azure-container-name'}
1848 |
1849 |
1850 | platform: str, default = 'aws'
1851 | Name of the platform. Currently supported platforms: 'aws', 'gcp' and 'azure'.
1852 |
1853 |
1854 | Returns:
1855 | None
1856 |
1857 | """
1858 |
1859 | return pycaret.internal.tabular.deploy_model(
1860 | model=model,
1861 | model_name=model_name,
1862 | authentication=authentication,
1863 | platform=platform,
1864 | )
1865 |
1866 |
1867 | def save_model(
1868 | model, model_name: str, model_only: bool = False, verbose: bool = True, **kwargs
1869 | ):
1870 |
1871 | """
1872 | This function saves the transformation pipeline and trained model object
1873 | into the current working directory as a pickle file for later use.
1874 |
1875 | Example
1876 | -------
1877 | >>> from PyRapidML.datasets import extract_data
1878 | >>> boston = extract_data('boston')
1879 | >>> from PyRapidML.regression import *
1880 | >>> exp_name = initializer(data = boston, target = 'medv')
1881 | >>> lr = creating_model('lr')
1882 | >>> save_model(lr, 'saved_lr_model')
1883 |
1884 |
1885 | model: scikit-learn compatible object
1886 | Trained model object
1887 |
1888 |
1889 | model_name: str
1890 | Name of the model.
1891 |
1892 |
1893 | model_only: bool, default = False
1894 | When set to True, only the trained model object is saved instead of the
1895 | entire pipeline.
1896 |
1897 |
1898 | **kwargs:
1899 | Additional keyword arguments to pass to joblib.dump().
1900 |
1901 |
1902 | verbose: bool, default = True
1903 | Success message is not printed when verbose is set to False.
1904 |
1905 |
1906 | Returns:
1907 | Tuple of the model object and the filename.
1908 |
1909 | """
1910 |
1911 | return pycaret.internal.tabular.save_model(
1912 | model=model,
1913 | model_name=model_name,
1914 | model_only=model_only,
1915 | verbose=verbose,
1916 | **kwargs,
1917 | )
1918 |
1919 |
1920 | def load_model(
1921 | model_name,
1922 | platform: Optional[str] = None,
1923 | authentication: Optional[Dict[str, str]] = None,
1924 | verbose: bool = True,
1925 | ):
1926 |
1927 | """
1928 | This function loads a previously saved pipeline.
1929 |
1930 | Example
1931 | -------
1932 | >>> from PyRapidML.regression import load_model
1933 | >>> saved_lr = load_model('saved_lr_model')
1934 |
1935 |
1936 | model_name: str
1937 | Name of the model.
1938 |
1939 |
1940 | platform: str, default = None
1941 | Name of the cloud platform. Currently supported platforms:
1942 | 'aws', 'gcp' and 'azure'.
1943 |
1944 |
1945 | authentication: dict, default = None
1946 | Dictionary of applicable authentication tokens.
1947 |
1948 | When platform = 'aws':
1949 | {'bucket' : 'S3-bucket-name'}
1950 |
1951 | When platform = 'gcp':
1952 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
1953 |
1954 | When platform = 'azure':
1955 | {'container': 'azure-container-name'}
1956 |
1957 |
1958 | verbose: bool, default = True
1959 | Success message is not printed when verbose is set to False.
1960 |
1961 |
1962 | Returns:
1963 | Trained Model
1964 |
1965 | """
1966 |
1967 | return pycaret.internal.tabular.load_model(
1968 | model_name=model_name,
1969 | platform=platform,
1970 | authentication=authentication,
1971 | verbose=verbose,
1972 | )
1973 |
1974 |
1975 | def automl(optimize: str = "R2", use_holdout: bool = False) -> Any:
1976 |
1977 | """
1978 | This function returns the best model out of all trained models in
1979 | the current session based on the ``optimize`` parameter. Metrics
1980 | evaluated can be accessed using the ``get_metrics`` function.
1981 |
1982 |
1983 | Example
1984 | -------
1985 | >>> from PyRapidML.datasets import extract_data
1986 | >>> boston = extract_data('boston')
1987 | >>> from PyRapidML.regression import *
1988 | >>> exp_name = initializer(data = boston, target = 'medv')
1989 | >>> top3 = comparing_models(n_select = 3)
1990 | >>> tuned_top3 = [tuning_model(i) for i in top3]
1991 | >>> blender = blend_models(tuned_top3)
1992 | >>> stacker = stack_models(tuned_top3)
1993 | >>> best_mae_model = automl(optimize = 'MAE')
1994 |
1995 |
1996 | optimize: str, default = 'R2'
1997 | Metric to use for model selection. It also accepts custom metrics
1998 | added using the ``add_metric`` function.
1999 |
2000 |
2001 | use_holdout: bool, default = False
2002 | When set to True, metrics are evaluated on holdout set instead of CV.
2003 |
2004 |
2005 | Returns:
2006 | Trained Model
2007 |
2008 |
2009 | """
2010 |
2011 | return pycaret.internal.tabular.automl(optimize=optimize, use_holdout=use_holdout)
2012 |
2013 |
2014 | def pull(pop: bool = False) -> pd.DataFrame:
2015 | """
2016 | Returns the last printed score grid. Use the ``pull`` function after
2017 | any training function to store the score grid as a pandas.DataFrame.
2018 |
2019 |
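| Example
| -------
| A minimal usage sketch, mirroring the other examples in this module:
|
| >>> from PyRapidML.datasets import extract_data
| >>> boston = extract_data('boston')
| >>> from PyRapidML.regression import *
| >>> exp_name = initializer(data = boston, target = 'medv')
| >>> best_model = comparing_models()
| >>> results = pull()
|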
2020 | pop: bool, default = False
2021 | If True, will pop (remove) the returned dataframe from the
2022 | display container.
2023 |
2024 |
2025 | Returns:
2026 | pandas.DataFrame
2027 |
2028 | """
2029 | return pycaret.internal.tabular.pull(pop=pop)
2030 |
2031 |
2032 | def models(
2033 | type: Optional[str] = None, internal: bool = False, raise_errors: bool = True,
2034 | ) -> pd.DataFrame:
2035 |
2036 | """
2037 | Returns table of models available in the model library.
2038 |
2039 | Example
2040 | -------
2041 | >>> from PyRapidML.datasets import extract_data
2042 | >>> boston = extract_data('boston')
2043 | >>> from PyRapidML.regression import *
2044 | >>> exp_name = initializer(data = boston, target = 'medv')
2045 | >>> all_models = models()
2046 |
2047 |
2048 | type: str, default = None
2049 | - linear : filters and only returns linear models
2050 | - tree : filters and only returns tree-based models
2051 | - ensemble : filters and only returns ensemble models
2052 |
2053 |
2054 | internal: bool, default = False
2055 | When True, will return extra columns and rows used internally.
2056 |
2057 |
2058 | raise_errors: bool, default = True
2059 | When False, will suppress all exceptions, ignoring models
2060 | that couldn't be created.
2061 |
2062 |
2063 | Returns:
2064 | pandas.DataFrame
2065 |
2066 | """
2067 | return pycaret.internal.tabular.models(
2068 | type=type, internal=internal, raise_errors=raise_errors
2069 | )
2070 |
2071 |
2072 | def get_metrics(
2073 | reset: bool = False, include_custom: bool = True, raise_errors: bool = True,
2074 | ) -> pd.DataFrame:
2075 |
2076 | """
2077 | Returns table of available metrics used for CV.
2078 |
2079 |
2080 | Example
2081 | -------
2082 | >>> from PyRapidML.datasets import extract_data
2083 | >>> boston = extract_data('boston')
2084 | >>> from PyRapidML.regression import *
2085 | >>> exp_name = initializer(data = boston, target = 'medv')
2086 | >>> all_metrics = get_metrics()
2087 |
2088 |
2089 | reset: bool, default = False
2090 | When True, will reset all changes made using the ``add_metric``
2091 | and ``remove_metric`` function.
2092 |
2093 |
2094 | include_custom: bool, default = True
2095 | Whether to include user added (custom) metrics or not.
2096 |
2097 |
2098 | raise_errors: bool, default = True
2099 | If False, will suppress all exceptions, ignoring models that
2100 | couldn't be created.
2101 |
2102 |
2103 | Returns:
2104 | pandas.DataFrame
2105 |
2106 | """
2107 |
2108 | return pycaret.internal.tabular.get_metrics(
2109 | reset=reset, include_custom=include_custom, raise_errors=raise_errors,
2110 | )
2111 |
2112 |
2113 | def add_metric(
2114 | id: str, name: str, score_func: type, greater_is_better: bool = True, **kwargs,
2115 | ) -> pd.Series:
2116 |
2117 | """
2118 | Adds a custom metric to be used for CV.
2119 |
2120 |
2121 | Example
2122 | -------
2123 | >>> from PyRapidML.datasets import extract_data
2124 | >>> boston = extract_data('boston')
2125 | >>> from PyRapidML.regression import *
2126 | >>> exp_name = initializer(data = boston, target = 'medv')
2127 | >>> from sklearn.metrics import explained_variance_score
2128 | >>> add_metric('evs', 'EVS', explained_variance_score)
2129 |
2130 |
2131 | id: str
2132 | Unique id for the metric.
2133 |
2134 |
2135 | name: str
2136 | Display name of the metric.
2137 |
2138 |
2139 | score_func: type
2140 | Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``.
2141 |
2142 |
2143 | greater_is_better: bool, default = True
2144 | Whether ``score_func`` is higher the better or not.
2145 |
2146 |
2147 | **kwargs:
2148 | Arguments to be passed to the score function.
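|
| For illustration, a minimal sketch adding a lower-is-better metric using
| scikit-learn's ``max_error``:
|
| >>> from sklearn.metrics import max_error
| >>> add_metric('me', 'Max Error', max_error, greater_is_better = False)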
2149 |
2150 |
2151 | Returns:
2152 | pandas.Series
2153 |
2154 | """
2155 |
2156 | return pycaret.internal.tabular.add_metric(
2157 | id=id,
2158 | name=name,
2159 | score_func=score_func,
2160 | target="pred",
2161 | greater_is_better=greater_is_better,
2162 | **kwargs,
2163 | )
2164 |
2165 |
2166 | def remove_metric(name_or_id: str):
2167 |
2168 | """
2169 | Removes a metric from CV.
2170 |
2171 |
2172 | Example
2173 | -------
2174 | >>> from PyRapidML.datasets import extract_data
2175 | >>> boston = extract_data('boston')
2176 | >>> from PyRapidML.regression import *
2177 | >>> exp_name = initializer(data = boston, target = 'medv')
2178 | >>> remove_metric('MAPE')
2179 |
2180 |
2181 | name_or_id: str
2182 | Display name or ID of the metric.
2183 |
2184 |
2185 | Returns:
2186 | None
2187 |
2188 | """
2189 | return pycaret.internal.tabular.remove_metric(name_or_id=name_or_id)
2190 |
2191 |
2192 | def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame:
2193 |
2194 | """
2195 | Returns a table of experiment logs. Only works when ``log_experiment``
2196 | is True when initializing the ``setup`` function.
2197 |
2198 |
2199 | Example
2200 | -------
2201 | >>> from PyRapidML.datasets import extract_data
2202 | >>> boston = extract_data('boston')
2203 | >>> from PyRapidML.regression import *
2204 | >>> exp_name = initializer(data = boston, target = 'medv', log_experiment = True)
2205 | >>> best = comparing_models()
2206 | >>> exp_logs = get_logs()
2207 |
2208 |
2209 | experiment_name: str, default = None
2210 | When None, the current active run is used.
2211 |
2212 |
2213 | save: bool, default = False
2214 | When set to True, a CSV file is saved in the current working directory.
2215 |
2216 |
2217 | Returns:
2218 | pandas.DataFrame
2219 |
2220 | """
2221 |
2222 | return pycaret.internal.tabular.get_logs(experiment_name=experiment_name, save=save)
2223 |
2224 |
2225 | def get_config(variable: str):
2226 |
2227 | """
2228 | This function retrieves the global variables created when initializing the
2229 | ``setup`` function. The following variables are accessible:
2230 |
2231 | - X: Transformed dataset (X)
2232 | - y: Transformed dataset (y)
2233 | - X_train: Transformed train dataset (X)
2234 | - X_test: Transformed test/holdout dataset (X)
2235 | - y_train: Transformed train dataset (y)
2236 | - y_test: Transformed test/holdout dataset (y)
2237 | - seed: random state set through session_id
2238 | - prep_pipe: Transformation pipeline
2239 | - fold_shuffle_param: shuffle parameter used in Kfolds
2240 | - n_jobs_param: n_jobs parameter used in model training
2241 | - html_param: html_param configured through setup
2242 | - create_model_container: results grid storage container
2243 | - master_model_container: model storage container
2244 | - display_container: results display container
2245 | - exp_name_log: Name of experiment
2246 | - logging_param: log_experiment param
2247 | - log_plots_param: log_plots param
2248 | - USI: Unique session ID parameter
2249 | - fix_imbalance_param: fix_imbalance param
2250 | - fix_imbalance_method_param: fix_imbalance_method param
2251 | - data_before_preprocess: data before preprocessing
2252 | - target_param: name of target variable
2253 | - gpu_param: use_gpu param configured through setup
2254 | - fold_generator: CV splitter configured in fold_strategy
2255 | - fold_param: fold params defined in the setup
2256 | - fold_groups_param: fold groups defined in the setup
2257 | - stratify_param: stratify parameter defined in the setup
2258 | - transform_target_param: transform_target_param in setup
2259 | - transform_target_method_param: transform_target_method_param in setup
2260 |
2261 |
2262 | Example
2263 | -------
2264 |     >>> from PyRapidML.datasets import extract_data
2265 | >>> boston = extract_data('boston')
2266 | >>> from PyRapidML.regression import *
2267 | >>> exp_name = initializer(data = boston, target = 'medv')
2268 | >>> X_train = get_config('X_train')
2269 |
2270 |
2271 | Returns:
2272 | Global variable
2273 |
2274 |
2275 | """
2276 |
2277 | return pycaret.internal.tabular.get_config(variable=variable)
2278 |
2279 |
2280 | def set_config(variable: str, value):
2281 |
2282 | """
2283 |     This function resets the global variables. The following variables are
2284 | accessible:
2285 |
2286 | - X: Transformed dataset (X)
2287 | - y: Transformed dataset (y)
2288 | - X_train: Transformed train dataset (X)
2289 | - X_test: Transformed test/holdout dataset (X)
2290 | - y_train: Transformed train dataset (y)
2291 | - y_test: Transformed test/holdout dataset (y)
2292 | - seed: random state set through session_id
2293 | - prep_pipe: Transformation pipeline
2294 | - fold_shuffle_param: shuffle parameter used in Kfolds
2295 | - n_jobs_param: n_jobs parameter used in model training
2296 | - html_param: html_param configured through setup
2297 | - create_model_container: results grid storage container
2298 | - master_model_container: model storage container
2299 | - display_container: results display container
2300 | - exp_name_log: Name of experiment
2301 | - logging_param: log_experiment param
2302 | - log_plots_param: log_plots param
2303 | - USI: Unique session ID parameter
2304 | - fix_imbalance_param: fix_imbalance param
2305 | - fix_imbalance_method_param: fix_imbalance_method param
2306 | - data_before_preprocess: data before preprocessing
2307 | - target_param: name of target variable
2308 | - gpu_param: use_gpu param configured through setup
2309 | - fold_generator: CV splitter configured in fold_strategy
2310 | - fold_param: fold params defined in the setup
2311 | - fold_groups_param: fold groups defined in the setup
2312 | - stratify_param: stratify parameter defined in the setup
2313 | - transform_target_param: transform_target_param in setup
2314 | - transform_target_method_param: transform_target_method_param in setup
2315 |
2316 |
2317 | Example
2318 | -------
2319 |     >>> from PyRapidML.datasets import extract_data
2320 | >>> boston = extract_data('boston')
2321 | >>> from PyRapidML.regression import *
2322 | >>> exp_name = initializer(data = boston, target = 'medv')
2323 | >>> set_config('seed', 123)
2324 |
2325 |
2326 | Returns:
2327 | None
2328 |
2329 | """
2330 |
2331 | return pycaret.internal.tabular.set_config(variable=variable, value=value)
2332 |
2333 |
2334 | def save_config(file_name: str):
2335 |
2336 | """
2337 |     This function saves all global variables to a pickle file, allowing you to
2338 |     resume a session later without rerunning the ``setup``.
2339 |
2340 |
2341 | Example
2342 | -------
2343 |     >>> from PyRapidML.datasets import extract_data
2344 | >>> boston = extract_data('boston')
2345 | >>> from PyRapidML.regression import *
2346 | >>> exp_name = initializer(data = boston, target = 'medv')
2347 | >>> save_config('myvars.pkl')
2348 |
2349 |
2350 | Returns:
2351 | None
2352 |
2353 | """
2354 |
2355 | return pycaret.internal.tabular.save_config(file_name=file_name)
2356 |
2357 |
2358 | def load_config(file_name: str):
2359 |
2360 | """
2361 | This function loads global variables from a pickle file into Python
2362 | environment.
2363 |
2364 |
2365 | Example
2366 | -------
2367 | >>> from PyRapidML.regression import load_config
2368 | >>> load_config('myvars.pkl')
2369 |
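    Once loaded, the restored variables can be inspected with ``get_config``
    (a hedged sketch, assuming the file was created earlier with ``save_config``):

    >>> from PyRapidML.regression import get_config
    >>> X_train = get_config('X_train')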
2370 |
2371 | Returns:
2372 | Global variables
2373 |
2374 | """
2375 |
2376 | return pycaret.internal.tabular.load_config(file_name=file_name)
2377 |
2378 |
2379 |
--------------------------------------------------------------------------------
/PyRapidML/classification.py:
--------------------------------------------------------------------------------
1 | # Module: Classification
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 05/06/2021
6 |
7 |
8 | import pandas as pd
9 | import numpy as np
10 |
11 | import pycaret.internal.tabular
12 | from pycaret.internal.Display import Display, is_in_colab, enable_colab
13 | from typing import List, Tuple, Any, Union, Optional, Dict
14 | import warnings
15 | from IPython.utils import io
16 | import traceback
17 |
18 | from pycaret.internal.tabular import MLUsecase
19 |
20 | warnings.filterwarnings("ignore")
21 |
22 |
23 | def initializer(
24 | data: pd.DataFrame,
25 | target: str,
26 | train_size: float = 0.7,
27 | test_data: Optional[pd.DataFrame] = None,
28 | preprocess: bool = True,
29 | imputation_type: str = "simple",
30 | iterative_imputation_iters: int = 5,
31 | categorical_features: Optional[List[str]] = None,
32 | categorical_imputation: str = "constant",
33 | categorical_iterative_imputer: Union[str, Any] = "lightgbm",
34 | ordinal_features: Optional[Dict[str, list]] = None,
35 | high_cardinality_features: Optional[List[str]] = None,
36 | high_cardinality_method: str = "frequency",
37 | numeric_features: Optional[List[str]] = None,
38 | numeric_imputation: str = "mean",
39 | numeric_iterative_imputer: Union[str, Any] = "lightgbm",
40 | date_features: Optional[List[str]] = None,
41 | ignore_features: Optional[List[str]] = None,
42 | normalize: bool = False,
43 | normalize_method: str = "zscore",
44 | transformation: bool = False,
45 | transformation_method: str = "yeo-johnson",
46 | handle_unknown_categorical: bool = True,
47 | unknown_categorical_method: str = "least_frequent",
48 | pca: bool = False,
49 | pca_method: str = "linear",
50 | pca_components: Optional[float] = None,
51 | ignore_low_variance: bool = False,
52 | combine_rare_levels: bool = False,
53 | rare_level_threshold: float = 0.10,
54 | bin_numeric_features: Optional[List[str]] = None,
55 | remove_outliers: bool = False,
56 | outliers_threshold: float = 0.05,
57 | remove_multicollinearity: bool = False,
58 | multicollinearity_threshold: float = 0.9,
59 | remove_perfect_collinearity: bool = True,
60 | create_clusters: bool = False,
61 | cluster_iter: int = 20,
62 | polynomial_features: bool = False,
63 | polynomial_degree: int = 2,
64 | trigonometry_features: bool = False,
65 | polynomial_threshold: float = 0.1,
66 | group_features: Optional[List[str]] = None,
67 | group_names: Optional[List[str]] = None,
68 | feature_selection: bool = False,
69 | feature_selection_threshold: float = 0.8,
70 | feature_selection_method: str = "classic",
71 | feature_interaction: bool = False,
72 | feature_ratio: bool = False,
73 | interaction_threshold: float = 0.01,
74 | fix_imbalance: bool = False,
75 | fix_imbalance_method: Optional[Any] = None,
76 | data_split_shuffle: bool = True,
77 | data_split_stratify: Union[bool, List[str]] = False,
78 | fold_strategy: Union[str, Any] = "stratifiedkfold",
79 | fold: int = 10,
80 | fold_shuffle: bool = False,
81 | fold_groups: Optional[Union[str, pd.DataFrame]] = None,
82 | n_jobs: Optional[int] = -1,
83 | use_gpu: bool = False,
84 | custom_pipeline: Union[
85 | Any, Tuple[str, Any], List[Any], List[Tuple[str, Any]]
86 | ] = None,
87 | html: bool = True,
88 | session_id: Optional[int] = None,
89 | log_experiment: bool = False,
90 | experiment_name: Optional[str] = None,
91 | log_plots: Union[bool, list] = False,
92 | log_profile: bool = False,
93 | log_data: bool = False,
94 | silent: bool = False,
95 | verbose: bool = True,
96 | profile: bool = False,
97 | profile_kwargs: Dict[str, Any] = None,
98 | ):
99 |
100 | """
101 | This function initializes the training environment and creates the transformation
102 |     pipeline. It must be called before executing any other function. It takes
103 | two mandatory parameters: ``data`` and ``target``. All the other parameters are
104 | optional.
105 |
106 | Example
107 | -------
108 |     >>> from PyRapidML.datasets import extract_data
109 | >>> juice = extract_data('juice')
110 | >>> from PyRapidML.classification import *
111 | >>> exp_name = initializer(data = juice, target = 'Purchase')
112 |
113 |
114 | data: pandas.DataFrame
115 | Shape (n_samples, n_features), where n_samples is the number of samples and
116 | n_features is the number of features.
117 |
118 |
119 | target: str
120 | Name of the target column to be passed in as a string. The target variable can
121 | be either binary or multiclass.
122 |
123 |
124 | train_size: float, default = 0.7
125 | Proportion of the dataset to be used for training and validation. Should be
126 | between 0.0 and 1.0.
127 |
128 |
129 | test_data: pandas.DataFrame, default = None
130 | If not None, test_data is used as a hold-out set and ``train_size`` parameter is
131 | ignored. test_data must be labelled and the shape of data and test_data must
132 | match.
133 |
134 |
135 | preprocess: bool, default = True
136 | When set to False, no transformations are applied except for train_test_split
137 | and custom transformations passed in ``custom_pipeline`` param. Data must be
138 | ready for modeling (no missing values, no dates, categorical data encoding),
139 | when preprocess is set to False.
140 |
141 |
142 | imputation_type: str, default = 'simple'
143 | The type of imputation to use. Can be either 'simple' or 'iterative'.
144 |
145 |
146 | iterative_imputation_iters: int, default = 5
147 | Number of iterations. Ignored when ``imputation_type`` is not 'iterative'.
148 |
149 |
150 | categorical_features: list of str, default = None
151 | If the inferred data types are not correct or the silent param is set to True,
152 | categorical_features param can be used to overwrite or define the data types.
153 | It takes a list of strings with column names that are categorical.
154 |
155 |
156 | categorical_imputation: str, default = 'constant'
157 | Missing values in categorical features are imputed with a constant 'not_available'
158 | value. The other available option is 'mode'.
159 |
160 |
161 | categorical_iterative_imputer: str, default = 'lightgbm'
162 | Estimator for iterative imputation of missing values in categorical features.
163 | Ignored when ``imputation_type`` is not 'iterative'.
164 |
165 |
166 | ordinal_features: dict, default = None
167 | Encode categorical features as ordinal. For example, a categorical feature with
168 | 'low', 'medium', 'high' values where low < medium < high can be passed as
169 | ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
170 |
171 |
172 | high_cardinality_features: list of str, default = None
173 |         When a categorical feature contains many levels, it can be compressed into fewer
174 | levels using this parameter. It takes a list of strings with column names that
175 | are categorical.
176 |
177 |
178 | high_cardinality_method: str, default = 'frequency'
179 | Categorical features with high cardinality are replaced with the frequency of
180 |         values in each level occurring in the training dataset. The other available method
181 | is 'clustering' which trains the K-Means clustering algorithm on the statistical
182 | attribute of the training data and replaces the original value of feature with the
183 | cluster label. The number of clusters is determined by optimizing Calinski-Harabasz
184 | and Silhouette criterion.
185 |
186 |
187 | numeric_features: list of str, default = None
188 | If the inferred data types are not correct or the silent param is set to True,
189 | numeric_features param can be used to overwrite or define the data types.
190 | It takes a list of strings with column names that are numeric.
191 |
192 |
193 | numeric_imputation: str, default = 'mean'
194 | Missing values in numeric features are imputed with 'mean' value of the feature
195 |         in the training dataset. Other available options are 'median' and 'zero'.
196 |
197 |
198 | numeric_iterative_imputer: str, default = 'lightgbm'
199 | Estimator for iterative imputation of missing values in numeric features.
200 | Ignored when ``imputation_type`` is set to 'simple'.
201 |
202 |
203 | date_features: list of str, default = None
204 | If the inferred data types are not correct or the silent param is set to True,
205 | date_features param can be used to overwrite or define the data types. It takes
206 | a list of strings with column names that are DateTime.
207 |
208 |
209 | ignore_features: list of str, default = None
210 | ignore_features param can be used to ignore features during model training.
211 | It takes a list of strings with column names that are to be ignored.
212 |
213 |
214 | normalize: bool, default = False
215 | When set to True, it transforms the numeric features by scaling them to a given
216 | range. Type of scaling is defined by the ``normalize_method`` parameter.
217 |
218 |
219 | normalize_method: str, default = 'zscore'
220 |         Defines the method for scaling. By default, the normalize method is set to 'zscore'.
221 | The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize``
222 | is not True. The other options are:
223 |
224 | - minmax: scales and translates each feature individually such that it is in
225 | the range of 0 - 1.
226 | - maxabs: scales and translates each feature individually such that the
227 | maximal absolute value of each feature will be 1.0. It does not
228 | shift/center the data, and thus does not destroy any sparsity.
229 | - robust: scales and translates each feature according to the Interquartile
230 | range. When the dataset contains outliers, robust scaler often gives
231 | better results.
232 |
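        For example, to use robust scaling (a hedged sketch reusing the setup
        example above):

        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        normalize = True, normalize_method = 'robust')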
233 |
234 | transformation: bool, default = False
235 | When set to True, it applies the power transform to make data more Gaussian-like.
236 | Type of transformation is defined by the ``transformation_method`` parameter.
237 |
238 |
239 | transformation_method: str, default = 'yeo-johnson'
240 | Defines the method for transformation. By default, the transformation method is
241 | set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
242 | Ignored when ``transformation`` is not True.
243 |
244 |
245 | handle_unknown_categorical: bool, default = True
246 | When set to True, unknown categorical levels in unseen data are replaced by the
247 | most or least frequent level as learned in the training dataset.
248 |
249 |
250 | unknown_categorical_method: str, default = 'least_frequent'
251 | Method used to replace unknown categorical levels in unseen data. Method can be
252 | set to 'least_frequent' or 'most_frequent'.
253 |
254 |
255 | pca: bool, default = False
256 | When set to True, dimensionality reduction is applied to project the data into
257 | a lower dimensional space using the method defined in ``pca_method`` parameter.
258 |
259 |
260 | pca_method: str, default = 'linear'
261 |         The 'linear' method uses Singular Value Decomposition. Other options are:
262 |
263 |         - kernel: dimensionality reduction through the use of the RBF kernel.
264 | - incremental: replacement for 'linear' pca when the dataset is too large.
265 |
266 |
267 | pca_components: int or float, default = None
268 |         Number of components to keep. If pca_components is a float, it is treated as a
269 | target percentage for information retention. When pca_components is an integer
270 | it is treated as the number of features to be kept. pca_components must be less
271 | than the original number of features. Ignored when ``pca`` is not True.
272 |
273 |
274 | ignore_low_variance: bool, default = False
275 | When set to True, all categorical features with insignificant variances are
276 | removed from the data. The variance is calculated using the ratio of unique
277 | values to the number of samples, and the ratio of the most common value to the
278 | frequency of the second most common value.
279 |
280 |
281 | combine_rare_levels: bool, default = False
282 |         When set to True, levels in categorical features whose frequency percentile is
283 |         below a certain threshold are combined into a single level.
284 |
285 |
286 | rare_level_threshold: float, default = 0.1
287 | Percentile distribution below which rare categories are combined. Ignored when
288 | ``combine_rare_levels`` is not True.
289 |
290 |
291 | bin_numeric_features: list of str, default = None
292 | To convert numeric features into categorical, bin_numeric_features parameter can
293 | be used. It takes a list of strings with column names to be discretized. It does
294 |         so by using the 'sturges' rule to determine the number of clusters and then applying
295 |         the K-Means algorithm. Original values of the feature are then replaced by the
296 | cluster label.
297 |
298 |
299 | remove_outliers: bool, default = False
300 | When set to True, outliers from the training data are removed using the Singular
301 | Value Decomposition.
302 |
303 |
304 | outliers_threshold: float, default = 0.05
305 | The percentage outliers to be removed from the training dataset. Ignored when
306 | ``remove_outliers`` is not True.
307 |
308 |
309 | remove_multicollinearity: bool, default = False
310 | When set to True, features with the inter-correlations higher than the defined
311 | threshold are removed. When two features are highly correlated with each other,
312 | the feature that is less correlated with the target variable is removed. Only
313 | considers numeric features.
314 |
315 | multicollinearity_threshold: float, default = 0.9
316 | Threshold for correlated features. Ignored when ``remove_multicollinearity``
317 | is not True.
318 |
319 |
320 | remove_perfect_collinearity: bool, default = True
321 | When set to True, perfect collinearity (features with correlation = 1) is removed
322 |         from the dataset. When two features are 100% correlated, one of them is randomly
323 |         dropped from the dataset.
324 |
325 |
326 | create_clusters: bool, default = False
327 | When set to True, an additional feature is created in training dataset where each
328 | instance is assigned to a cluster. The number of clusters is determined by
329 | optimizing Calinski-Harabasz and Silhouette criterion.
330 |
331 |
332 | cluster_iter: int, default = 20
333 |         Number of iterations for creating clusters. Each iteration represents the cluster
334 | size. Ignored when ``create_clusters`` is not True.
335 |
336 |
337 | polynomial_features: bool, default = False
338 | When set to True, new features are derived using existing numeric features.
339 |
340 |
341 | polynomial_degree: int, default = 2
342 | Degree of polynomial features. For example, if an input sample is two dimensional
343 | and of the form [a, b], the polynomial features with degree = 2 are:
344 | [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True.
345 |
346 |
347 | trigonometry_features: bool, default = False
348 | When set to True, new features are derived using existing numeric features.
349 |
350 |
351 | polynomial_threshold: float, default = 0.1
352 | When ``polynomial_features`` or ``trigonometry_features`` is True, new features
353 |         are derived from the existing numeric features. This may sometimes result in a
354 |         feature space that is too large. The polynomial_threshold parameter can be used to
355 |         deal with this problem. It does so by using a combination of Random Forest, AdaBoost
356 |         and Linear correlation. All derived features that fall within the percentile
357 |         distribution are kept and the rest are removed.
358 |
359 |
360 | group_features: list or list of list, default = None
361 | When the dataset contains features with related characteristics, group_features
362 | parameter can be used for feature extraction. It takes a list of strings with
363 | column names that are related.
364 |
365 |
366 | group_names: list, default = None
367 | Group names to be used in naming new features. When the length of group_names
368 | does not match with the length of ``group_features``, new features are named
369 | sequentially group_1, group_2, etc. It is ignored when ``group_features`` is
370 | None.
371 |
372 |
373 | feature_selection: bool, default = False
374 |         When set to True, a subset of features is selected using a combination of
375 | various permutation importance techniques including Random Forest, Adaboost
376 | and Linear correlation with target variable. The size of the subset is
377 | dependent on the ``feature_selection_threshold`` parameter.
378 |
379 |
380 | feature_selection_threshold: float, default = 0.8
381 | Threshold value used for feature selection. When ``polynomial_features`` or
382 | ``feature_interaction`` is True, it is recommended to keep the threshold low
383 | to avoid large feature spaces. Setting a very low value may be efficient but
384 | could result in under-fitting.
385 |
386 |
387 | feature_selection_method: str, default = 'classic'
388 | Algorithm for feature selection. 'classic' method uses permutation feature
389 |         importance techniques. The other possible value is 'boruta', which uses the
390 |         Boruta algorithm for feature selection.
391 |
392 |
393 | feature_interaction: bool, default = False
394 | When set to True, new features are created by interacting (a * b) all the
395 | numeric variables in the dataset. This feature is not scalable and may not
396 | work as expected on datasets with large feature space.
397 |
398 |
399 | feature_ratio: bool, default = False
400 | When set to True, new features are created by calculating the ratios (a / b)
401 | between all numeric variables in the dataset. This feature is not scalable and
402 | may not work as expected on datasets with large feature space.
403 |
404 |
405 |     interaction_threshold: float, default = 0.01
406 |         Similar to polynomial_threshold, it is used to compress a sparse matrix of newly
407 | created features through interaction. Features whose importance based on the
408 | combination of Random Forest, AdaBoost and Linear correlation falls within the
409 | percentile of the defined threshold are kept in the dataset. Remaining features
410 | are dropped before further processing.
411 |
412 |
413 | fix_imbalance: bool, default = False
414 |         When the training dataset has an unequal distribution of the target class, it can be
415 |         balanced using this parameter. When set to True, SMOTE (Synthetic Minority Over-sampling
416 |         Technique) is applied by default to create synthetic datapoints for the minority class.
417 |
418 |
419 | fix_imbalance_method: obj, default = None
420 | When ``fix_imbalance`` is True, 'imblearn' compatible object with 'fit_resample'
421 | method can be passed. When set to None, 'imblearn.over_sampling.SMOTE' is used.
422 |
423 |
424 | data_split_shuffle: bool, default = True
425 | When set to False, prevents shuffling of rows during 'train_test_split'.
426 |
427 |
428 | data_split_stratify: bool or list, default = False
429 | Controls stratification during 'train_test_split'. When set to True, will
430 | stratify by target column. To stratify on any other columns, pass a list of
431 | column names. Ignored when ``data_split_shuffle`` is False.
432 |
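        For example, to stratify on a specific column (a hedged sketch; 'StoreID'
        is an illustrative column name):

        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        data_split_stratify = ['StoreID'])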
433 |
434 | fold_strategy: str or sklearn CV generator object, default = 'stratifiedkfold'
435 | Choice of cross validation strategy. Possible values are:
436 |
437 | * 'kfold'
438 | * 'stratifiedkfold'
439 | * 'groupkfold'
440 | * 'timeseries'
441 | * a custom CV generator object compatible with scikit-learn.
442 |
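        For instance, a custom CV generator can be passed directly (a hedged
        sketch using scikit-learn's StratifiedKFold):

        >>> from sklearn.model_selection import StratifiedKFold
        >>> cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
        >>> exp_name = initializer(data = juice, target = 'Purchase', fold_strategy = cv)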
443 |
444 | fold: int, default = 10
445 | Number of folds to be used in cross validation. Must be at least 2. This is
446 | a global setting that can be over-written at function level by using ``fold``
447 | parameter. Ignored when ``fold_strategy`` is a custom object.
448 |
449 |
450 | fold_shuffle: bool, default = False
451 | Controls the shuffle parameter of CV. Only applicable when ``fold_strategy``
452 | is 'kfold' or 'stratifiedkfold'. Ignored when ``fold_strategy`` is a custom
453 | object.
454 |
455 |
456 | fold_groups: str or array-like, with shape (n_samples,), default = None
457 | Optional group labels when 'GroupKFold' is used for the cross validation.
458 | It takes an array with shape (n_samples, ) where n_samples is the number
459 | of rows in the training dataset. When string is passed, it is interpreted
460 | as the column name in the dataset containing group labels.
461 |
462 |
463 | n_jobs: int, default = -1
464 |         The number of jobs to run in parallel (for functions that support parallel
465 |         processing). -1 means using all processors. To run all functions on a single
466 | processor set n_jobs to None.
467 |
468 |
469 | use_gpu: bool or str, default = False
470 | When set to True, it will use GPU for training with algorithms that support it,
471 | and fall back to CPU if they are unavailable. When set to 'force', it will only
472 | use GPU-enabled algorithms and raise exceptions when they are unavailable. When
473 | False, all algorithms are trained using CPU only.
474 |
475 | GPU enabled algorithms:
476 |
477 | - Extreme Gradient Boosting, requires no further installation
478 |
479 | - CatBoost Classifier, requires no further installation
480 | (GPU is only enabled when data > 50,000 rows)
481 |
482 | - Light Gradient Boosting Machine, requires GPU installation
483 | https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html
484 |
485 | - Logistic Regression, Ridge Classifier, Random Forest, K Neighbors Classifier,
486 | Support Vector Machine, requires cuML >= 0.15
487 | https://github.com/rapidsai/cuml
488 |
489 |
490 | custom_pipeline: (str, transformer) or list of (str, transformer), default = None
491 |         When passed, the custom transformers are appended to the preprocessing pipeline
492 |         and applied on each CV fold separately and on the final fit. All the custom
493 | transformations are applied after 'train_test_split' and before PyRapidML's internal
494 | transformations.
495 |
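        A hedged sketch, appending a scikit-learn transformer (the step name
        'scaler' is illustrative):

        >>> from sklearn.preprocessing import StandardScaler
        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        custom_pipeline = [('scaler', StandardScaler())])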
496 |
497 | html: bool, default = True
498 | When set to False, prevents runtime display of monitor. This must be set to False
499 | when the environment does not support IPython. For example, command line terminal,
500 | Databricks Notebook, Spyder and other similar IDEs.
501 |
502 |
503 | session_id: int, default = None
504 | Controls the randomness of experiment. It is equivalent to 'random_state' in
505 | scikit-learn. When None, a pseudo random number is generated. This can be used
506 | for later reproducibility of the entire experiment.
507 |
508 |
509 | log_experiment: bool, default = False
510 | When set to True, all metrics and parameters are logged on the ``MLFlow`` server.
511 |
512 |
513 | experiment_name: str, default = None
514 | Name of the experiment for logging. Ignored when ``log_experiment`` is not True.
515 |
516 |
517 | log_plots: bool or list, default = False
518 | When set to True, certain plots are logged automatically in the ``MLFlow`` server.
519 | To change the type of plots to be logged, pass a list containing plot IDs. Refer
520 | to documentation of ``plot_model``. Ignored when ``log_experiment`` is not True.
521 |
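        For example, using plot IDs from the list documented in ``plot_model``
        (a hedged sketch):

        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        log_experiment = True,
        ...                        log_plots = ['auc', 'confusion_matrix'])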
522 |
523 | log_profile: bool, default = False
524 | When set to True, data profile is logged on the ``MLflow`` server as a html file.
525 | Ignored when ``log_experiment`` is not True.
526 |
527 |
528 | log_data: bool, default = False
529 | When set to True, dataset is logged on the ``MLflow`` server as a csv file.
530 | Ignored when ``log_experiment`` is not True.
531 |
532 |
533 | silent: bool, default = False
534 | Controls the confirmation input of data types when ``setup`` is executed. When
535 | executing in completely automated mode or on a remote kernel, this must be True.
536 |
537 |
538 | verbose: bool, default = True
539 | When set to False, Information grid is not printed.
540 |
541 |
542 | profile: bool, default = False
543 | When set to True, an interactive EDA report is displayed.
544 |
545 |
546 | profile_kwargs: dict, default = {} (empty dict)
547 | Dictionary of arguments passed to the ProfileReport method used
548 | to create the EDA report. Ignored if ``profile`` is False.
549 |
550 |
551 | Returns:
552 | Global variables that can be changed using the ``set_config`` function.
553 |
554 | """
555 |
556 | available_plots = {
557 | "parameter": "Hyperparameters",
558 | "auc": "AUC",
559 | "confusion_matrix": "Confusion Matrix",
560 | "threshold": "Threshold",
561 | "pr": "Precision Recall",
562 | "error": "Prediction Error",
563 | "class_report": "Class Report",
564 | "rfe": "Feature Selection",
565 | "learning": "Learning Curve",
566 | "manifold": "Manifold Learning",
567 | "calibration": "Calibration Curve",
568 | "vc": "Validation Curve",
569 | "dimension": "Dimensions",
570 | "feature": "Feature Importance",
571 | "feature_all": "Feature Importance (All)",
572 | "boundary": "Decision Boundary",
573 | "lift": "Lift Chart",
574 | "gain": "Gain Chart",
575 | "tree": "Decision Tree",
576 | }
577 |
578 |     if log_plots is True:
579 | log_plots = ["auc", "confusion_matrix", "feature"]
580 |
581 | return pycaret.internal.tabular.setup(
582 | ml_usecase="classification",
583 | available_plots=available_plots,
584 | data=data,
585 | target=target,
586 | train_size=train_size,
587 | test_data=test_data,
588 | preprocess=preprocess,
589 | imputation_type=imputation_type,
590 | iterative_imputation_iters=iterative_imputation_iters,
591 | categorical_features=categorical_features,
592 | categorical_imputation=categorical_imputation,
593 | categorical_iterative_imputer=categorical_iterative_imputer,
594 | ordinal_features=ordinal_features,
595 | high_cardinality_features=high_cardinality_features,
596 | high_cardinality_method=high_cardinality_method,
597 | numeric_features=numeric_features,
598 | numeric_imputation=numeric_imputation,
599 | numeric_iterative_imputer=numeric_iterative_imputer,
600 | date_features=date_features,
601 | ignore_features=ignore_features,
602 | normalize=normalize,
603 | normalize_method=normalize_method,
604 | transformation=transformation,
605 | transformation_method=transformation_method,
606 | handle_unknown_categorical=handle_unknown_categorical,
607 | unknown_categorical_method=unknown_categorical_method,
608 | pca=pca,
609 | pca_method=pca_method,
610 | pca_components=pca_components,
611 | ignore_low_variance=ignore_low_variance,
612 | combine_rare_levels=combine_rare_levels,
613 | rare_level_threshold=rare_level_threshold,
614 | bin_numeric_features=bin_numeric_features,
615 | remove_outliers=remove_outliers,
616 | outliers_threshold=outliers_threshold,
617 | remove_multicollinearity=remove_multicollinearity,
618 | multicollinearity_threshold=multicollinearity_threshold,
619 | remove_perfect_collinearity=remove_perfect_collinearity,
620 | create_clusters=create_clusters,
621 | cluster_iter=cluster_iter,
622 | polynomial_features=polynomial_features,
623 | polynomial_degree=polynomial_degree,
624 | trigonometry_features=trigonometry_features,
625 | polynomial_threshold=polynomial_threshold,
626 | group_features=group_features,
627 | group_names=group_names,
628 | feature_selection=feature_selection,
629 | feature_selection_threshold=feature_selection_threshold,
630 | feature_selection_method=feature_selection_method,
631 | feature_interaction=feature_interaction,
632 | feature_ratio=feature_ratio,
633 | interaction_threshold=interaction_threshold,
634 | fix_imbalance=fix_imbalance,
635 | fix_imbalance_method=fix_imbalance_method,
636 | data_split_shuffle=data_split_shuffle,
637 | data_split_stratify=data_split_stratify,
638 | fold_strategy=fold_strategy,
639 | fold=fold,
640 | fold_shuffle=fold_shuffle,
641 | fold_groups=fold_groups,
642 | n_jobs=n_jobs,
643 | use_gpu=use_gpu,
644 | custom_pipeline=custom_pipeline,
645 | html=html,
646 | session_id=session_id,
647 | log_experiment=log_experiment,
648 | experiment_name=experiment_name,
649 | log_plots=log_plots,
650 | log_profile=log_profile,
651 | log_data=log_data,
652 | silent=silent,
653 | verbose=verbose,
654 | profile=profile,
655 | profile_kwargs=profile_kwargs,
656 | )
657 |
658 |
659 | def comparing_models(
660 | include: Optional[List[Union[str, Any]]] = None,
661 | exclude: Optional[List[str]] = None,
662 | fold: Optional[Union[int, Any]] = None,
663 | round: int = 4,
664 | cross_validation: bool = True,
665 | sort: str = "Accuracy",
666 | n_select: int = 1,
667 | budget_time: Optional[float] = None,
668 | turbo: bool = True,
669 | errors: str = "ignore",
670 | fit_kwargs: Optional[dict] = None,
671 | groups: Optional[Union[str, Any]] = None,
672 | verbose: bool = True,
673 | ) -> Union[Any, List[Any]]:
674 |
675 | """
676 | This function trains and evaluates performance of all estimators available in the
677 | model library using cross validation. The output of this function is a score grid
678 | with average cross validated scores. Metrics evaluated during CV can be accessed
679 | using the ``get_metrics`` function. Custom metrics can be added or removed using
680 | ``add_metric`` and ``remove_metric`` function.
681 |
682 | Example
683 | -------
684 |     >>> from PyRapidML.datasets import extract_data
685 | >>> juice = extract_data('juice')
686 | >>> from PyRapidML.classification import *
687 | >>> exp_name = initializer(data = juice, target = 'Purchase')
688 | >>> best_model = comparing_models()
689 |
690 |
691 | include: list of str or scikit-learn compatible object, default = None
692 | To train and evaluate select models, list containing model ID or scikit-learn
693 | compatible object can be passed in include param. To see a list of all models
694 | available in the model library use the ``models`` function.
695 |
696 |
697 | exclude: list of str, default = None
698 | To omit certain models from training and evaluation, pass a list containing
699 | model id in the exclude parameter. To see a list of all models available
700 | in the model library use the ``models`` function.
701 |
702 |
703 | fold: int or scikit-learn compatible CV generator, default = None
704 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
705 | parameter of the ``setup`` function is used. When an integer is passed,
706 | it is interpreted as the 'n_splits' parameter of the CV generator in the
707 | ``setup`` function.
708 |
709 |
710 | round: int, default = 4
711 | Number of decimal places the metrics in the score grid will be rounded to.
712 |
713 |
714 | cross_validation: bool, default = True
715 | When set to False, metrics are evaluated on holdout set. ``fold`` param
716 | is ignored when cross_validation is set to False.
717 |
718 |
719 | sort: str, default = 'Accuracy'
720 | The sort order of the score grid. It also accepts custom metrics that are
721 | added through the ``add_metric`` function.
722 |
723 |
724 | n_select: int, default = 1
725 | Number of top_n models to return. For example, to select top 3 models use
726 | n_select = 3.
727 |
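        For example, to keep the three best models ranked by AUC (a hedged
        sketch reusing the setup example above):

        >>> top3 = comparing_models(sort = 'AUC', n_select = 3)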
728 |
729 | budget_time: int or float, default = None
730 | If not None, will terminate execution of the function after budget_time
731 | minutes have passed and return results up to that point.
732 |
733 |
734 | turbo: bool, default = True
735 | When set to True, it excludes estimators with longer training times. To
736 | see which algorithms are excluded use the ``models`` function.
737 |
738 |
739 | errors: str, default = 'ignore'
740 |         When set to 'ignore', will skip models that raise exceptions and continue.
741 | If 'raise', will break the function when exceptions are raised.
742 |
743 |
744 | fit_kwargs: dict, default = {} (empty dict)
745 | Dictionary of arguments passed to the fit method of the model.
746 |
747 |
748 | groups: str or array-like, with shape (n_samples,), default = None
749 | Optional group labels when 'GroupKFold' is used for the cross validation.
750 | It takes an array with shape (n_samples, ) where n_samples is the number
751 | of rows in the training dataset. When string is passed, it is interpreted
752 | as the column name in the dataset containing group labels.
753 |
754 |
755 | verbose: bool, default = True
756 | Score grid is not printed when verbose is set to False.
757 |
758 |
759 | Returns:
760 | Trained model or list of trained models, depending on the ``n_select`` param.
761 |
762 | Warnings
763 | --------
764 | - Changing turbo parameter to False may result in very high training times with
765 | datasets exceeding 10,000 rows.
766 |
767 |     - AUC for estimators that do not support 'predict_proba' is shown as 0.0000.
768 |
769 | - No models are logged in ``MLFlow`` when ``cross_validation`` parameter is False.
770 | """
771 |
772 | return pycaret.internal.tabular.compare_models(
773 | include=include,
774 | exclude=exclude,
775 | fold=fold,
776 | round=round,
777 | cross_validation=cross_validation,
778 | sort=sort,
779 | n_select=n_select,
780 | budget_time=budget_time,
781 | turbo=turbo,
782 | errors=errors,
783 | fit_kwargs=fit_kwargs,
784 | groups=groups,
785 | verbose=verbose,
786 | )
787 |
788 |
789 | def creating_model(
790 | estimator: Union[str, Any],
791 | fold: Optional[Union[int, Any]] = None,
792 | round: int = 4,
793 | cross_validation: bool = True,
794 | fit_kwargs: Optional[dict] = None,
795 | groups: Optional[Union[str, Any]] = None,
796 | verbose: bool = True,
797 | **kwargs,
798 | ) -> Any:
799 |
800 | """
801 | This function trains and evaluates the performance of a given estimator
802 | using cross validation. The output of this function is a score grid with
803 | CV scores by fold. Metrics evaluated during CV can be accessed using the
804 | ``get_metrics`` function. Custom metrics can be added or removed using
805 | ``add_metric`` and ``remove_metric`` function. All the available models
806 | can be accessed using the ``models`` function.
807 |
808 | Example
809 | -------
810 |     >>> from PyRapidML.datasets import extract_data
811 | >>> juice = extract_data('juice')
812 | >>> from PyRapidML.classification import *
813 | >>> exp_name = initializer(data = juice, target = 'Purchase')
814 | >>> lr = creating_model('lr')
815 |
816 |
817 | estimator: str or scikit-learn compatible object
818 | ID of an estimator available in model library or pass an untrained
819 | model object consistent with scikit-learn API. Estimators available
820 | in the model library (ID - Name):
821 |
822 | * 'lr' - Logistic Regression
823 | * 'knn' - K Neighbors Classifier
824 | * 'nb' - Naive Bayes
825 | * 'dt' - Decision Tree Classifier
826 | * 'svm' - SVM - Linear Kernel
827 | * 'rbfsvm' - SVM - Radial Kernel
828 | * 'gpc' - Gaussian Process Classifier
829 | * 'mlp' - MLP Classifier
830 | * 'ridge' - Ridge Classifier
831 | * 'rf' - Random Forest Classifier
832 | * 'qda' - Quadratic Discriminant Analysis
833 | * 'ada' - Ada Boost Classifier
834 | * 'gbc' - Gradient Boosting Classifier
835 | * 'lda' - Linear Discriminant Analysis
836 | * 'et' - Extra Trees Classifier
837 | * 'xgboost' - Extreme Gradient Boosting
838 | * 'lightgbm' - Light Gradient Boosting Machine
839 | * 'catboost' - CatBoost Classifier
840 |
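        An untrained scikit-learn estimator can also be passed directly (a hedged
        sketch; ``max_iter`` is just an illustrative argument):

        >>> from sklearn.linear_model import LogisticRegression
        >>> lr = creating_model(LogisticRegression(max_iter = 1000))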
841 |
842 | fold: int or scikit-learn compatible CV generator, default = None
843 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
844 | parameter of the ``setup`` function is used. When an integer is passed,
845 | it is interpreted as the 'n_splits' parameter of the CV generator in the
846 | ``setup`` function.
847 |
848 |
849 | round: int, default = 4
850 | Number of decimal places the metrics in the score grid will be rounded to.
851 |
852 |
853 | cross_validation: bool, default = True
854 | When set to False, metrics are evaluated on holdout set. ``fold`` param
855 | is ignored when cross_validation is set to False.
856 |
857 |
858 | fit_kwargs: dict, default = {} (empty dict)
859 | Dictionary of arguments passed to the fit method of the model.
860 |
861 |
862 | groups: str or array-like, with shape (n_samples,), default = None
863 | Optional group labels when GroupKFold is used for the cross validation.
864 | It takes an array with shape (n_samples, ) where n_samples is the number
865 | of rows in training dataset. When string is passed, it is interpreted as
866 | the column name in the dataset containing group labels.
867 |
868 |
869 | verbose: bool, default = True
870 | Score grid is not printed when verbose is set to False.
871 |
872 |
873 | **kwargs**:
874 | Additional keyword arguments to pass to the estimator.
875 |
876 |
877 | Returns:
878 | Trained Model
879 |
880 |
881 | Warnings
882 | --------
883 |     - AUC for estimators that do not support 'predict_proba' is shown as 0.0000.
884 |
885 | - Models are not logged on the ``MLFlow`` server when ``cross_validation`` param
886 | is set to False.
887 |
888 | """
889 |
890 | return pycaret.internal.tabular.create_model_supervised(
891 | estimator=estimator,
892 | fold=fold,
893 | round=round,
894 | cross_validation=cross_validation,
895 | fit_kwargs=fit_kwargs,
896 | groups=groups,
897 | verbose=verbose,
898 | **kwargs,
899 | )
900 |
901 |
902 | def tuning_model(
903 | estimator,
904 | fold: Optional[Union[int, Any]] = None,
905 | round: int = 4,
906 | n_iter: int = 10,
907 | custom_grid: Optional[Union[Dict[str, list], Any]] = None,
908 | optimize: str = "Accuracy",
909 | custom_scorer=None,
910 | search_library: str = "scikit-learn",
911 | search_algorithm: Optional[str] = None,
912 | early_stopping: Any = False,
913 | early_stopping_max_iters: int = 10,
914 | choose_better: bool = False,
915 | fit_kwargs: Optional[dict] = None,
916 | groups: Optional[Union[str, Any]] = None,
917 | return_tuner: bool = False,
918 | verbose: bool = True,
919 | tuner_verbose: Union[int, bool] = True,
920 | **kwargs,
921 | ) -> Any:
922 |
923 | """
924 | This function tunes the hyperparameters of a given estimator. The output of
925 | this function is a score grid with CV scores by fold of the best selected
926 | model based on ``optimize`` parameter. Metrics evaluated during CV can be
927 | accessed using the ``get_metrics`` function. Custom metrics can be added
928 | or removed using ``add_metric`` and ``remove_metric`` function.
929 |
930 | Example
931 | -------
932 |     >>> from PyRapidML.datasets import extract_data
933 | >>> juice = extract_data('juice')
934 | >>> from PyRapidML.classification import *
935 | >>> exp_name = initializer(data = juice, target = 'Purchase')
936 | >>> lr = creating_model('lr')
937 | >>> tuned_lr = tuning_model(lr)
938 |
939 |
940 | estimator: scikit-learn compatible object
941 | Trained model object
942 |
943 |
944 | fold: int or scikit-learn compatible CV generator, default = None
945 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
946 | parameter of the ``setup`` function is used. When an integer is passed,
947 | it is interpreted as the 'n_splits' parameter of the CV generator in the
948 | ``setup`` function.
949 |
950 |
951 | round: int, default = 4
952 | Number of decimal places the metrics in the score grid will be rounded to.
953 |
954 |
955 | n_iter: int, default = 10
956 | Number of iterations in the grid search. Increasing 'n_iter' may improve
957 | model performance but also increases the training time.
958 |
959 |
960 | custom_grid: dictionary, default = None
961 | To define custom search space for hyperparameters, pass a dictionary with
962 | parameter name and values to be iterated. Custom grids must be in a format
963 | supported by the defined ``search_library``.
964 |
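        A minimal sketch for the logistic regression created above ('C' is a
        standard scikit-learn LogisticRegression hyperparameter):

        >>> tuned_lr = tuning_model(lr, custom_grid = {'C': [0.1, 1.0, 10.0]})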
965 |
966 | optimize: str, default = 'Accuracy'
967 | Metric name to be evaluated for hyperparameter tuning. It also accepts custom
968 | metrics that are added through the ``add_metric`` function.
969 |
970 |
971 | custom_scorer: object, default = None
972 |         A custom scoring strategy can be passed to tune hyperparameters of the model.
973 |         It must be created using ``sklearn.make_scorer``. It is equivalent to adding a
974 |         custom metric using the ``add_metric`` function and passing the name of the
975 |         custom metric in the ``optimize`` parameter.
976 |         It will be deprecated in a future release.
977 |
978 |
979 | search_library: str, default = 'scikit-learn'
980 | The search library used for tuning hyperparameters. Possible values:
981 |
982 | - 'scikit-learn' - default, requires no further installation
983 | https://github.com/scikit-learn/scikit-learn
984 |
985 | - 'scikit-optimize' - ``pip install scikit-optimize``
986 | https://scikit-optimize.github.io/stable/
987 |
988 | - 'tune-sklearn' - ``pip install tune-sklearn ray[tune]``
989 | https://github.com/ray-project/tune-sklearn
990 |
991 | - 'optuna' - ``pip install optuna``
992 | https://optuna.org/
993 |
994 |
995 | search_algorithm: str, default = None
996 | The search algorithm depends on the ``search_library`` parameter.
997 | Some search algorithms require additional libraries to be installed.
998 | If None, will use search library-specific default algorithm.
999 |
1000 | - 'scikit-learn' possible values:
1001 | - 'random' : random grid search (default)
1002 | - 'grid' : grid search
1003 |
1004 | - 'scikit-optimize' possible values:
1005 | - 'bayesian' : Bayesian search (default)
1006 |
1007 | - 'tune-sklearn' possible values:
1008 | - 'random' : random grid search (default)
1009 | - 'grid' : grid search
1010 | - 'bayesian' : ``pip install scikit-optimize``
1011 | - 'hyperopt' : ``pip install hyperopt``
1012 | - 'optuna' : ``pip install optuna``
1013 | - 'bohb' : ``pip install hpbandster ConfigSpace``
1014 |
1015 | - 'optuna' possible values:
1016 | - 'random' : randomized search
1017 | - 'tpe' : Tree-structured Parzen Estimator search (default)
1018 |
1019 |
1020 | early_stopping: bool or str or object, default = False
1021 | Use early stopping to stop fitting to a hyperparameter configuration
1022 | if it performs poorly. Ignored when ``search_library`` is scikit-learn,
1023 | or if the estimator does not have 'partial_fit' attribute. If False or
1024 | None, early stopping will not be used. Can be either an object accepted
1025 | by the search library or one of the following:
1026 |
1027 | - 'asha' for Asynchronous Successive Halving Algorithm
1028 | - 'hyperband' for Hyperband
1029 | - 'median' for Median Stopping Rule
1031 |
1032 |
1033 | early_stopping_max_iters: int, default = 10
1034 | Maximum number of epochs to run for each sampled configuration.
1035 | Ignored if ``early_stopping`` is False or None.
1036 |
1037 |
1038 | choose_better: bool, default = False
1039 | When set to True, the returned object is always better performing. The
1040 | metric used for comparison is defined by the ``optimize`` parameter.
1041 |
1042 |
1043 | fit_kwargs: dict, default = {} (empty dict)
1044 | Dictionary of arguments passed to the fit method of the tuner.
1045 |
1046 |
1047 | groups: str or array-like, with shape (n_samples,), default = None
1048 | Optional group labels when GroupKFold is used for the cross validation.
1049 | It takes an array with shape (n_samples, ) where n_samples is the number
1050 | of rows in training dataset. When string is passed, it is interpreted as
1051 | the column name in the dataset containing group labels.
1052 |
1053 |
1054 | return_tuner: bool, default = False
1055 | When set to True, will return a tuple of (model, tuner_object).
1056 |
1057 |
1058 | verbose: bool, default = True
1059 | Score grid is not printed when verbose is set to False.
1060 |
1061 |
1062 |     tuner_verbose: bool or int, default = True
1063 | If True or above 0, will print messages from the tuner. Higher values
1064 | print more messages. Ignored when ``verbose`` param is False.
1065 |
1066 |
1067 | **kwargs**:
1068 | Additional keyword arguments to pass to the optimizer.
1069 |
1070 |
1071 | Returns:
1072 | Trained Model and Optional Tuner Object when ``return_tuner`` is True.
1073 |
1074 |
1075 | Warnings
1076 | --------
1077 | - Using 'grid' as ``search_algorithm`` may result in very long computation.
1078 | Only recommended with smaller search spaces that can be defined in the
1079 | ``custom_grid`` parameter.
1080 |
1081 | - ``search_library`` 'tune-sklearn' does not support GPU models.
1082 |
1083 | """
1084 |
1085 | return pycaret.internal.tabular.tune_model_supervised(
1086 | estimator=estimator,
1087 | fold=fold,
1088 | round=round,
1089 | n_iter=n_iter,
1090 | custom_grid=custom_grid,
1091 | optimize=optimize,
1092 | custom_scorer=custom_scorer,
1093 | search_library=search_library,
1094 | search_algorithm=search_algorithm,
1095 | early_stopping=early_stopping,
1096 | early_stopping_max_iters=early_stopping_max_iters,
1097 | choose_better=choose_better,
1098 | fit_kwargs=fit_kwargs,
1099 | groups=groups,
1100 | return_tuner=return_tuner,
1101 | verbose=verbose,
1102 | tuner_verbose=tuner_verbose,
1103 | **kwargs,
1104 | )
1105 |
1106 |
1107 | def ensemble_model(
1108 | estimator,
1109 | method: str = "Bagging",
1110 | fold: Optional[Union[int, Any]] = None,
1111 | n_estimators: int = 10,
1112 | round: int = 4,
1113 | choose_better: bool = False,
1114 | optimize: str = "Accuracy",
1115 | fit_kwargs: Optional[dict] = None,
1116 | groups: Optional[Union[str, Any]] = None,
1117 | verbose: bool = True,
1118 | ) -> Any:
1119 |
1120 | """
1121 | This function ensembles a given estimator. The output of this function is
1122 | a score grid with CV scores by fold. Metrics evaluated during CV can be
1123 | accessed using the ``get_metrics`` function. Custom metrics can be added
1124 | or removed using ``add_metric`` and ``remove_metric`` function.
1125 |
1126 |
1127 | Example
1128 | -------
1129 |     >>> from PyRapidML.datasets import extract_data
1130 | >>> juice = extract_data('juice')
1131 | >>> from PyRapidML.classification import *
1132 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1133 | >>> dt = creating_model('dt')
1134 | >>> bagged_dt = ensemble_model(dt, method = 'Bagging')
1135 |
1136 |
1137 | estimator: scikit-learn compatible object
1138 | Trained model object
1139 |
1140 |
1141 | method: str, default = 'Bagging'
1142 | Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'.
1143 |
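        For example, to boost rather than bag the decision tree from the example
        above (a hedged sketch; ``n_estimators`` is documented below):

        >>> boosted_dt = ensemble_model(dt, method = 'Boosting', n_estimators = 50)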
1144 |
1145 | fold: int or scikit-learn compatible CV generator, default = None
1146 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1147 | parameter of the ``setup`` function is used. When an integer is passed,
1148 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1149 | ``setup`` function.
1150 |
1151 |
1152 | n_estimators: int, default = 10
1153 | The number of base estimators in the ensemble. In case of perfect fit, the
1154 | learning procedure is stopped early.
1155 |
1156 |
1157 | round: int, default = 4
1158 | Number of decimal places the metrics in the score grid will be rounded to.
1159 |
1160 |
1161 | choose_better: bool, default = False
1162 | When set to True, the returned object is always better performing. The
1163 | metric used for comparison is defined by the ``optimize`` parameter.
1164 |
1165 |
1166 | optimize: str, default = 'Accuracy'
1167 | Metric to compare for model selection when ``choose_better`` is True.
1168 |
1169 |
1170 | fit_kwargs: dict, default = {} (empty dict)
1171 | Dictionary of arguments passed to the fit method of the model.
1172 |
1173 |
1174 | groups: str or array-like, with shape (n_samples,), default = None
1175 | Optional group labels when GroupKFold is used for the cross validation.
1176 | It takes an array with shape (n_samples, ) where n_samples is the number
1177 | of rows in training dataset. When string is passed, it is interpreted as
1178 | the column name in the dataset containing group labels.
1179 |
1180 |
1181 | verbose: bool, default = True
1182 | Score grid is not printed when verbose is set to False.
1183 |
1184 |
1185 | Returns:
1186 | Trained Model
1187 |
1188 |
1189 | Warnings
1190 | --------
1191 | - Method 'Boosting' is not supported for estimators that do not have 'class_weights'
1192 | or 'predict_proba' attributes.
1193 |
1194 | """
1195 |
1196 | return pycaret.internal.tabular.ensemble_model(
1197 | estimator=estimator,
1198 | method=method,
1199 | fold=fold,
1200 | n_estimators=n_estimators,
1201 | round=round,
1202 | choose_better=choose_better,
1203 | optimize=optimize,
1204 | fit_kwargs=fit_kwargs,
1205 | groups=groups,
1206 | verbose=verbose,
1207 | )
1208 |
1209 |
1210 | def blend_models(
1211 | estimator_list: list,
1212 | fold: Optional[Union[int, Any]] = None,
1213 | round: int = 4,
1214 | choose_better: bool = False,
1215 | optimize: str = "Accuracy",
1216 | method: str = "auto",
1217 | weights: Optional[List[float]] = None,
1218 | fit_kwargs: Optional[dict] = None,
1219 | groups: Optional[Union[str, Any]] = None,
1220 | verbose: bool = True,
1221 | ) -> Any:
1222 |
1223 | """
1224 | This function trains a Soft Voting / Majority Rule classifier for select
1225 | models passed in the ``estimator_list`` param. The output of this function
1226 | is a score grid with CV scores by fold. Metrics evaluated during CV can be
1227 | accessed using the ``get_metrics`` function. Custom metrics can be added
1228 | or removed using ``add_metric`` and ``remove_metric`` function.
1229 |
1230 |
1231 | Example
1232 | -------
1233 |     >>> from PyRapidML.datasets import extract_data
1234 |     >>> juice = extract_data('juice')
1235 |     >>> from PyRapidML.classification import *
1236 |     >>> exp_name = initializer(data = juice, target = 'Purchase')
1237 |     >>> top3 = comparing_models(n_select = 3)
1238 | >>> blender = blend_models(top3)
1239 |
1240 |
1241 | estimator_list: list of scikit-learn compatible objects
1242 | List of trained model objects
1243 |
1244 |
1245 | fold: int or scikit-learn compatible CV generator, default = None
1246 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1247 | parameter of the ``setup`` function is used. When an integer is passed,
1248 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1249 | ``setup`` function.
1250 |
1251 |
1252 | round: int, default = 4
1253 | Number of decimal places the metrics in the score grid will be rounded to.
1254 |
1255 |
1256 | choose_better: bool, default = False
1257 | When set to True, the returned object is always better performing. The
1258 | metric used for comparison is defined by the ``optimize`` parameter.
1259 |
1260 |
1261 | optimize: str, default = 'Accuracy'
1262 | Metric to compare for model selection when ``choose_better`` is True.
1263 |
1264 |
1265 | method: str, default = 'auto'
1266 |         'hard' uses predicted class labels for majority rule voting. 'soft' predicts
1267 | the class label based on the argmax of the sums of the predicted probabilities,
1268 | which is recommended for an ensemble of well-calibrated classifiers. Default
1269 | value, 'auto', will try to use 'soft' and fall back to 'hard' if the former is
1270 | not supported.
1271 |
1272 |
1273 | weights: list, default = None
1274 | Sequence of weights (float or int) to weight the occurrences of predicted class
1275 | labels (hard voting) or class probabilities before averaging (soft voting). Uses
1276 | uniform weights when None.
1277 |
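        For example, to weight the three blended models unequally (a hedged
        sketch; the weights are illustrative):

        >>> blender = blend_models(top3, weights = [0.5, 0.3, 0.2])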
1278 |
1279 | fit_kwargs: dict, default = {} (empty dict)
1280 | Dictionary of arguments passed to the fit method of the model.
1281 |
1282 |
1283 | groups: str or array-like, with shape (n_samples,), default = None
1284 | Optional group labels when GroupKFold is used for the cross validation.
1285 | It takes an array with shape (n_samples, ) where n_samples is the number
1286 | of rows in training dataset. When string is passed, it is interpreted as
1287 | the column name in the dataset containing group labels.
1288 |
1289 |
1290 | verbose: bool, default = True
1291 | Score grid is not printed when verbose is set to False.
1292 |
1293 |
1294 | Returns:
1295 | Trained Model
1296 |
1297 | """
1298 |
1299 | return pycaret.internal.tabular.blend_models(
1300 | estimator_list=estimator_list,
1301 | fold=fold,
1302 | round=round,
1303 | choose_better=choose_better,
1304 | optimize=optimize,
1305 | method=method,
1306 | weights=weights,
1307 | fit_kwargs=fit_kwargs,
1308 | groups=groups,
1309 | verbose=verbose,
1310 | )
1311 |
1312 |
1313 | def stack_models(
1314 | estimator_list: list,
1315 | meta_model=None,
1316 | fold: Optional[Union[int, Any]] = None,
1317 | round: int = 4,
1318 | method: str = "auto",
1319 | restack: bool = True,
1320 | choose_better: bool = False,
1321 | optimize: str = "Accuracy",
1322 | fit_kwargs: Optional[dict] = None,
1323 | groups: Optional[Union[str, Any]] = None,
1324 | verbose: bool = True,
1325 | ) -> Any:
1326 |
1327 | """
1328 | This function trains a meta model over select estimators passed in
1329 | the ``estimator_list`` parameter. The output of this function is a
1330 | score grid with CV scores by fold. Metrics evaluated during CV can
1331 | be accessed using the ``get_metrics`` function. Custom metrics
1332 | can be added or removed using the ``add_metric`` and ``remove_metric``
1333 | functions.
1334 |
1335 |
1336 | Example
1337 | -------
1338 | >>> from PyRapidML.datasets import extract_data
1339 | >>> juice = extract_data('juice')
1340 | >>> from PyRapidML.classification import *
1341 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1342 | >>> top3 = comparing_models(n_select = 3)
1343 | >>> stacker = stack_models(top3)
1344 |
1345 |
1346 | estimator_list: list of scikit-learn compatible objects
1347 | List of trained model objects
1348 |
1349 |
1350 | meta_model: scikit-learn compatible object, default = None
1351 | When None, Logistic Regression is trained as a meta model.
1352 |
1353 |
1354 | fold: int or scikit-learn compatible CV generator, default = None
1355 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1356 | parameter of the ``setup`` function is used. When an integer is passed,
1357 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1358 | ``setup`` function.
1359 |
1360 |
1361 | round: int, default = 4
1362 | Number of decimal places the metrics in the score grid will be rounded to.
1363 |
1364 |
1365 | method: str, default = 'auto'
1366 | When set to 'auto', it will invoke, for each estimator, 'predict_proba',
1367 | 'decision_function' or 'predict' in that order. Otherwise, manually pass
1368 | one of 'predict_proba', 'decision_function' or 'predict'.
1369 |
1370 |
1371 | restack: bool, default = True
1372 | When set to False, only the predictions of estimators will be used as
1373 | training data for the ``meta_model``.
1374 |
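A minimal sketch of stacking with a custom meta model and no restacking,
assuming the ``top3`` models from the example above:

>>> lr = creating_model('lr')  # meta model chosen only for illustration
>>> stacker = stack_models(top3, meta_model = lr, restack = False)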
1375 |
1376 | choose_better: bool, default = False
1377 | When set to True, the function returns the better performing of the stacked
1378 | model and the input estimators, as measured by the ``optimize`` metric.
1379 |
1380 |
1381 | optimize: str, default = 'Accuracy'
1382 | Metric to compare for model selection when ``choose_better`` is True.
1383 |
1384 |
1385 | fit_kwargs: dict, default = {} (empty dict)
1386 | Dictionary of arguments passed to the fit method of the model.
1387 |
1388 |
1389 | groups: str or array-like, with shape (n_samples,), default = None
1390 | Optional group labels when GroupKFold is used for the cross validation.
1391 | It takes an array with shape (n_samples, ) where n_samples is the number
1392 | of rows in the training dataset. When a string is passed, it is interpreted
1393 | as the name of the column in the dataset that contains the group labels.
1394 |
1395 |
1396 | verbose: bool, default = True
1397 | Score grid is not printed when verbose is set to False.
1398 |
1399 |
1400 | Returns:
1401 | Trained Model
1402 |
1403 |
1404 | Warnings
1405 | --------
1406 | - When ``method`` is not set to 'auto', it will check if the defined method
1407 | is available for all estimators passed in ``estimator_list``. If the method is
1408 | not implemented by every estimator, it will raise an error.
1409 |
1410 | """
1411 |
1412 | return pycaret.internal.tabular.stack_models(
1413 | estimator_list=estimator_list,
1414 | meta_model=meta_model,
1415 | fold=fold,
1416 | round=round,
1417 | method=method,
1418 | restack=restack,
1419 | choose_better=choose_better,
1420 | optimize=optimize,
1421 | fit_kwargs=fit_kwargs,
1422 | groups=groups,
1423 | verbose=verbose,
1424 | )
1425 |
1426 |
1427 | def plot_model(
1428 | estimator,
1429 | plot: str = "auc",
1430 | scale: float = 1,
1431 | save: bool = False,
1432 | fold: Optional[Union[int, Any]] = None,
1433 | fit_kwargs: Optional[dict] = None,
1434 | groups: Optional[Union[str, Any]] = None,
1435 | use_train_data: bool = False,
1436 | verbose: bool = True,
1437 | display_format: Optional[str] = None,
1438 | ) -> str:
1439 |
1440 | """
1441 | This function analyzes the performance of a trained model on the holdout set.
1442 | It may require re-training the model in certain cases.
1443 |
1444 | Example
1445 | -------
1446 | >>> from PyRapidML.datasets import extract_data
1447 | >>> juice = extract_data('juice')
1448 | >>> from PyRapidML.classification import *
1449 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1450 | >>> lr = creating_model('lr')
1451 | >>> plot_model(lr, plot = 'auc')
1452 |
1453 |
1454 | estimator: scikit-learn compatible object
1455 | Trained model object
1456 |
1457 |
1458 | plot: str, default = 'auc'
1459 | List of available plots (ID - Name):
1460 |
1461 | * 'auc' - Area Under the Curve
1462 | * 'threshold' - Discrimination Threshold
1463 | * 'pr' - Precision Recall Curve
1464 | * 'confusion_matrix' - Confusion Matrix
1465 | * 'error' - Class Prediction Error
1466 | * 'class_report' - Classification Report
1467 | * 'boundary' - Decision Boundary
1468 | * 'rfe' - Recursive Feature Selection
1469 | * 'learning' - Learning Curve
1470 | * 'manifold' - Manifold Learning
1471 | * 'calibration' - Calibration Curve
1472 | * 'vc' - Validation Curve
1473 | * 'dimension' - Dimension Learning
1474 | * 'feature' - Feature Importance
1475 | * 'feature_all' - Feature Importance (All)
1476 | * 'parameter' - Model Hyperparameter
1477 | * 'lift' - Lift Curve
1478 | * 'gain' - Gain Chart
1479 | * 'tree' - Decision Tree
1480 |
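A minimal sketch, assuming the ``lr`` model from the example above, of
rendering one of the plots listed above at double resolution and saving it:

>>> plot_model(lr, plot = 'confusion_matrix', scale = 2, save = True)  # writes a png to the cwd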
1481 |
1482 | scale: float, default = 1
1483 | The resolution scale of the figure.
1484 |
1485 |
1486 | save: bool, default = False
1487 | When set to True, plot is saved in the current working directory.
1488 |
1489 |
1490 | fold: int or scikit-learn compatible CV generator, default = None
1491 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1492 | parameter of the ``setup`` function is used. When an integer is passed,
1493 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1494 | ``setup`` function.
1495 |
1496 |
1497 | fit_kwargs: dict, default = {} (empty dict)
1498 | Dictionary of arguments passed to the fit method of the model.
1499 |
1500 |
1501 | groups: str or array-like, with shape (n_samples,), default = None
1502 | Optional group labels when GroupKFold is used for the cross validation.
1503 | It takes an array with shape (n_samples, ) where n_samples is the number
1504 | of rows in the training dataset. When a string is passed, it is interpreted
1505 | as the name of the column in the dataset that contains the group labels.
1506 |
1507 |
1508 | use_train_data: bool, default = False
1509 | When set to True, the training data is used for plots instead
1510 | of the test data.
1511 |
1512 |
1513 | verbose: bool, default = True
1514 | When set to False, progress bar is not displayed.
1515 |
1516 |
1517 | display_format: str, default = None
1518 | To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'.
1519 | Currently, not all plots are supported.
1520 |
1521 |
1522 | Returns:
1523 | Path to the saved plot file when ``save`` is True; otherwise None.
1524 |
1525 |
1526 | Warnings
1527 | --------
1528 | - Estimators that do not support the 'predict_proba' method cannot be used for
1529 |   the 'auc' and 'calibration' plots.
1530 |
1531 | - When the target is multiclass, 'calibration', 'threshold', 'manifold' and 'rfe'
1532 | plots are not available.
1533 |
1534 | - When the 'max_features' parameter of a trained model object is not equal to
1535 | the number of samples in the training set, the 'rfe' plot is not available.
1536 |
1537 | """
1538 |
1539 | return pycaret.internal.tabular.plot_model(
1540 | estimator=estimator,
1541 | plot=plot,
1542 | scale=scale,
1543 | save=save,
1544 | fold=fold,
1545 | fit_kwargs=fit_kwargs,
1546 | groups=groups,
1547 | verbose=verbose,
1548 | use_train_data=use_train_data,
1549 | system=True,
1550 | display_format=display_format,
1551 | )
1552 |
1553 |
1554 | def evaluate_model(
1555 | estimator,
1556 | fold: Optional[Union[int, Any]] = None,
1557 | fit_kwargs: Optional[dict] = None,
1558 | groups: Optional[Union[str, Any]] = None,
1559 | use_train_data: bool = False,
1560 | ):
1561 |
1562 | """
1563 | This function displays a user interface for analyzing the performance of a trained
1564 | model. It calls the ``plot_model`` function internally.
1565 |
1566 |
1567 | Example
1568 | -------
1569 | >>> from PyRapidML.datasets import extract_data
1570 | >>> juice = extract_data('juice')
1571 | >>> from PyRapidML.classification import *
1572 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1573 | >>> lr = creating_model('lr')
1574 | >>> evaluate_model(lr)
1575 |
1576 |
1577 | estimator: scikit-learn compatible object
1578 | Trained model object
1579 |
1580 |
1581 | fold: int or scikit-learn compatible CV generator, default = None
1582 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1583 | parameter of the ``setup`` function is used. When an integer is passed,
1584 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1585 | ``setup`` function.
1586 |
1587 |
1588 | fit_kwargs: dict, default = {} (empty dict)
1589 | Dictionary of arguments passed to the fit method of the model.
1590 |
1591 |
1592 | groups: str or array-like, with shape (n_samples,), default = None
1593 | Optional group labels when GroupKFold is used for the cross validation.
1594 | It takes an array with shape (n_samples, ) where n_samples is the number
1595 | of rows in the training dataset. When a string is passed, it is interpreted
1596 | as the name of the column in the dataset that contains the group labels.
1597 |
1598 |
1599 | use_train_data: bool, default = False
1600 | When set to True, the training data is used for plots instead
1601 | of the test data.
1602 |
1603 |
1604 | Returns:
1605 | None
1606 |
1607 |
1608 | Warnings
1609 | --------
1610 | - This function only works in an IPython-enabled notebook.
1611 |
1612 | """
1613 |
1614 | return pycaret.internal.tabular.evaluate_model(
1615 | estimator=estimator,
1616 | fold=fold,
1617 | fit_kwargs=fit_kwargs,
1618 | groups=groups,
1619 | use_train_data=use_train_data,
1620 | )
1621 |
1622 |
1623 | def interpret_model(
1624 | estimator,
1625 | plot: str = "summary",
1626 | feature: Optional[str] = None,
1627 | observation: Optional[int] = None,
1628 | use_train_data: bool = False,
1629 | X_new_sample: Optional[pd.DataFrame] = None,
1630 | save: bool = False,
1631 | **kwargs,
1632 | ):
1633 |
1634 | """
1635 | This function analyzes the predictions generated from a tree-based model. It is
1636 | implemented using SHAP (SHapley Additive exPlanations). For more information,
1637 | see https://shap.readthedocs.io/en/latest/
1638 |
1639 | Example
1640 | -------
1641 | >>> from PyRapidML.datasets import extract_data
1642 | >>> juice = extract_data('juice')
1643 | >>> from PyRapidML.classification import *
1644 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1645 | >>> xgboost = creating_model('xgboost')
1646 | >>> interpret_model(xgboost)
1647 |
1648 |
1649 | estimator: scikit-learn compatible object
1650 | Trained model object
1651 |
1652 |
1653 | plot: str, default = 'summary'
1654 | Type of plot. Available options are: 'summary', 'correlation', and 'reason'.
1655 |
1656 |
1657 | feature: str, default = None
1658 | Feature to check correlation with. This parameter is only required when ``plot``
1659 | type is 'correlation'. When set to None, it uses the first column in the train
1660 | dataset.
1661 |
1662 |
1663 | observation: int, default = None
1664 | Observation index number in holdout set to explain. When ``plot`` is not
1665 | 'reason', this parameter is ignored.
1666 |
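A minimal sketch of a per-observation explanation, assuming the ``xgboost``
model from the example above (the index 1 is arbitrary):

>>> interpret_model(xgboost, plot = 'reason', observation = 1)  # explains one holdout row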
1667 |
1668 | use_train_data: bool, default = False
1669 | When set to True, the training data is used for plots instead
1670 | of the test data.
1671 |
1672 |
1673 | X_new_sample: pd.DataFrame, default = None
1674 | Row from an out-of-sample dataframe (neither train nor test data) to be plotted.
1675 | The sample must have the same columns as the raw input data, and it is transformed
1676 | by the preprocessing pipeline automatically before plotting.
1677 |
1678 |
1679 | save: bool, default = False
1680 | When set to True, Plot is saved as a 'png' file in current working directory.
1681 |
1682 |
1683 | **kwargs**:
1684 | Additional keyword arguments to pass to the plot.
1685 |
1686 |
1687 | Returns:
1688 | None
1689 |
1690 | """
1691 |
1692 | return pycaret.internal.tabular.interpret_model(
1693 | estimator=estimator,
1694 | plot=plot,
1695 | feature=feature,
1696 | observation=observation,
1697 | use_train_data=use_train_data,
1698 | X_new_sample=X_new_sample,
1699 | save=save,
1700 | **kwargs,
1701 | )
1702 |
1703 |
1704 | def calibrate_model(
1705 | estimator,
1706 | method: str = "sigmoid",
1707 | fold: Optional[Union[int, Any]] = None,
1708 | round: int = 4,
1709 | fit_kwargs: Optional[dict] = None,
1710 | groups: Optional[Union[str, Any]] = None,
1711 | verbose: bool = True,
1712 | ) -> Any:
1713 |
1714 | """
1715 | This function calibrates the predicted probabilities of a given estimator
1716 | using isotonic or logistic regression. The output of this function is a score
1717 | grid with CV scores by fold. Metrics evaluated during CV can be accessed using
1718 | the ``get_metrics`` function. Custom metrics can be added or removed using the
1719 | ``add_metric`` and ``remove_metric`` functions.
1720 |
1721 |
1722 | Example
1723 | -------
1724 | >>> from PyRapidML.datasets import extract_data
1725 | >>> juice = extract_data('juice')
1726 | >>> from PyRapidML.classification import *
1727 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1728 | >>> dt = creating_model('dt')
1729 | >>> calibrated_dt = calibrate_model(dt)
1730 |
1731 |
1732 | estimator: scikit-learn compatible object
1733 | Trained model object
1734 |
1735 |
1736 | method: str, default = 'sigmoid'
1737 | The method to use for calibration. Can be 'sigmoid' which corresponds to
1738 | Platt's method or 'isotonic' which is a non-parametric approach.
1739 |
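A minimal sketch of the non-parametric alternative, assuming the ``dt`` model
from the example above (see the warning below on sample size):

>>> iso_dt = calibrate_model(dt, method = 'isotonic')  # isotonic instead of Platt scaling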
1740 |
1741 | fold: int or scikit-learn compatible CV generator, default = None
1742 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1743 | parameter of the ``setup`` function is used. When an integer is passed,
1744 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1745 | ``setup`` function.
1746 |
1747 |
1748 | round: int, default = 4
1749 | Number of decimal places the metrics in the score grid will be rounded to.
1750 |
1751 |
1752 | fit_kwargs: dict, default = {} (empty dict)
1753 | Dictionary of arguments passed to the fit method of the model.
1754 |
1755 |
1756 | groups: str or array-like, with shape (n_samples,), default = None
1757 | Optional group labels when GroupKFold is used for the cross validation.
1758 | It takes an array with shape (n_samples, ) where n_samples is the number
1759 | of rows in the training dataset. When a string is passed, it is interpreted
1760 | as the name of the column in the dataset that contains the group labels.
1761 |
1762 |
1763 | verbose: bool, default = True
1764 | Score grid is not printed when verbose is set to False.
1765 |
1766 |
1767 | Returns:
1768 | Trained Model
1769 |
1770 |
1771 | Warnings
1772 | --------
1773 | - Avoid isotonic calibration with too few calibration samples (< 1000) since it
1774 | tends to overfit.
1775 |
1776 | """
1777 |
1778 | return pycaret.internal.tabular.calibrate_model(
1779 | estimator=estimator,
1780 | method=method,
1781 | fold=fold,
1782 | round=round,
1783 | fit_kwargs=fit_kwargs,
1784 | groups=groups,
1785 | verbose=verbose,
1786 | )
1787 |
1788 |
1789 | def optimize_threshold(
1790 | estimator,
1791 | true_positive: int = 0,
1792 | true_negative: int = 0,
1793 | false_positive: int = 0,
1794 | false_negative: int = 0,
1795 | ):
1796 |
1797 | """
1798 | This function optimizes the probability threshold for a given estimator using a
1799 | custom cost function. The function displays a plot of the optimized cost as a
1800 | function of probability thresholds between 0.0 and 1.0 and returns the
1801 | optimized threshold value as a numpy float.
1802 |
1803 |
1804 | Example
1805 | -------
1806 | >>> from PyRapidML.datasets import extract_data
1807 | >>> juice = extract_data('juice')
1808 | >>> from PyRapidML.classification import *
1809 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1810 | >>> lr = creating_model('lr')
1811 | >>> optimize_threshold(lr, true_negative = 10, false_negative = -100)
1812 |
1813 |
1814 | estimator: scikit-learn compatible object
1815 | Trained model object
1816 |
1817 |
1818 | true_positive: int, default = 0
1819 | Cost function or returns for true positive.
1820 |
1821 |
1822 | true_negative: int, default = 0
1823 | Cost function or returns for true negative.
1824 |
1825 |
1826 | false_positive: int, default = 0
1827 | Cost function or returns for false positive.
1828 |
1829 |
1830 | false_negative: int, default = 0
1831 | Cost function or returns for false negative.
1832 |
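A minimal sketch of feeding the optimized threshold back into prediction,
assuming the ``lr`` model from the example above:

>>> best_t = optimize_threshold(lr, true_negative = 10, false_negative = -100)
>>> preds = predict_model(lr, probability_threshold = float(best_t))  # float() cast is defensive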
1833 |
1834 | Returns:
1835 | numpy.float64
1836 |
1837 |
1838 | Warnings
1839 | --------
1840 | - This function is not supported when target is multiclass.
1841 |
1842 | """
1843 |
1844 | return pycaret.internal.tabular.optimize_threshold(
1845 | estimator=estimator,
1846 | true_positive=true_positive,
1847 | true_negative=true_negative,
1848 | false_positive=false_positive,
1849 | false_negative=false_negative,
1850 | )
1851 |
1852 |
1853 | def predict_model(
1854 | estimator,
1855 | data: Optional[pd.DataFrame] = None,
1856 | probability_threshold: Optional[float] = None,
1857 | encoded_labels: bool = False,
1858 | raw_score: bool = False,
1859 | round: int = 4,
1860 | verbose: bool = True,
1861 | ) -> pd.DataFrame:
1862 |
1863 | """
1864 | This function predicts ``Label`` and ``Score`` (probability of predicted
1865 | class) using a trained model. When ``data`` is None, it predicts label and
1866 | score on the holdout set.
1867 |
1868 |
1869 | Example
1870 | -------
1871 | >>> from PyRapidML.datasets import extract_data
1872 | >>> juice = extract_data('juice')
1873 | >>> from PyRapidML.classification import *
1874 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1875 | >>> lr = creating_model('lr')
1876 | >>> pred_holdout = predict_model(lr)
1877 | >>> pred_unseen = predict_model(lr, data = unseen_dataframe)
1878 |
1879 |
1880 | estimator: scikit-learn compatible object
1881 | Trained model object
1882 |
1883 |
1884 | data: pandas.DataFrame, default = None
1885 | Shape (n_samples, n_features). All features used during training
1886 | must be available in the unseen dataset. When None, the holdout set is used.
1887 |
1888 |
1889 | probability_threshold: float, default = None
1890 | Threshold for converting predicted probability to class label.
1891 | It defaults to 0.5 for all classifiers unless explicitly defined
1892 | in this parameter.
1893 |
1894 |
1895 | encoded_labels: bool, default = False
1896 | When set to True, will return labels encoded as an integer.
1897 |
1898 |
1899 | raw_score: bool, default = False
1900 | When set to True, scores for all labels will be returned.
1901 |
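A minimal sketch combining a custom threshold with per-class scores, assuming
the ``lr`` model and ``unseen_dataframe`` from the example above (the 0.25
threshold is illustrative):

>>> pred = predict_model(lr, data = unseen_dataframe, probability_threshold = 0.25, raw_score = True)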
1902 |
1903 | round: int, default = 4
1904 | Number of decimal places the metrics in the score grid will be rounded to.
1905 |
1906 |
1907 | verbose: bool, default = True
1908 | When set to False, holdout score grid is not printed.
1909 |
1910 |
1911 | Returns:
1912 | pandas.DataFrame
1913 |
1914 |
1915 | Warnings
1916 | --------
1917 | - The behavior of ``predict_model`` changed in version 2.1 without backward
1918 |   compatibility. As such, pipelines trained with version <= 2.0 may not
1919 |   work for inference with version >= 2.1. You can either retrain your models with a
1920 | newer version or downgrade the version for inference.
1921 |
1922 | """
1923 |
1924 | return pycaret.internal.tabular.predict_model(
1925 | estimator=estimator,
1926 | data=data,
1927 | probability_threshold=probability_threshold,
1928 | encoded_labels=encoded_labels,
1929 | raw_score=raw_score,
1930 | round=round,
1931 | verbose=verbose,
1932 | ml_usecase=MLUsecase.CLASSIFICATION,
1933 | )
1934 |
1935 |
1936 | def finalize_model(
1937 | estimator,
1938 | fit_kwargs: Optional[dict] = None,
1939 | groups: Optional[Union[str, Any]] = None,
1940 | model_only: bool = True,
1941 | ) -> Any:
1942 |
1943 | """
1944 | This function trains a given estimator on the entire dataset including the
1945 | holdout set.
1946 |
1947 |
1948 | Example
1949 | -------
1950 | >>> from PyRapidML.datasets import extract_data
1951 | >>> juice = extract_data('juice')
1952 | >>> from PyRapidML.classification import *
1953 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1954 | >>> lr = creating_model('lr')
1955 | >>> final_lr = finalize_model(lr)
1956 |
1957 |
1958 | estimator: scikit-learn compatible object
1959 | Trained model object
1960 |
1961 |
1962 | fit_kwargs: dict, default = {} (empty dict)
1963 | Dictionary of arguments passed to the fit method of the model.
1964 |
1965 |
1966 | groups: str or array-like, with shape (n_samples,), default = None
1967 | Optional group labels when GroupKFold is used for the cross validation.
1968 | It takes an array with shape (n_samples, ) where n_samples is the number
1969 | of rows in the training dataset. When a string is passed, it is interpreted
1970 | as the name of the column in the dataset that contains the group labels.
1971 |
1972 |
1973 | model_only: bool, default = True
1974 | When set to True, only the model object is re-trained on the entire dataset and
1975 | the transformations in the pipeline are ignored. When False, the full pipeline is re-trained and returned.
1976 |
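A minimal sketch of keeping the full pipeline at finalization, assuming the
``lr`` model from the example above:

>>> final_pipeline = finalize_model(lr, model_only = False)  # retains the preprocessing steps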
1977 |
1978 | Returns:
1979 | Trained Model
1980 |
1981 | """
1982 |
1983 | return pycaret.internal.tabular.finalize_model(
1984 | estimator=estimator,
1985 | fit_kwargs=fit_kwargs,
1986 | groups=groups,
1987 | model_only=model_only,
1988 | )
1989 |
1990 |
1991 | def deploy_model(
1992 | model, model_name: str, authentication: dict, platform: str = "aws",
1993 | ):
1994 |
1995 | """
1996 | This function deploys the transformation pipeline and trained model on cloud.
1997 |
1998 |
1999 | Example
2000 | -------
2001 | >>> from PyRapidML.datasets import extract_data
2002 | >>> juice = extract_data('juice')
2003 | >>> from PyRapidML.classification import *
2004 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2005 | >>> lr = creating_model('lr')
2006 | >>> deploy_model(model = lr, model_name = 'lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})
2007 |
2008 |
2009 | Amazon Web Service (AWS) users:
2010 | To deploy a model on AWS S3 ('aws'), environment variables must be set in your
2011 | local environment. To configure AWS environment variables, type ``aws configure``
2012 | in the command line. The following information from the IAM portal of your
2013 | Amazon console account is required:
2014 |
2015 | - AWS Access Key ID
2016 | - AWS Secret Access Key
2017 | - Default Region Name (can be seen under Global settings on your AWS console)
2018 |
2019 | More info: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html
2020 |
2021 |
2022 | Google Cloud Platform (GCP) users:
2023 | To deploy a model on Google Cloud Platform ('gcp'), a project must be created
2024 | using the command line or the GCP console. Once the project is created, you must create
2025 | a service account and download the service account key as a JSON file to set
2026 | environment variables in your local environment.
2027 |
2028 | More info: https://cloud.google.com/docs/authentication/production
2029 |
2030 |
2031 | Microsoft Azure (Azure) users:
2032 | To deploy a model on Microsoft Azure ('azure'), the environment variable for the
2033 | connection string must be set in your local environment. Go to the settings of the
2034 | storage account on the Azure portal to access the required connection string.
2035 |
2036 | More info: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?toc=%2Fpython%2Fazure%2FTOC.json
2037 |
2038 |
2039 | model: scikit-learn compatible object
2040 | Trained model object
2041 |
2042 |
2043 | model_name: str
2044 | Name of model.
2045 |
2046 |
2047 | authentication: dict
2048 | Dictionary of applicable authentication tokens.
2049 |
2050 | When platform = 'aws':
2051 | {'bucket' : 'S3-bucket-name'}
2052 |
2053 | When platform = 'gcp':
2054 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
2055 |
2056 | When platform = 'azure':
2057 | {'container': 'azure-container-name'}
2058 |
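Minimal sketches for the other platforms, assuming the ``lr`` model from the
example above (bucket and container names are placeholders):

>>> deploy_model(lr, 'lr-gcp', platform = 'gcp', authentication = {'project': 'gcp-project-name', 'bucket': 'gcp-bucket-name'})
>>> deploy_model(lr, 'lr-azure', platform = 'azure', authentication = {'container': 'azure-container-name'})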
2059 |
2060 | platform: str, default = 'aws'
2061 | Name of the cloud platform. Currently supported platforms: 'aws', 'gcp' and 'azure'.
2062 |
2063 |
2064 | Returns:
2065 | None
2066 |
2067 | """
2068 |
2069 | return pycaret.internal.tabular.deploy_model(
2070 | model=model,
2071 | model_name=model_name,
2072 | authentication=authentication,
2073 | platform=platform,
2074 | )
2075 |
2076 |
2077 | def save_model(
2078 | model, model_name: str, model_only: bool = False, verbose: bool = True, **kwargs
2079 | ):
2080 |
2081 | """
2082 | This function saves the transformation pipeline and trained model object
2083 | into the current working directory as a pickle file for later use.
2084 |
2085 | Example
2086 | -------
2087 | >>> from PyRapidML.datasets import extract_data
2088 | >>> juice = extract_data('juice')
2089 | >>> from PyRapidML.classification import *
2090 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2091 | >>> lr = creating_model('lr')
2092 | >>> save_model(lr, 'saved_lr_model')
2093 |
2094 |
2095 | model: scikit-learn compatible object
2096 | Trained model object
2097 |
2098 |
2099 | model_name: str
2100 | Name of the model.
2101 |
2102 |
2103 | model_only: bool, default = False
2104 | When set to True, only trained model object is saved instead of the
2105 | entire pipeline.
2106 |
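A minimal sketch of saving only the bare estimator, assuming the ``lr`` model
from the example above:

>>> model, path = save_model(lr, 'bare_lr', model_only = True)  # returns (model, filename)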
2107 |
2108 | verbose: bool, default = True
2109 | Success message is not printed when verbose is set to False.
2110 |
2111 |
2112 | **kwargs**:
2113 | Additional keyword arguments to pass to joblib.dump().
2114 |
2115 |
2116 | Returns:
2117 | Tuple of the model object and the filename.
2118 |
2119 | """
2120 |
2121 | return pycaret.internal.tabular.save_model(
2122 | model=model,
2123 | model_name=model_name,
2124 | model_only=model_only,
2125 | verbose=verbose,
2126 | **kwargs,
2127 | )
2128 |
2129 |
2130 | def load_model(
2131 | model_name,
2132 | platform: Optional[str] = None,
2133 | authentication: Optional[Dict[str, str]] = None,
2134 | verbose: bool = True,
2135 | ):
2136 |
2137 | """
2138 | This function loads a previously saved pipeline.
2139 |
2140 |
2141 | Example
2142 | -------
2143 | >>> from PyRapidML.classification import load_model
2144 | >>> saved_lr = load_model('saved_lr_model')
2145 |
2146 |
2147 | model_name: str
2148 | Name of the model.
2149 |
2150 |
2151 | platform: str, default = None
2152 | Name of the cloud platform. Currently supported platforms:
2153 | 'aws', 'gcp' and 'azure'.
2154 |
2155 |
2156 | authentication: dict, default = None
2157 | Dictionary of applicable authentication tokens.
2158 |
2159 | When platform = 'aws':
2160 | {'bucket' : 'S3-bucket-name'}
2161 |
2162 | When platform = 'gcp':
2163 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
2164 |
2165 | When platform = 'azure':
2166 | {'container': 'azure-container-name'}
2167 |
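A minimal sketch of loading directly from cloud storage (the bucket name is a
placeholder for a model previously saved with ``deploy_model``):

>>> saved_lr = load_model('lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})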
2168 |
2169 | verbose: bool, default = True
2170 | Success message is not printed when verbose is set to False.
2171 |
2172 |
2173 | Returns:
2174 | Trained Model
2175 |
2176 | """
2177 |
2178 | return pycaret.internal.tabular.load_model(
2179 | model_name=model_name,
2180 | platform=platform,
2181 | authentication=authentication,
2182 | verbose=verbose,
2183 | )
2184 |
2185 |
2186 | def automl(optimize: str = "Accuracy", use_holdout: bool = False) -> Any:
2187 |
2188 | """
2189 | This function returns the best model out of all trained models in
2190 | the current session based on the ``optimize`` parameter. Metrics
2191 | evaluated can be accessed using the ``get_metrics`` function.
2192 |
2193 |
2194 | Example
2195 | -------
2196 | >>> from PyRapidML.datasets import extract_data
2197 | >>> juice = extract_data('juice')
2198 | >>> from PyRapidML.classification import *
2199 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2200 | >>> top3 = comparing_models(n_select = 3)
2201 | >>> tuned_top3 = [tuning_model(i) for i in top3]
2202 | >>> blender = blend_models(tuned_top3)
2203 | >>> stacker = stack_models(tuned_top3)
2204 | >>> best_auc_model = automl(optimize = 'AUC')
2205 |
2206 |
2207 | optimize: str, default = 'Accuracy'
2208 | Metric to use for model selection. It also accepts custom metrics
2209 | added using the ``add_metric`` function.
2210 |
2211 |
2212 | use_holdout: bool, default = False
2213 | When set to True, metrics are evaluated on holdout set instead of CV.
2214 |
2215 |
2216 | Returns:
2217 | Trained Model
2218 |
2219 | """
2220 | return pycaret.internal.tabular.automl(optimize=optimize, use_holdout=use_holdout)
2221 |
2222 |
2223 | def pull(pop: bool = False) -> pd.DataFrame:
2224 |
2225 | """
2226 | Returns the last printed score grid. Use the ``pull`` function after
2227 | any training function to store the score grid in a pandas.DataFrame.
2228 |
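Example
-------
A minimal sketch, assuming a session initialized as in the other examples:

>>> best = comparing_models()
>>> results = pull()  # score grid of the last comparison as a pandas.DataFrame
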
2229 |
2230 | pop: bool, default = False
2231 | If True, will pop (remove) the returned dataframe from the
2232 | display container.
2233 |
2234 |
2235 | Returns:
2236 | pandas.DataFrame
2237 |
2238 | """
2239 | return pycaret.internal.tabular.pull(pop=pop)
2240 |
2241 |
2242 | def models(
2243 | type: Optional[str] = None, internal: bool = False, raise_errors: bool = True,
2244 | ) -> pd.DataFrame:
2245 |
2246 | """
2247 | Returns a table of models available in the model library.
2248 |
2249 | Example
2250 | -------
2251 | >>> from PyRapidML.datasets import extract_data
2252 | >>> juice = extract_data('juice')
2253 | >>> from PyRapidML.classification import *
2254 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2255 | >>> all_models = models()
2256 |
2257 |
2258 | type: str, default = None
2259 | - linear : filters and only returns linear models
2260 | - tree : filters and only returns tree based models
2261 | - ensemble : filters and only returns ensemble models
2262 |
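For instance, a minimal sketch of filtering to tree based models only:

>>> tree_models = models(type = 'tree')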
2263 |
2264 | internal: bool, default = False
2265 | When True, will return extra columns and rows used internally.
2266 |
2267 |
2268 | raise_errors: bool, default = True
2269 | When False, will suppress all exceptions, ignoring models
2270 | that couldn't be created.
2271 |
2272 |
2273 | Returns:
2274 | pandas.DataFrame
2275 |
2276 | """
2277 | return pycaret.internal.tabular.models(
2278 | type=type, internal=internal, raise_errors=raise_errors
2279 | )
2280 |
2281 |
2282 | def get_metrics(
2283 | reset: bool = False, include_custom: bool = True, raise_errors: bool = True,
2284 | ) -> pd.DataFrame:
2285 |
2286 | """
2287 | Returns a table of available metrics used for CV.
2288 |
2289 |
2290 | Example
2291 | -------
2292 | >>> from PyRapidML.datasets import extract_data
2293 | >>> juice = extract_data('juice')
2294 | >>> from PyRapidML.classification import *
2295 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2296 | >>> all_metrics = get_metrics()
2297 |
2298 |
2299 | reset: bool, default = False
2300 | When True, will reset all changes made using the ``add_metric``
2301 | and ``remove_metric`` functions.
2302 |
2303 |
2304 | include_custom: bool, default = True
2305 | Whether to include user added (custom) metrics or not.
2306 |
2307 |
2308 | raise_errors: bool, default = True
2309 | If False, will suppress all exceptions, ignoring models that
2310 | couldn't be created.
2311 |
2312 |
2313 | Returns:
2314 | pandas.DataFrame
2315 |
2316 | """
2317 |
2318 | return pycaret.internal.tabular.get_metrics(
2319 | reset=reset, include_custom=include_custom, raise_errors=raise_errors,
2320 | )
2321 |
2322 |
2323 | def add_metric(
2324 | id: str,
2325 | name: str,
2326 | score_func: type,
2327 | target: str = "pred",
2328 | greater_is_better: bool = True,
2329 | multiclass: bool = True,
2330 | **kwargs,
2331 | ) -> pd.Series:
2332 |
2333 | """
2334 | Adds a custom metric to be used for CV.
2335 |
2336 |
2337 | Example
2338 | -------
2339 | >>> from PyRapidML.datasets import extract_data
2340 | >>> juice = extract_data('juice')
2341 | >>> from PyRapidML.classification import *
2342 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2343 | >>> from sklearn.metrics import log_loss
2344 | >>> add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
2345 |
2346 |
2347 | id: str
2348 | Unique id for the metric.
2349 |
2350 |
2351 | name: str
2352 | Display name of the metric.
2353 |
2354 |
2355 | score_func: type
2356 | Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``.
2357 |
2358 |
2359 | target: str, default = 'pred'
2360 | The target of the score function.
2361 |
2362 | - 'pred' for the prediction table
2363 | - 'pred_proba' for pred_proba
2364 | - 'threshold' for decision_function or predict_proba
2365 |
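A minimal sketch of a probability-based metric; ``log_loss`` consumes predicted
probabilities, hence target = 'pred_proba' (the id 'logloss2' is arbitrary):

>>> from sklearn.metrics import log_loss
>>> add_metric('logloss2', 'Log Loss 2', log_loss, target = 'pred_proba', greater_is_better = False)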
2366 |
2367 | greater_is_better: bool, default = True
2368 | Whether a higher ``score_func`` value indicates a better result.
2369 |
2370 |
2371 | multiclass: bool, default = True
2372 | Whether the metric supports multiclass target.
2373 |
2374 |
2375 | **kwargs**:
2376 | Arguments to be passed to score function.
2377 |
2378 |
2379 | Returns:
2380 | pandas.Series
2381 |
2382 | """
2383 |
2384 | return pycaret.internal.tabular.add_metric(
2385 | id=id,
2386 | name=name,
2387 | score_func=score_func,
2388 | target=target,
2389 | greater_is_better=greater_is_better,
2390 | multiclass=multiclass,
2391 | **kwargs,
2392 | )
2393 |
2394 |
2395 | def remove_metric(name_or_id: str):
2396 |
2397 | """
2398 | Removes a metric from CV.
2399 |
2400 |
2401 | Example
2402 | -------
2403 | >>> from PyRapidML.datasets import extract_data
2404 | >>> juice = extract_data('juice')
2405 | >>> from PyRapidML.classification import *
2406 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2407 | >>> remove_metric('MCC')
2408 |
2409 |
2410 | name_or_id: str
2411 | Display name or ID of the metric.
2412 |
2413 |
2414 | Returns:
2415 | None
2416 |
2417 | """
2418 | return pycaret.internal.tabular.remove_metric(name_or_id=name_or_id)
2419 |
2420 |
2421 | def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame:
2422 |
2423 | """
2424 | Returns a table of experiment logs. Only works when ``log_experiment``
2425 | is set to True when initializing the ``setup`` function.
2426 |
2427 |
2428 | Example
2429 | -------
2430 | >>> from PyRapidML.datasets import extract_data
2431 | >>> juice = extract_data('juice')
2432 | >>> from PyRapidML.classification import *
2433 | >>> exp_name = initializer(data = juice, target = 'Purchase', log_experiment = True)
2434 | >>> best = comparing_models()
2435 | >>> exp_logs = get_logs()
2436 |
2437 |
2438 | experiment_name: str, default = None
2439 | When None, the current active run is used.
2440 |
2441 |
2442 | save: bool, default = False
2443 | When set to True, a csv file is saved in the current working directory.
2444 |
2445 |
2446 | Returns:
2447 | pandas.DataFrame
2448 |
2449 | """
2450 |
2451 | return pycaret.internal.tabular.get_logs(experiment_name=experiment_name, save=save)
2452 |
2453 |
2454 | def get_config(variable: str):
2455 |
2456 | """
2457 | This function retrieves the global variables created when initializing the
2458 | ``setup`` function. The following variables are accessible:
2459 |
2460 | - X: Transformed dataset (X)
2461 | - y: Transformed dataset (y)
2462 | - X_train: Transformed train dataset (X)
2463 | - X_test: Transformed test/holdout dataset (X)
2464 | - y_train: Transformed train dataset (y)
2465 | - y_test: Transformed test/holdout dataset (y)
2466 | - seed: random state set through session_id
2467 | - prep_pipe: Transformation pipeline
2468 | - fold_shuffle_param: shuffle parameter used in Kfolds
2469 | - n_jobs_param: n_jobs parameter used in model training
2470 | - html_param: html_param configured through setup
2471 | - create_model_container: results grid storage container
2472 | - master_model_container: model storage container
2473 | - display_container: results display container
2474 | - exp_name_log: Name of experiment
2475 | - logging_param: log_experiment param
2476 | - log_plots_param: log_plots param
2477 | - USI: Unique session ID parameter
2478 | - fix_imbalance_param: fix_imbalance param
2479 | - fix_imbalance_method_param: fix_imbalance_method param
2480 | - data_before_preprocess: data before preprocessing
2481 | - target_param: name of target variable
2482 | - gpu_param: use_gpu param configured through setup
2483 | - fold_generator: CV splitter configured in fold_strategy
2484 | - fold_param: fold params defined in the setup
2485 | - fold_groups_param: fold groups defined in the setup
2486 | - stratify_param: stratify parameter defined in the setup
2487 |
2488 |
2489 | Example
2490 | -------
2491 | >>> from PyRapidML.datasets import extract_data
2492 | >>> juice = extract_data('juice')
2493 | >>> from PyRapidML.classification import *
2494 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2495 | >>> X_train = get_config('X_train')
2496 |
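Similarly, a minimal sketch of retrieving the fitted preprocessing pipeline
(``prep_pipe`` is listed above):

>>> prep_pipe = get_config('prep_pipe')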
2497 |
2498 | Returns:
2499 | Global variable
2500 |
2501 | """
2502 |
2503 | return pycaret.internal.tabular.get_config(variable=variable)
2504 |
2505 |
2506 | def set_config(variable: str, value):
2507 |
2508 | """
2509 | This function resets the global variables. The following variables are
2510 | accessible:
2511 |
2512 | - X: Transformed dataset (X)
2513 | - y: Transformed dataset (y)
2514 | - X_train: Transformed train dataset (X)
2515 | - X_test: Transformed test/holdout dataset (X)
2516 | - y_train: Transformed train dataset (y)
2517 | - y_test: Transformed test/holdout dataset (y)
2518 | - seed: random state set through session_id
2519 | - prep_pipe: Transformation pipeline
2520 | - fold_shuffle_param: shuffle parameter used in Kfolds
2521 | - n_jobs_param: n_jobs parameter used in model training
2522 | - html_param: html_param configured through setup
2523 | - create_model_container: results grid storage container
2524 | - master_model_container: model storage container
2525 | - display_container: results display container
2526 | - exp_name_log: Name of experiment
2527 | - logging_param: log_experiment param
2528 | - log_plots_param: log_plots param
2529 | - USI: Unique session ID parameter
2530 | - fix_imbalance_param: fix_imbalance param
2531 | - fix_imbalance_method_param: fix_imbalance_method param
2532 | - data_before_preprocess: data before preprocessing
2533 | - target_param: name of target variable
2534 | - gpu_param: use_gpu param configured through setup
2535 | - fold_generator: CV splitter configured in fold_strategy
2536 | - fold_param: fold params defined in the setup
2537 | - fold_groups_param: fold groups defined in the setup
2538 | - stratify_param: stratify parameter defined in the setup
2539 |
2540 | Example
2541 | -------
2542 | >>> from PyRapidML.datasets import extract_data
2543 | >>> juice = extract_data('juice')
2544 | >>> from PyRapidML.classification import *
2545 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2546 | >>> set_config('seed', 123)
2547 |
2548 |
2549 | Returns:
2550 | None
2551 |
2552 | """
2553 |
2554 | return pycaret.internal.tabular.set_config(variable=variable, value=value)
2555 |
2556 |
2557 | def save_config(file_name: str):
2558 |
2559 | """
2560 | This function saves all global variables to a pickle file, allowing you to
2561 | resume later without rerunning the ``setup``.
2562 |
2563 |
2564 | Example
2565 | -------
2566 | >>> from PyRapidML.datasets import extract_data
2567 | >>> juice = extract_data('juice')
2568 | >>> from PyRapidML.classification import *
2569 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2570 | >>> save_config('myvars.pkl')
2571 |
2572 |
2573 | Returns:
2574 | None
2575 |
2576 | """
2577 |
2578 | return pycaret.internal.tabular.save_config(file_name=file_name)
2579 |
2580 |
2581 | def load_config(file_name: str):
2582 |
2583 | """
2584 | This function loads global variables from a pickle file into the Python
2585 | environment.
2586 |
2587 |
2588 | Example
2589 | -------
2590 | >>> from PyRapidML.classification import load_config
2591 | >>> load_config('myvars.pkl')
2592 |
2593 |
2594 | Returns:
2595 | Global variables
2596 |
2597 | """
2598 |
2599 | return pycaret.internal.tabular.load_config(file_name=file_name)
2600 |
--------------------------------------------------------------------------------