├── .idea
│   ├── .name
│   ├── .gitignore
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── PyRapidML.iml
├── PyRapidML
│   ├── __init__.py
│   ├── .DS_Store
│   ├── datasets.py
│   ├── eda.py
│   ├── utils.py
│   ├── regression.py
│   └── classification.py
├── .DS_Store
├── docs
│   ├── .DS_Store
│   ├── source
│   │   ├── .DS_Store
│   │   ├── api
│   │   │   ├── .DS_Store
│   │   │   ├── datasets.rst
│   │   │   ├── regression.rst
│   │   │   ├── classification.rst
│   │   │   ├── eda.rst
│   │   │   └── natural_language_processing.rst
│   │   ├── _static
│   │   │   ├── .DS_Store
│   │   │   └── css
│   │   │       └── custom.css
│   │   ├── requirements.txt
│   │   ├── index.rst
│   │   └── conf.py
│   ├── Makefile
│   └── make.bat
├── tests
│   └── .DS_Store
├── Tutorials
│   ├── .DS_Store
│   ├── Regression
│   │   ├── .DS_Store
│   │   └── Final ET Model 30May2021.pkl
│   └── Classification
│       ├── .DS_Store
│       └── Final RF Model 11Nov2020.pkl
├── .readthedocs.yaml.swp
├── .readthedocs.yaml
├── setup.py
├── LICENSE
├── .gitignore
└── README.md
/.idea/.name:
--------------------------------------------------------------------------------
1 | index.rst
--------------------------------------------------------------------------------
/PyRapidML/__init__.py:
--------------------------------------------------------------------------------
1 | from PyRapidML.utils import __version__
2 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/.DS_Store
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/docs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/.DS_Store
--------------------------------------------------------------------------------
/tests/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/tests/.DS_Store
--------------------------------------------------------------------------------
/PyRapidML/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/PyRapidML/.DS_Store
--------------------------------------------------------------------------------
/Tutorials/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/.DS_Store
--------------------------------------------------------------------------------
/.readthedocs.yaml.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/.readthedocs.yaml.swp
--------------------------------------------------------------------------------
/docs/source/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/.DS_Store
--------------------------------------------------------------------------------
/docs/source/api/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/api/.DS_Store
--------------------------------------------------------------------------------
/docs/source/_static/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/_static/.DS_Store
--------------------------------------------------------------------------------
/Tutorials/Regression/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Regression/.DS_Store
--------------------------------------------------------------------------------
/docs/source/api/datasets.rst:
--------------------------------------------------------------------------------
1 | Datasets
2 | ===================
3 | .. automodule:: PyRapidML.datasets
4 | :members:
--------------------------------------------------------------------------------
/Tutorials/Classification/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Classification/.DS_Store
--------------------------------------------------------------------------------
/docs/source/api/regression.rst:
--------------------------------------------------------------------------------
1 | Regression
2 | ===================
3 | .. automodule:: PyRapidML.regression
4 | :members:
--------------------------------------------------------------------------------
/docs/source/api/classification.rst:
--------------------------------------------------------------------------------
1 | Classification
2 | ===================
3 | .. automodule:: PyRapidML.classification
4 | :members:
--------------------------------------------------------------------------------
/docs/source/api/eda.rst:
--------------------------------------------------------------------------------
1 | Exploratory Data Analysis
2 | =====================================
3 | .. automodule:: PyRapidML.eda
4 | :members:
--------------------------------------------------------------------------------
/Tutorials/Regression/Final ET Model 30May2021.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Regression/Final ET Model 30May2021.pkl
--------------------------------------------------------------------------------
/docs/source/api/natural_language_processing.rst:
--------------------------------------------------------------------------------
1 | NLP
2 | ===================
3 | .. automodule:: PyRapidML.natural_language_processing
4 | :members:
--------------------------------------------------------------------------------
/Tutorials/Classification/Final RF Model 11Nov2020.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Classification/Final RF Model 11Nov2020.pkl
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/PyRapidML.iml" filepath="$PROJECT_DIR$/.idea/PyRapidML.iml" />
6 |     </modules>
7 |   </component>
8 | </project>
--------------------------------------------------------------------------------
/.idea/PyRapidML.iml:
--------------------------------------------------------------------------------
1 | <!-- module configuration not recoverable: the XML markup was stripped from this dump -->
--------------------------------------------------------------------------------
/docs/source/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=3.0.0
2 | sphinx-rtd-theme>=0.5.0
3 | pandas
4 | scipy<=1.5.4
5 | numpy==1.19.5
6 | seaborn
7 | matplotlib
8 | IPython
9 | joblib
10 | scikit-learn==0.23.2
11 | ipywidgets
12 | yellowbrick>=1.0.1
13 | lightgbm>=2.3.1
14 | plotly>=4.4.1
15 | wordcloud
16 | textblob
17 | cufflinks>=0.17.0
18 | umap-learn
19 | pyLDAvis
20 | gensim<4.0.0
21 | spacy<2.4.0
22 | nltk
23 | mlxtend>=0.17.0
24 | pyod
25 | pandas-profiling>=2.8.0
26 | kmodes>=0.10.1
27 | mlflow
28 | imbalanced-learn==0.7.0
29 | scikit-plot #for lift and gain charts
30 | Boruta
31 | pycaret
32 | typing
33 |
34 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/source/conf.py
11 |
12 | # Optionally build your docs in additional formats such as PDF
13 | formats:
14 | - pdf
15 |
16 | # Optionally set the version of Python and requirements required to build your docs
17 | python:
18 | version: 3.7
19 | install:
20 | - requirements: docs/source/requirements.txt
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r") as f:
4 | long_description = f.read()
5 |
6 | setup(
7 | name="PyRapidML",
8 | version="1.0.13",
9 | author="Zain Ali",
10 | author_email="zainbalouch3@gmail.com",
11 | description="An open source and low code machine learning library for quick and robust analysis",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/Zainali5/PyRapidML",
15 | packages=find_packages(),
16 | classifiers=[
17 | "Programming Language :: Python :: 3",
18 | "License :: OSI Approved :: MIT License",
19 | "Operating System :: OS Independent",
20 | ]
21 | )
22 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Zainali5
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/docs/source/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | .rst-content dl:not(.docutils) dt:first-child {
2 | margin-top: 0;
3 | }
4 |
5 | .rst-content dl:not(.docutils) dl dt {
6 | margin-bottom: 4px;
7 | border: none;
8 | border-left: solid 3px #ccc;
9 | background: #f0f0f0;
10 | color: #555;
11 | }
12 |
13 | .rst-content dl table,
14 | .rst-content dl ul,
15 | .rst-content dl ol,
16 | .rst-content dl p {
17 | margin-bottom: 8px !important;
18 | }
19 |
20 | .rst-content dl:not(.docutils) dt {
21 | display: table;
22 | margin: 6px 0;
23 | font-size: 90%;
24 | line-height: normal;
25 | background: #e7f2fa;
26 | color: #2980b9;
27 | border-top: solid 3px #6ab0de;
28 | padding: 6px;
29 | position: relative;
30 | }
31 |
32 | html.writer-html5 .rst-content dl.field-list {
33 | display: initial;
34 | }
35 |
36 | html.writer-html5 .rst-content dl.field-list > dd,
37 | html.writer-html5 .rst-content dl.field-list > dt {
38 | margin-bottom: 4px;
39 | padding-left: 6px;
40 | }
41 |
42 | p {
43 | line-height: 20px;
44 | font-size: 14px;
45 | }
46 |
47 | html.writer-html5 .rst-content dl.field-list > dt:after {
48 | content: initial;
49 | }
50 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. PyRapidML documentation master file, created by
2 | sphinx-quickstart on Tue Jun 8 22:08:54 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | PyRapidML Homepage
7 | =====================================
8 |
9 | PyRapidML is an open source Python library which not only helps in automating machine learning workflows but also helps in building end-to-end ML solutions.
10 | PyRapidML is essentially a Python wrapper around several machine learning libraries and frameworks such as PyCaret, scikit-learn, XGBoost, LightGBM, CatBoost, spaCy, Optuna, Hyperopt, Ray, and many more.
11 |
12 | PyRapidML is a low-code library: with only a few basic lines of code, one can achieve high accuracy in machine learning models.
13 | There is no need to write hefty lines of code, as PyRapidML compares all applicable machine learning algorithms for your problem in a single line of code.
14 | Once PyRapidML identifies the best algorithm, you can tune the model further in just one more line of code.
15 |
16 | - Are you tired of writing hefty lines of code for your data science problems?
17 | - Are you having difficulty figuring out which algorithm performs best?
18 | - Is it hard for you to compare multiple algorithms and see which one has the best accuracy?
19 | - Do you face issues in hyperparameter tuning?
20 | - Do you want easy model deployments?
21 | - Do you dream of AutoML?
22 | - Are you facing problems in exploratory data analysis?
23 | - Do you want a library that can automatically perform all steps of the data science lifecycle?
24 | - Do you want a library that can do exploratory data analysis, feature engineering, feature selection, compare multiple machine learning algorithms, tune hyperparameters, and deploy models?
25 |
26 | If the answer is yes to the above questions, then PyRapidML is the library for you.
27 |
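A minimal example (a sketch only: the dataset name ``boston`` and target ``medv`` come from the API docstrings, and ``extract_data`` requires an internet connection)::

    >>> from PyRapidML.datasets import extract_data
    >>> from PyRapidML.regression import initializer
    >>> boston = extract_data('boston')
    >>> exp = initializer(data = boston, target = 'medv')
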
28 |
29 | .. toctree::
30 | :maxdepth: 2
31 | :hidden:
32 | :caption: Getting Started
33 |
34 | self
35 |
36 | .. toctree::
37 | :maxdepth: 2
38 | :hidden:
39 | :caption: Documentation
40 |
41 | api/classification
42 | api/regression
43 | api/natural_language_processing
44 | api/datasets
45 | api/eda
--------------------------------------------------------------------------------
/PyRapidML/datasets.py:
--------------------------------------------------------------------------------
1 | # Module: Datasets
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 30/05/2021
6 |
7 |
8 | def extract_data(dataset="index", save_copy=False, profile=False, verbose=True):
9 |
10 | """
11 | This function loads sample datasets from a git repository. The list of available
12 | datasets can be checked using ``extract_data('index')``.
13 |
14 |
15 | Example
16 | -------
17 | >>> from PyRapidML.datasets import extract_data
18 | >>> all_datasets = extract_data('index')
19 | >>> juice = extract_data('juice')
20 |
21 |
22 | dataset: str, default = 'index'
23 | Index value of dataset.
24 |
25 |
26 | save_copy: bool, default = False
27 | When set to true, it saves a copy in current working directory.
28 |
29 |
30 | profile: bool, default = False
31 | When set to true, an interactive EDA report is displayed.
32 |
33 |
34 | verbose: bool, default = True
35 | When set to False, head of data is not displayed.
36 |
37 |
38 | Returns:
39 | pandas.DataFrame
40 |
41 |
42 | Warnings
43 | --------
44 | - Use of ``extract_data`` requires internet connection.
45 |
46 | """
47 |
48 | import pandas as pd
49 | import os.path
50 | from IPython.display import display, HTML, clear_output, update_display
51 |
52 | address = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/"
53 | extension = ".csv"
54 | filename = str(dataset) + extension
55 |
56 | complete_address = address + filename
57 |
58 | if os.path.isfile(filename):
59 | data = pd.read_csv(filename)
60 | else:
61 | data = pd.read_csv(complete_address)
62 |
63 | # create a copy for pandas profiler
64 | data_for_profiling = data.copy()
65 |
66 | if save_copy:
67 | save_name = filename
68 | data.to_csv(save_name, index=False)
69 |
70 | if dataset == "index":
71 | display(data)
72 |
73 | else:
74 | if profile:
75 | import pandas_profiling
76 |
77 | pf = pandas_profiling.ProfileReport(data_for_profiling)
78 | display(pf)
79 |
80 | else:
81 | if verbose:
82 | display(data.head())
83 |
84 | return data
85 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.abspath("../.."))
17 |
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = "PyRapidML"
22 | copyright = "2021, Zain Ali"
23 | author = "Zain Ali"
24 |
25 | # The full version, including alpha/beta/rc tags
26 | release = "1.0.13"
27 |
28 |
29 | # -- General configuration ---------------------------------------------------
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 |
35 | extensions = [
36 | "sphinx_rtd_theme",
37 | "sphinx.ext.autodoc",
38 | "sphinx.ext.napoleon",
39 | ]
40 |
41 | napoleon_google_docstring = True
42 | napoleon_numpy_docstring = True
43 |
44 | autodoc_mock_imports = ["setup"]
45 | # Add any paths that contain templates here, relative to this directory.
46 | templates_path = ["_templates"]
47 |
48 | # List of patterns, relative to source directory, that match files and
49 | # directories to ignore when looking for source files.
50 | # This pattern also affects html_static_path and html_extra_path.
51 | exclude_patterns = []
52 |
53 | # Sort methods by the order they are found in the source files
54 | autodoc_member_order = "bysource"
55 |
56 |
57 | # -- Options for HTML output -------------------------------------------------
58 |
59 | # The theme to use for HTML and HTML Help pages. See the documentation for
60 | # a list of builtin themes.
61 | #
62 | html_theme = "sphinx_rtd_theme"
63 |
64 | # Add any paths that contain custom static files (such as style sheets) here,
65 | # relative to this directory. They are copied after the builtin static files,
66 | # so a file named "default.css" will overwrite the builtin "default.css".
67 | html_static_path = ["_static"]
68 |
69 | html_css_files = ["css/custom.css"]
70 |
71 | master_doc = "index"
72 |
--------------------------------------------------------------------------------
/PyRapidML/eda.py:
--------------------------------------------------------------------------------
1 |
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | def check_na(dataset):
11 | """
12 |
13 | This function checks missing values and gives the % of missing values in each feature.
14 |
15 |
16 |
17 | Example
18 | -------
19 | >>> from PyRapidML.eda import check_na
20 | >>> check_na(df)
21 |
22 | dataset: pandas.DataFrame
23 |
24 |
25 | """
26 |     # check the percentage of NaN values present in each feature
27 |     # step 1: make the list of features which have missing values
28 |     features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]
29 |     # step 2: print each feature name with its percentage of missing values
30 |     if len(features_with_na) > 0:
31 |         for feature in features_with_na:
32 |             print(feature, np.round(dataset[feature].isnull().mean() * 100, 4), '% missing values')
33 |     else:
34 |         print("No Missing Values")
35 |
36 |
37 |
38 |
39 |
40 | def numerical_features(dataset):
41 | # list of numerical variables
42 | """
43 |
44 | This function counts the total numerical features and further tells how many of them are discrete and how many are continuous.
45 |
46 |
47 | Example
48 | -------
49 | >>> from PyRapidML.eda import numerical_features
50 | >>> numerical_features(df)
51 |
52 | dataset: pandas.DataFrame
53 |
54 |
55 | """
56 |     numeric_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
57 |
58 |     print('Number of numerical variables: ', len(numeric_features))
59 |
60 |     # visualise the numerical variables
61 |     # print(dataset[numeric_features].head())
62 |     # numerical variables are usually of 2 types:
63 |     # 1. continuous variables and 2. discrete variables
64 |
65 |     discrete_feature = [feature for feature in numeric_features if len(dataset[feature].unique()) < 25]
66 |     print("Discrete Variables Count: {}".format(len(discrete_feature)))
67 |
68 |     continuous_feature = [feature for feature in numeric_features if feature not in discrete_feature]
69 |     print("Continuous feature Count: {}".format(len(continuous_feature)))
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyRapidML
2 | ## Introduction
3 | - Are you tired of writing hefty lines of code for your data science problems?
4 | - Are you having difficulty figuring out which algorithm performs best?
5 | - Is it hard for you to compare multiple algorithms and see which one has the best accuracy?
6 | - Do you face issues in hyperparameter tuning?
7 | - Do you want easy model deployments?
8 | - Do you dream of AutoML?
9 | - Are you facing problems in exploratory data analysis?
10 | - Do you want a library that can automatically perform all steps of the data science lifecycle?
11 | - Do you want a library that can do exploratory data analysis, feature engineering, feature selection, compare multiple machine learning algorithms, tune hyperparameters, and deploy models?
12 |
13 | If the answer is yes to the above questions, then PyRapidML is the library for you.
14 |
15 | PyRapidML is an open source Python machine learning library.
16 | PyRapidML is essentially a Python wrapper around several machine learning libraries and frameworks such as PyCaret, scikit-learn, XGBoost, LightGBM, CatBoost, spaCy, Optuna, Hyperopt, Ray, and many more.
17 |
18 | PyRapidML not only helps in automating machine learning workflows but also helps in building end-to-end ML solutions.
19 |
20 | PyRapidML is a low-code library: with only a few basic lines of code, one can achieve high accuracy in machine learning models.
21 | There is no need to write hefty lines of code, as PyRapidML compares all applicable machine learning algorithms for your problem in a single line of code.
22 | Once PyRapidML identifies the best algorithm, you can tune the model further in just one more line of code.
23 |
24 | The initial idea of PyRapidML was inspired by the PyCaret library in Python.
25 |
26 | ## What data science problems can PyRapidML cater to?
27 | - Regression
28 | - Classification
29 | - Natural Language Processing
30 |
31 | ## What does PyRapidML have to offer currently?
32 | - Data preparation
33 | - Exploratory data analysis (see the sketch below)
34 | - Model training
35 | - Finding the best ML model
36 | - Hyperparameter tuning
37 | - Model deployment
38 | - Analysis and interpretability
39 |
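For instance, the exploratory data analysis helpers run directly on a pandas DataFrame. A minimal sketch, assuming a hypothetical input file `your_data.csv` (function names as defined in `PyRapidML.eda`):

```python
import pandas as pd
from PyRapidML.eda import check_na, numerical_features

df = pd.read_csv('your_data.csv')  # hypothetical input file
check_na(df)              # prints the % of missing values per feature
numerical_features(df)    # prints discrete vs. continuous feature counts
```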
40 |
41 | ## Who is this library for?
42 | This library is for:
43 | - Data scientists
44 | - Citizen data scientists
45 | - Data science students
46 | - Data analysts
47 | - Data professionals who want to build end-to-end data science solutions
48 |
49 | ## How to install this library?
50 | `pip install PyRapidML`
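
A minimal end-to-end sketch (the dataset name `boston` and target column `medv` are taken from the docstring examples; `extract_data` needs an internet connection to fetch the sample data):

```python
from PyRapidML.datasets import extract_data
from PyRapidML.regression import initializer

boston = extract_data('boston')                # fetch a sample dataset by name
exp = initializer(data=boston, target='medv')  # set up the transformation pipeline and training environment
```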
51 |
52 | ## Important Links
53 | - Docs: https://pyrapidml.readthedocs.io/en/latest/
54 | - GitHub: https://github.com/Zainali5/PyRapidML
55 | - PyPI: https://pypi.org/project/PyRapidML/1.0.13/
56 | ## Current Release
57 | PyRapidML 1.0.13 is now available. The easiest way to install PyRapidML is using pip.
58 |
--------------------------------------------------------------------------------
/PyRapidML/utils.py:
--------------------------------------------------------------------------------
1 | # Module: Utility
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 |
9 | version_ = "1.0.13"
10 | nightly_version_ = "1.0.13"
11 |
12 | __version__ = version_
13 |
14 |
15 | def version():
16 | return version_
17 |
18 |
19 | def nightly_version():
20 | return nightly_version_
21 |
22 |
23 | def check_metric(actual: pd.Series, prediction: pd.Series, metric: str, round: int = 4):
24 |
25 | """
26 | Function to evaluate classification and regression metrics.
27 |
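Example
-------
>>> from PyRapidML.utils import check_metric
>>> check_metric(actual, prediction, metric = 'R2')

``actual`` and ``prediction`` above are pandas Series; valid metric names are
pycaret's display names (for example 'MAE', 'MSE' and 'R2' for regression, or
'Accuracy', 'AUC' and 'F1' for classification).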
28 |
29 | actual : pandas.Series
30 | Actual values of the target variable.
31 |
32 |
33 | prediction : pandas.Series
34 | Predicted values of the target variable.
35 |
36 |
37 | metric : str
38 | Metric to use.
39 |
40 |
41 | round: integer, default = 4
42 | Number of decimal places the metrics will be rounded to.
43 |
44 |
45 | Returns:
46 | float
47 |
48 | """
49 |
50 | # general dependencies
51 | import pycaret.containers.metrics.classification
52 | import pycaret.containers.metrics.regression
53 |
54 | globals_dict = {"y": prediction}
55 | metric_containers = {
56 | **pycaret.containers.metrics.classification.get_all_metric_containers(
57 | globals_dict
58 | ),
59 | **pycaret.containers.metrics.regression.get_all_metric_containers(globals_dict),
60 | }
61 | metrics = {v.name: v.score_func for k, v in metric_containers.items()}
62 |
63 | # metric calculation starts here
64 |
65 | if metric in metrics:
66 | try:
67 | result = metrics[metric](actual, prediction)
68 | except Exception:
69 | from sklearn.preprocessing import LabelEncoder
70 |
71 | le = LabelEncoder()
72 | actual = le.fit_transform(actual)
73 | prediction = le.transform(prediction)
74 | result = metrics[metric](actual, prediction)
75 | result = result.round(round)
76 | return float(result)
77 | else:
78 | raise ValueError(
79 | f"Couldn't find metric '{metric}' Possible metrics are: {', '.join(metrics.keys())}."
80 | )
81 |
82 |
83 | def enable_colab():
84 |     """
85 |     Function to render plotly visuals in colab.
86 |     """
87 |
88 |     from IPython.display import display, HTML, clear_output, update_display
89 |
90 | def configure_plotly_browser_state():
91 |
92 | import IPython
93 |
94 | display(
95 | IPython.core.display.HTML(
96 | """
97 |
98 |
106 | """
107 | )
108 | )
109 |
110 | import IPython
111 |
112 | IPython.get_ipython().events.register(
113 | "pre_run_cell", configure_plotly_browser_state
114 | )
115 | print("Colab mode enabled.")
116 |
117 |
118 | def get_system_logs():
119 |
120 | """
121 | Read and print 'logs.log' file from current active directory
122 | """
123 |
124 | with open("logs.log", "r") as file:
125 | lines = file.read().splitlines()
126 |
127 | for line in lines:
128 | if not line:
129 | continue
130 |
131 | columns = [col.strip() for col in line.split(":") if col]
132 | print(columns)
133 |
--------------------------------------------------------------------------------
/PyRapidML/regression.py:
--------------------------------------------------------------------------------
1 | # Module: Regression
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | import pycaret.internal.tabular
11 | from pycaret.internal.Display import Display, is_in_colab, enable_colab
12 | from typing import List, Tuple, Any, Union, Optional, Dict
13 | import warnings
14 | from IPython.utils import io
15 |
16 | from pycaret.internal.tabular import MLUsecase
17 |
18 | warnings.filterwarnings("ignore")
19 |
20 |
21 | def initializer(
22 | data: pd.DataFrame,
23 | target: str,
24 | train_size: float = 0.7,
25 | test_data: Optional[pd.DataFrame] = None,
26 | preprocess: bool = True,
27 | imputation_type: str = "simple",
28 | iterative_imputation_iters: int = 5,
29 | categorical_features: Optional[List[str]] = None,
30 | categorical_imputation: str = "constant",
31 | categorical_iterative_imputer: Union[str, Any] = "lightgbm",
32 | ordinal_features: Optional[Dict[str, list]] = None,
33 | high_cardinality_features: Optional[List[str]] = None,
34 | high_cardinality_method: str = "frequency",
35 | numeric_features: Optional[List[str]] = None,
36 | numeric_imputation: str = "mean",
37 | numeric_iterative_imputer: Union[str, Any] = "lightgbm",
38 | date_features: Optional[List[str]] = None,
39 | ignore_features: Optional[List[str]] = None,
40 | normalize: bool = False,
41 | normalize_method: str = "zscore",
42 | transformation: bool = False,
43 | transformation_method: str = "yeo-johnson",
44 | handle_unknown_categorical: bool = True,
45 | unknown_categorical_method: str = "least_frequent",
46 | pca: bool = False,
47 | pca_method: str = "linear",
48 | pca_components: Optional[float] = None,
49 | ignore_low_variance: bool = False,
50 | combine_rare_levels: bool = False,
51 | rare_level_threshold: float = 0.10,
52 | bin_numeric_features: Optional[List[str]] = None,
53 | remove_outliers: bool = False,
54 | outliers_threshold: float = 0.05,
55 | remove_multicollinearity: bool = False,
56 | multicollinearity_threshold: float = 0.9,
57 | remove_perfect_collinearity: bool = True,
58 | create_clusters: bool = False,
59 | cluster_iter: int = 20,
60 | polynomial_features: bool = False,
61 | polynomial_degree: int = 2,
62 | trigonometry_features: bool = False,
63 | polynomial_threshold: float = 0.1,
64 | group_features: Optional[List[str]] = None,
65 | group_names: Optional[List[str]] = None,
66 | feature_selection: bool = False,
67 | feature_selection_threshold: float = 0.8,
68 | feature_selection_method: str = "classic",
69 | feature_interaction: bool = False,
70 | feature_ratio: bool = False,
71 | interaction_threshold: float = 0.01,
72 | transform_target: bool = False,
73 | transform_target_method: str = "box-cox",
74 | data_split_shuffle: bool = True,
75 | data_split_stratify: Union[bool, List[str]] = False,
76 | fold_strategy: Union[str, Any] = "kfold",
77 | fold: int = 10,
78 | fold_shuffle: bool = False,
79 | fold_groups: Optional[Union[str, pd.DataFrame]] = None,
80 | n_jobs: Optional[int] = -1,
81 | use_gpu: bool = False,
82 | custom_pipeline: Union[
83 | Any, Tuple[str, Any], List[Any], List[Tuple[str, Any]]
84 | ] = None,
85 | html: bool = True,
86 | session_id: Optional[int] = None,
87 | log_experiment: bool = False,
88 | experiment_name: Optional[str] = None,
89 | log_plots: Union[bool, list] = False,
90 | log_profile: bool = False,
91 | log_data: bool = False,
92 | silent: bool = False,
93 | verbose: bool = True,
94 | profile: bool = False,
95 | profile_kwargs: Dict[str, Any] = None,
96 | ):
97 | """
98 | This function initializes the training environment and creates the transformation
99 | pipeline. The initializer function must be called before executing any other function.
100 | It takes two mandatory parameters: ``data`` and ``target``. All the other parameters
101 | are optional.
102 |
103 | Example
104 | -------
105 | >>> from PyRapidML.datasets import extract_data
106 | >>> boston = extract_data('boston')
107 | >>> from PyRapidML.regression import *
108 | >>> exp_name = initializer(data = boston, target = 'medv')
109 |
110 |
111 | data : pandas.DataFrame
112 | Shape (n_samples, n_features), where n_samples is the number of samples and
113 | n_features is the number of features.
114 |
115 |
116 | target: str
117 | Name of the target column to be passed in as a string. For regression, the
118 | target variable must be continuous.
119 |
120 |
121 | train_size: float, default = 0.7
122 | Proportion of the dataset to be used for training and validation. Should be
123 | between 0.0 and 1.0.
124 |
125 |
126 | test_data: pandas.DataFrame, default = None
127 | If not None, test_data is used as a hold-out set and ``train_size`` parameter is
128 | ignored. test_data must be labelled and the shape of data and test_data must
129 | match.
130 |
131 |
132 | preprocess: bool, default = True
133 | When set to False, no transformations are applied except for train_test_split
134 | and custom transformations passed in ``custom_pipeline`` param. Data must be
135 | ready for modeling (no missing values, no dates, categorical data encoding),
136 | when preprocess is set to False.
137 |
138 |
139 | imputation_type: str, default = 'simple'
140 | The type of imputation to use. Can be either 'simple' or 'iterative'.
141 |
142 |
143 | iterative_imputation_iters: int, default = 5
144 | Number of iterations. Ignored when ``imputation_type`` is not 'iterative'.
145 |
146 |
147 | categorical_features: list of str, default = None
148 | If the inferred data types are not correct or the silent param is set to True,
149 | categorical_features param can be used to overwrite or define the data types.
150 | It takes a list of strings with column names that are categorical.
151 |
152 |
153 | categorical_imputation: str, default = 'constant'
154 | Missing values in categorical features are imputed with a constant 'not_available'
155 | value. The other available option is 'mode'.
156 |
157 |
158 | categorical_iterative_imputer: str, default = 'lightgbm'
159 | Estimator for iterative imputation of missing values in categorical features.
160 | Ignored when ``imputation_type`` is not 'iterative'.
161 |
162 |
163 | ordinal_features: dict, default = None
164 | Encode categorical features as ordinal. For example, a categorical feature with
165 | 'low', 'medium', 'high' values where low < medium < high can be passed as
166 | ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
167 |
168 |
169 | high_cardinality_features: list of str, default = None
170 | When categorical features contain many levels, they can be compressed into fewer
171 | levels using this parameter. It takes a list of strings with column names that
172 | are categorical.
173 |
174 |
175 | high_cardinality_method: str, default = 'frequency'
176 | Categorical features with high cardinality are replaced with the frequency of
177 | values in each level occurring in the training dataset. Other available method
178 | is 'clustering' which trains the K-Means clustering algorithm on the statistical
179 | attribute of the training data and replaces the original value of feature with the
180 | cluster label. The number of clusters is determined by optimizing Calinski-Harabasz
181 | and Silhouette criterion.
182 |
183 |
184 | numeric_features: list of str, default = None
185 | If the inferred data types are not correct or the silent param is set to True,
186 | numeric_features param can be used to overwrite or define the data types.
187 | It takes a list of strings with column names that are numeric.
188 |
189 |
190 | numeric_imputation: str, default = 'mean'
191 | Missing values in numeric features are imputed with 'mean' value of the feature
192 | in the training dataset. Other available options are 'median' and 'zero'.
193 |
194 |
195 | numeric_iterative_imputer: str, default = 'lightgbm'
196 | Estimator for iterative imputation of missing values in numeric features.
197 | Ignored when ``imputation_type`` is set to 'simple'.
198 |
199 |
200 | date_features: list of str, default = None
201 | If the inferred data types are not correct or the silent param is set to True,
202 | date_features param can be used to overwrite or define the data types. It takes
203 | a list of strings with column names that are DateTime.
204 |
205 |
206 | ignore_features: list of str, default = None
207 | ignore_features param can be used to ignore features during model training.
208 | It takes a list of strings with column names that are to be ignored.
209 |
210 |
211 | normalize: bool, default = False
212 | When set to True, it transforms the numeric features by scaling them to a given
213 | range. Type of scaling is defined by the ``normalize_method`` parameter.
214 |
215 |
216 | normalize_method: str, default = 'zscore'
217 | Defines the method for scaling. By default, normalize method is set to 'zscore'
218 | The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize``
219 | is not True. The other options are:
220 |
221 | - minmax: scales and translates each feature individually such that it is in
222 | the range of 0 - 1.
223 | - maxabs: scales and translates each feature individually such that the
224 | maximal absolute value of each feature will be 1.0. It does not
225 | shift/center the data, and thus does not destroy any sparsity.
226 | - robust: scales and translates each feature according to the Interquartile
227 | range. When the dataset contains outliers, robust scaler often gives
228 | better results.
229 |
230 |
231 | transformation: bool, default = False
232 | When set to True, it applies the power transform to make data more Gaussian-like.
233 | Type of transformation is defined by the ``transformation_method`` parameter.
234 |
235 |
236 | transformation_method: str, default = 'yeo-johnson'
237 | Defines the method for transformation. By default, the transformation method is
238 | set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
239 | Ignored when ``transformation`` is not True.
240 |
241 |
242 | handle_unknown_categorical: bool, default = True
243 | When set to True, unknown categorical levels in unseen data are replaced by the
244 | most or least frequent level as learned in the training dataset.
245 |
246 |
247 | unknown_categorical_method: str, default = 'least_frequent'
248 | Method used to replace unknown categorical levels in unseen data. Method can be
249 | set to 'least_frequent' or 'most_frequent'.
250 |
251 |
252 | pca: bool, default = False
253 | When set to True, dimensionality reduction is applied to project the data into
254 | a lower dimensional space using the method defined in ``pca_method`` parameter.
255 |
256 |
257 | pca_method: str, default = 'linear'
258 | The 'linear' method performs linear dimensionality reduction using Singular Value Decomposition. Other options are:
259 |
260 | - kernel: dimensionality reduction through the use of RBF kernel.
261 | - incremental: replacement for 'linear' pca when the dataset is too large.
262 |
263 |
264 | pca_components: int or float, default = None
265 | Number of components to keep. If pca_components is a float, it is treated as a
266 | target percentage for information retention. When pca_components is an integer
267 | it is treated as the number of features to be kept. pca_components must be less
268 | than the original number of features. Ignored when ``pca`` is not True.
269 |
270 |
271 | ignore_low_variance: bool, default = False
272 | When set to True, all categorical features with insignificant variances are
273 | removed from the data. The variance is calculated using the ratio of unique
274 | values to the number of samples, and the ratio of the most common value to the
275 | frequency of the second most common value.
276 |
277 |
278 | combine_rare_levels: bool, default = False
279 | When set to True, levels in categorical features whose frequency percentile is
280 | below a certain threshold are combined into a single level.
281 |
282 |
283 | rare_level_threshold: float, default = 0.1
284 | Percentile distribution below which rare categories are combined. Ignored when
285 | ``combine_rare_levels`` is not True.
286 |
287 |
288 | bin_numeric_features: list of str, default = None
289 | To convert numeric features into categorical, bin_numeric_features parameter can
290 | be used. It takes a list of strings with column names to be discretized. It does
291 | so by using the 'sturges' rule to determine the number of clusters and then applying
292 | the KMeans algorithm. Original values of the feature are then replaced by the
293 | cluster label.
294 |
295 |
296 | remove_outliers: bool, default = False
297 | When set to True, outliers from the training data are removed using the Singular
298 | Value Decomposition.
299 |
300 |
301 | outliers_threshold: float, default = 0.05
302 | The percentage outliers to be removed from the training dataset. Ignored when
303 | ``remove_outliers`` is not True.
304 |
305 |
306 | remove_multicollinearity: bool, default = False
307 | When set to True, features with the inter-correlations higher than the defined
308 | threshold are removed. When two features are highly correlated with each other,
309 | the feature that is less correlated with the target variable is removed. Only
310 | considers numeric features.
311 |
312 |
313 | multicollinearity_threshold: float, default = 0.9
314 | Threshold for correlated features. Ignored when ``remove_multicollinearity``
315 | is not True.
316 |
317 |
318 | remove_perfect_collinearity: bool, default = True
319 | When set to True, perfect collinearity (features with correlation = 1) is removed
320 | from the dataset. When two features are 100% correlated, one of them is randomly
321 | removed from the dataset.
322 |
323 |
324 | create_clusters: bool, default = False
325 | When set to True, an additional feature is created in training dataset where each
326 | instance is assigned to a cluster. The number of clusters is determined by
327 | optimizing Calinski-Harabasz and Silhouette criterion.
328 |
329 |
330 | cluster_iter: int, default = 20
331 | Number of iterations for creating clusters. Each iteration represents cluster
332 | size. Ignored when ``create_clusters`` is not True.
333 |
334 |
335 | polynomial_features: bool, default = False
336 | When set to True, new features are derived using existing numeric features.
337 |
338 |
339 | polynomial_degree: int, default = 2
340 | Degree of polynomial features. For example, if an input sample is two dimensional
341 | and of the form [a, b], the polynomial features with degree = 2 are:
342 | [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True.
343 |
344 |
345 | trigonometry_features: bool, default = False
346 | When set to True, new features are derived using existing numeric features.
347 |
348 |
349 | polynomial_threshold: float, default = 0.1
350 | When ``polynomial_features`` or ``trigonometry_features`` is True, new features
351 | are derived from the existing numeric features. This may sometimes result in too
352 | large a feature space. The polynomial_threshold parameter can be used to deal with
353 | this problem. It does so by using a combination of Random Forest, AdaBoost and Linear
354 | correlation. All derived features that fall within the percentile distribution
355 | are kept and the rest of the features are removed.
356 |
357 |
358 | group_features: list or list of list, default = None
359 | When the dataset contains features with related characteristics, group_features
360 | parameter can be used for feature extraction. It takes a list of strings with
361 | column names that are related.
362 |
363 |
364 | group_names: list, default = None
365 | Group names to be used in naming new features. When the length of group_names
366 | does not match with the length of ``group_features``, new features are named
367 | sequentially group_1, group_2, etc. It is ignored when ``group_features`` is
368 | None.
369 |
370 |
371 | feature_selection: bool, default = False
372 | When set to True, a subset of features are selected using a combination of
373 | various permutation importance techniques including Random Forest, Adaboost
374 | and Linear correlation with target variable. The size of the subset is
375 | dependent on the ``feature_selection_threshold`` parameter.
376 |
377 |
378 | feature_selection_threshold: float, default = 0.8
379 | Threshold value used for feature selection. When ``polynomial_features`` or
380 | ``feature_interaction`` is True, it is recommended to keep the threshold low
381 | to avoid large feature spaces. Setting a very low value may be efficient but
382 | could result in under-fitting.
383 |
384 |
385 | feature_selection_method: str, default = 'classic'
386 | Algorithm for feature selection. 'classic' method uses permutation feature
387 | importance techniques. Other possible value is 'boruta' which uses boruta
388 | algorithm for feature selection.
389 |
390 |
391 | feature_interaction: bool, default = False
392 | When set to True, new features are created by interacting (a * b) all the
393 | numeric variables in the dataset. This feature is not scalable and may not
394 | work as expected on datasets with large feature space.
395 |
396 |
397 | feature_ratio: bool, default = False
398 | When set to True, new features are created by calculating the ratios (a / b)
399 | between all numeric variables in the dataset. This feature is not scalable and
400 | may not work as expected on datasets with large feature space.
401 |
402 |
403 | interaction_threshold: float, default = 0.01
404 | Similar to polynomial_threshold, it is used to compress a sparse matrix of newly
405 | created features through interaction. Features whose importance based on the
406 | combination of Random Forest, AdaBoost and Linear correlation falls within the
407 | percentile of the defined threshold are kept in the dataset. Remaining features
408 | are dropped before further processing.
409 |
410 |
411 | transform_target: bool, default = False
412 | When set to True, target variable is transformed using the method defined in
413 | ``transform_target_method`` param. Target transformation is applied separately
414 | from feature transformations.
415 |
416 |
417 | transform_target_method: str, default = 'box-cox'
418 | 'Box-cox' and 'yeo-johnson' methods are supported. Box-Cox requires input data to
419 | be strictly positive, while Yeo-Johnson supports both positive or negative data.
420 | When transform_target_method is 'box-cox' and target variable contains negative
421 | values, method is internally forced to 'yeo-johnson' to avoid exceptions.
422 |
423 |
424 | data_split_shuffle: bool, default = True
425 | When set to False, prevents shuffling of rows during 'train_test_split'.
426 |
427 |
428 | data_split_stratify: bool or list, default = False
429 | Controls stratification during 'train_test_split'. When set to True, will
430 | stratify by target column. To stratify on any other columns, pass a list of
431 | column names. Ignored when ``data_split_shuffle`` is False.
432 |
433 |
434 | fold_strategy: str or sklearn CV generator object, default = 'kfold'
435 | Choice of cross validation strategy. Possible values are:
436 |
437 | * 'kfold'
438 | * 'stratifiedkfold'
439 | * 'groupkfold'
440 | * 'timeseries'
441 | * a custom CV generator object compatible with scikit-learn.
442 |
443 |
444 | fold: int, default = 10
445 | Number of folds to be used in cross validation. Must be at least 2. This is
446 | a global setting that can be over-written at function level by using ``fold``
447 | parameter. Ignored when ``fold_strategy`` is a custom object.
448 |
449 |
450 | fold_shuffle: bool, default = False
451 | Controls the shuffle parameter of CV. Only applicable when ``fold_strategy``
452 | is 'kfold' or 'stratifiedkfold'. Ignored when ``fold_strategy`` is a custom
453 | object.
454 |
455 |
456 | fold_groups: str or array-like, with shape (n_samples,), default = None
457 | Optional group labels when 'GroupKFold' is used for the cross validation.
458 | It takes an array with shape (n_samples, ) where n_samples is the number
459 | of rows in the training dataset. When a string is passed, it is interpreted
460 | as the column name in the dataset containing group labels.
461 |
462 |
463 | n_jobs: int, default = -1
464 | The number of jobs to run in parallel (for functions that support parallel
465 | processing). -1 means using all processors. To run all functions on a single
466 | processor, set n_jobs to None.
467 |
468 |
469 | use_gpu: bool or str, default = False
470 | When set to True, it will use GPU for training with algorithms that support it,
471 | and fall back to CPU if they are unavailable. When set to 'force', it will only
472 | use GPU-enabled algorithms and raise exceptions when they are unavailable. When
473 | False, all algorithms are trained using CPU only.
474 |
475 | GPU enabled algorithms:
476 |
477 | - Extreme Gradient Boosting, requires no further installation
478 |
479 | - CatBoost Regressor, requires no further installation
480 | (GPU is only enabled when data > 50,000 rows)
481 |
482 | - Light Gradient Boosting Machine, requires GPU installation
483 | https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html
484 |
485 | - Linear Regression, Lasso Regression, Ridge Regression, K Neighbors Regressor,
486 | Random Forest, Support Vector Regression, Elastic Net requires cuML >= 0.15
487 | https://github.com/rapidsai/cuml
488 |
489 |
490 | custom_pipeline: (str, transformer) or list of (str, transformer), default = None
491 | When passed, the custom transformers are appended to the preprocessing pipeline
492 | and applied on each CV fold separately and on the final fit. All the custom
493 | transformations are applied after 'train_test_split' and before PyRapidML's internal
494 | transformations.
495 |
496 |
497 | html: bool, default = True
498 | When set to False, prevents runtime display of monitor. This must be set to False
499 | when the environment does not support IPython. For example, command line terminal,
500 | Databricks Notebook, Spyder and other similar IDEs.
501 |
502 |
503 | session_id: int, default = None
504 | Controls the randomness of experiment. It is equivalent to 'random_state' in
505 | scikit-learn. When None, a pseudo random number is generated. This can be used
506 | for later reproducibility of the entire experiment.
507 |
508 |
509 | log_experiment: bool, default = False
510 | When set to True, all metrics and parameters are logged on the ``MLFlow`` server.
511 |
512 |
513 | experiment_name: str, default = None
514 | Name of the experiment for logging. Ignored when ``log_experiment`` is not True.
515 |
516 |
517 | log_plots: bool or list, default = False
518 | When set to True, certain plots are logged automatically in the ``MLFlow`` server.
519 | To change the type of plots to be logged, pass a list containing plot IDs. Refer
520 | to documentation of ``plot_model``. Ignored when ``log_experiment`` is not True.
521 |
522 |
523 | log_profile: bool, default = False
524 | When set to True, data profile is logged on the ``MLflow`` server as a html file.
525 | Ignored when ``log_experiment`` is not True.
526 |
527 |
528 | log_data: bool, default = False
529 | When set to True, dataset is logged on the ``MLflow`` server as a csv file.
530 | Ignored when ``log_experiment`` is not True.
531 |
532 |
533 | silent: bool, default = False
534 | Controls the confirmation input of data types when ``setup`` is executed. When
535 | executing in completely automated mode or on a remote kernel, this must be True.
536 |
537 |
538 | verbose: bool, default = True
539 | When set to False, the information grid is not printed.
540 |
541 |
542 | profile: bool, default = False
543 | When set to True, an interactive EDA report is displayed.
544 |
545 |
546 | profile_kwargs: dict, default = {} (empty dict)
547 | Dictionary of arguments passed to the ProfileReport class used
548 | to create the EDA report. Ignored if ``profile`` is False.
549 |
550 |
551 | Returns:
552 | Global variables that can be changed using the ``set_config`` function.
553 |
554 | """
555 | available_plots = {
556 | "parameter": "Hyperparameters",
557 | "residuals": "Residuals",
558 | "error": "Prediction Error",
559 | "cooks": "Cooks Distance",
560 | "rfe": "Feature Selection",
561 | "learning": "Learning Curve",
562 | "manifold": "Manifold Learning",
563 | "vc": "Validation Curve",
564 | "feature": "Feature Importance",
565 | "feature_all": "Feature Importance (All)",
566 | "tree": "Decision Tree",
567 | "residuals_interactive": "Interactive Residuals",
568 | }
569 |
570 | if log_plots == True:
571 | log_plots = ["residuals", "error", "feature"]
572 |
573 | return pycaret.internal.tabular.setup(
574 | ml_usecase="regression",
575 | available_plots=available_plots,
576 | data=data,
577 | target=target,
578 | train_size=train_size,
579 | test_data=test_data,
580 | preprocess=preprocess,
581 | imputation_type=imputation_type,
582 | iterative_imputation_iters=iterative_imputation_iters,
583 | categorical_features=categorical_features,
584 | categorical_imputation=categorical_imputation,
585 | categorical_iterative_imputer=categorical_iterative_imputer,
586 | ordinal_features=ordinal_features,
587 | high_cardinality_features=high_cardinality_features,
588 | high_cardinality_method=high_cardinality_method,
589 | numeric_features=numeric_features,
590 | numeric_imputation=numeric_imputation,
591 | numeric_iterative_imputer=numeric_iterative_imputer,
592 | date_features=date_features,
593 | ignore_features=ignore_features,
594 | normalize=normalize,
595 | normalize_method=normalize_method,
596 | transformation=transformation,
597 | transformation_method=transformation_method,
598 | handle_unknown_categorical=handle_unknown_categorical,
599 | unknown_categorical_method=unknown_categorical_method,
600 | pca=pca,
601 | pca_method=pca_method,
602 | pca_components=pca_components,
603 | ignore_low_variance=ignore_low_variance,
604 | combine_rare_levels=combine_rare_levels,
605 | rare_level_threshold=rare_level_threshold,
606 | bin_numeric_features=bin_numeric_features,
607 | remove_outliers=remove_outliers,
608 | outliers_threshold=outliers_threshold,
609 | remove_multicollinearity=remove_multicollinearity,
610 | multicollinearity_threshold=multicollinearity_threshold,
611 | remove_perfect_collinearity=remove_perfect_collinearity,
612 | create_clusters=create_clusters,
613 | cluster_iter=cluster_iter,
614 | polynomial_features=polynomial_features,
615 | polynomial_degree=polynomial_degree,
616 | trigonometry_features=trigonometry_features,
617 | polynomial_threshold=polynomial_threshold,
618 | group_features=group_features,
619 | group_names=group_names,
620 | feature_selection=feature_selection,
621 | feature_selection_threshold=feature_selection_threshold,
622 | feature_selection_method=feature_selection_method,
623 | feature_interaction=feature_interaction,
624 | feature_ratio=feature_ratio,
625 | interaction_threshold=interaction_threshold,
626 | transform_target=transform_target,
627 | transform_target_method=transform_target_method,
628 | data_split_shuffle=data_split_shuffle,
629 | data_split_stratify=data_split_stratify,
630 | fold_strategy=fold_strategy,
631 | fold=fold,
632 | fold_shuffle=fold_shuffle,
633 | fold_groups=fold_groups,
634 | n_jobs=n_jobs,
635 | use_gpu=use_gpu,
636 | custom_pipeline=custom_pipeline,
637 | html=html,
638 | session_id=session_id,
639 | log_experiment=log_experiment,
640 | experiment_name=experiment_name,
641 | log_plots=log_plots,
642 | log_profile=log_profile,
643 | log_data=log_data,
644 | silent=silent,
645 | verbose=verbose,
646 | profile=profile,
647 | profile_kwargs=profile_kwargs,
648 | )
649 |
650 |
651 | def comparing_models(
652 | include: Optional[List[Union[str, Any]]] = None,
653 | exclude: Optional[List[str]] = None,
654 | fold: Optional[Union[int, Any]] = None,
655 | round: int = 4,
656 | cross_validation: bool = True,
657 | sort: str = "R2",
658 | n_select: int = 1,
659 | budget_time: Optional[float] = None,
660 | turbo: bool = True,
661 | errors: str = "ignore",
662 | fit_kwargs: Optional[dict] = None,
663 | groups: Optional[Union[str, Any]] = None,
664 | verbose: bool = True,
665 | ):
666 |
667 | """
668 | This function trains and evaluates the performance of all estimators available in
669 | the model library using cross validation. The output of this function is a score
670 | grid with average cross-validated scores. Metrics evaluated during CV can be
671 | accessed using the ``get_metrics`` function. Custom metrics can be added or removed
672 | using the ``add_metric`` and ``remove_metric`` functions.
673 |
674 |
675 | Example
676 | --------
677 | >>> from PyRapidML.datasets import extract_data
678 | >>> boston = extract_data('boston')
679 | >>> from PyRapidML.regression import *
680 | >>> exp_name = initializer(data = boston, target = 'medv')
681 | >>> best_model = comparing_models()
682 |
683 |
684 | include: list of str or scikit-learn compatible object, default = None
685 | To train and evaluate select models, pass a list containing model IDs or
686 | scikit-learn compatible objects in the include param. To see a list of all
687 | models available in the model library, use the ``models`` function.
688 |
689 |
690 | exclude: list of str, default = None
691 | To omit certain models from training and evaluation, pass a list containing
692 | model IDs in the exclude parameter. To see a list of all models available
693 | in the model library, use the ``models`` function.
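|
| For illustration, a minimal sketch (model IDs as listed by ``models``):
|
| >>> best_linear = comparing_models(include = ['lr', 'lasso', 'ridge'])
| >>> best_of_rest = comparing_models(exclude = ['catboost'])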
694 |
695 |
696 | fold: int or scikit-learn compatible CV generator, default = None
697 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
698 | parameter of the ``setup`` function is used. When an integer is passed,
699 | it is interpreted as the 'n_splits' parameter of the CV generator in the
700 | ``setup`` function.
701 |
702 |
703 | round: int, default = 4
704 | Number of decimal places the metrics in the score grid will be rounded to.
705 |
706 |
707 | cross_validation: bool, default = True
708 | When set to False, metrics are evaluated on the holdout set. The ``fold``
709 | param is ignored when cross_validation is set to False.
710 |
711 |
712 | sort: str, default = 'R2'
713 | The sort order of the score grid. It also accepts custom metrics that are
714 | added through the ``add_metric`` function.
715 |
716 |
717 | n_select: int, default = 1
718 | Number of top_n models to return. For example, to select top 3 models use
719 | n_select = 3.
720 |
721 |
722 | budget_time: int or float, default = None
723 | If not None, will terminate execution of the function after budget_time
724 | minutes have passed and return results up to that point.
725 |
726 |
727 | turbo: bool, default = True
728 | When set to True, it excludes estimators with longer training times. To
729 | see which algorithms are excluded use the ``models`` function.
730 |
731 |
732 | errors: str, default = 'ignore'
733 | When set to 'ignore', will skip models that raise exceptions and continue.
734 | If 'raise', will break the function when exceptions are raised.
735 |
736 |
737 | fit_kwargs: dict, default = {} (empty dict)
738 | Dictionary of arguments passed to the fit method of the model.
739 |
740 |
741 | groups: str or array-like, with shape (n_samples,), default = None
742 | Optional group labels when 'GroupKFold' is used for the cross validation.
743 | It takes an array with shape (n_samples, ) where n_samples is the number
744 | of rows in the training dataset. When string is passed, it is interpreted
745 | as the column name in the dataset containing group labels.
746 |
747 |
748 | verbose: bool, default = True
749 | Score grid is not printed when verbose is set to False.
750 |
751 |
752 | Returns:
753 | Trained model or list of trained models, depending on the ``n_select`` param.
754 |
755 |
756 | Warnings
757 | --------
758 | - Changing the turbo parameter to False may result in very high training times
759 | with datasets exceeding 10,000 rows.
760 |
761 | - No models are logged on the ``MLflow`` server when the ``cross_validation`` parameter is False.
762 |
763 | """
764 |
765 | return pycaret.internal.tabular.compare_models(
766 | include=include,
767 | exclude=exclude,
768 | fold=fold,
769 | round=round,
770 | cross_validation=cross_validation,
771 | sort=sort,
772 | n_select=n_select,
773 | budget_time=budget_time,
774 | turbo=turbo,
775 | errors=errors,
776 | fit_kwargs=fit_kwargs,
777 | groups=groups,
778 | verbose=verbose,
779 | )
780 |
781 |
782 | def creating_model(
783 | estimator: Union[str, Any],
784 | fold: Optional[Union[int, Any]] = None,
785 | round: int = 4,
786 | cross_validation: bool = True,
787 | fit_kwargs: Optional[dict] = None,
788 | groups: Optional[Union[str, Any]] = None,
789 | verbose: bool = True,
790 | **kwargs,
791 | ):
792 |
793 | """
794 | This function trains and evaluates the performance of a given estimator
795 | using cross validation. The output of this function is a score grid with
796 | CV scores by fold. Metrics evaluated during CV can be accessed using the
797 | ``get_metrics`` function. Custom metrics can be added or removed using the
798 | ``add_metric`` and ``remove_metric`` functions. All the available models
799 | can be accessed using the ``models`` function.
800 |
801 |
802 | Example
803 | -------
804 | >>> from PyRapidML.datasets import extract_data
805 | >>> boston = extract_data('boston')
806 | >>> from PyRapidML.regression import *
807 | >>> exp_name = initializer(data = boston, target = 'medv')
808 | >>> lr = creating_model('lr')
809 |
810 |
811 |
812 | estimator: str or scikit-learn compatible object
813 | ID of an estimator available in the model library, or an untrained
814 | model object consistent with the scikit-learn API. Estimators available
815 | in the model library (ID - Name):
816 |
817 | * 'lr' - Linear Regression
818 | * 'lasso' - Lasso Regression
819 | * 'ridge' - Ridge Regression
820 | * 'en' - Elastic Net
821 | * 'lar' - Least Angle Regression
822 | * 'llar' - Lasso Least Angle Regression
823 | * 'omp' - Orthogonal Matching Pursuit
824 | * 'br' - Bayesian Ridge
825 | * 'ard' - Automatic Relevance Determination
826 | * 'par' - Passive Aggressive Regressor
827 | * 'ransac' - Random Sample Consensus
828 | * 'tr' - TheilSen Regressor
829 | * 'huber' - Huber Regressor
830 | * 'kr' - Kernel Ridge
831 | * 'svm' - Support Vector Regression
832 | * 'knn' - K Neighbors Regressor
833 | * 'dt' - Decision Tree Regressor
834 | * 'rf' - Random Forest Regressor
835 | * 'et' - Extra Trees Regressor
836 | * 'ada' - AdaBoost Regressor
837 | * 'gbr' - Gradient Boosting Regressor
838 | * 'mlp' - MLP Regressor
839 | * 'xgboost' - Extreme Gradient Boosting
840 | * 'lightgbm' - Light Gradient Boosting Machine
841 | * 'catboost' - CatBoost Regressor
842 |
843 |
844 | fold: int or scikit-learn compatible CV generator, default = None
845 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
846 | parameter of the ``setup`` function is used. When an integer is passed,
847 | it is interpreted as the 'n_splits' parameter of the CV generator in the
848 | ``setup`` function.
849 |
850 |
851 | round: int, default = 4
852 | Number of decimal places the metrics in the score grid will be rounded to.
853 |
854 |
855 | cross_validation: bool, default = True
856 | When set to False, metrics are evaluated on the holdout set. The ``fold``
857 | param is ignored when cross_validation is set to False.
858 |
859 |
860 | fit_kwargs: dict, default = {} (empty dict)
861 | Dictionary of arguments passed to the fit method of the model.
862 |
863 |
864 | groups: str or array-like, with shape (n_samples,), default = None
865 | Optional group labels when GroupKFold is used for the cross validation.
866 | It takes an array with shape (n_samples, ) where n_samples is the number
867 | of rows in training dataset. When string is passed, it is interpreted as
868 | the column name in the dataset containing group labels.
869 |
870 |
871 | verbose: bool, default = True
872 | Score grid is not printed when verbose is set to False.
873 |
874 |
875 | **kwargs:
876 | Additional keyword arguments to pass to the estimator.
877 |
878 |
879 | Returns:
880 | Trained Model
881 |
882 |
883 | Warnings
884 | --------
885 | - Models are not logged on the ``MLflow`` server when the ``cross_validation`` param
886 | is set to False.
887 |
888 | """
889 |
890 | return pycaret.internal.tabular.create_model_supervised(
891 | estimator=estimator,
892 | fold=fold,
893 | round=round,
894 | cross_validation=cross_validation,
895 | fit_kwargs=fit_kwargs,
896 | groups=groups,
897 | verbose=verbose,
898 | **kwargs,
899 | )
900 |
901 |
902 | def tuning_model(
903 | estimator,
904 | fold: Optional[Union[int, Any]] = None,
905 | round: int = 4,
906 | n_iter: int = 10,
907 | custom_grid: Optional[Union[Dict[str, list], Any]] = None,
908 | optimize: str = "R2",
909 | custom_scorer=None,
910 | search_library: str = "scikit-learn",
911 | search_algorithm: Optional[str] = None,
912 | early_stopping: Any = False,
913 | early_stopping_max_iters: int = 10,
914 | choose_better: bool = False,
915 | fit_kwargs: Optional[dict] = None,
916 | groups: Optional[Union[str, Any]] = None,
917 | return_tuner: bool = False,
918 | verbose: bool = True,
919 | tuner_verbose: Union[int, bool] = True,
920 | **kwargs,
921 | ):
922 |
923 | """
924 | This function tunes the hyperparameters of a given estimator. The output of
925 | this function is a score grid with CV scores by fold of the best selected
926 | model based on ``optimize`` parameter. Metrics evaluated during CV can be
927 | accessed using the ``get_metrics`` function. Custom metrics can be added
928 | or removed using the ``add_metric`` and ``remove_metric`` functions.
929 |
930 |
931 | Example
932 | -------
933 | >>> from PyRapidML.datasets import extract_data
934 | >>> boston = extract_data('boston')
935 | >>> from PyRapidML.regression import *
936 | >>> exp_name = initializer(data = boston, target = 'medv')
937 | >>> lr = creating_model('lr')
938 | >>> tuned_lr = tuning_model(lr)
939 |
940 |
941 | estimator: scikit-learn compatible object
942 | Trained model object
943 |
944 |
945 | fold: int or scikit-learn compatible CV generator, default = None
946 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
947 | parameter of the ``setup`` function is used. When an integer is passed,
948 | it is interpreted as the 'n_splits' parameter of the CV generator in the
949 | ``setup`` function.
950 |
951 |
952 | round: int, default = 4
953 | Number of decimal places the metrics in the score grid will be rounded to.
954 |
955 |
956 | n_iter: int, default = 10
957 | Number of iterations in the grid search. Increasing 'n_iter' may improve
958 | model performance but also increases the training time.
959 |
960 |
961 | custom_grid: dictionary, default = None
962 | To define custom search space for hyperparameters, pass a dictionary with
963 | parameter name and values to be iterated. Custom grids must be in a format
964 | supported by the defined ``search_library``.
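|
| For illustration, a minimal sketch tuning the 'alpha' of a Ridge model with
| the default scikit-learn search library:
|
| >>> ridge = creating_model('ridge')
| >>> tuned_ridge = tuning_model(ridge,
| ...     custom_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]})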
965 |
966 |
967 | optimize: str, default = 'R2'
968 | Metric name to be evaluated for hyperparameter tuning. It also accepts custom
969 | metrics that are added through the ``add_metric`` function.
970 |
971 |
972 | custom_scorer: object, default = None
973 | A custom scoring strategy can be passed to tune the hyperparameters of the model.
974 | It must be created using ``sklearn.make_scorer``. It is equivalent to adding a
975 | custom metric using the ``add_metric`` function and passing the name of the
976 | custom metric in the ``optimize`` parameter.
977 | Will be deprecated in a future release.
978 |
979 |
980 | search_library: str, default = 'scikit-learn'
981 | The search library used for tuning hyperparameters. Possible values:
982 |
983 | - 'scikit-learn' - default, requires no further installation
984 | https://github.com/scikit-learn/scikit-learn
985 |
986 | - 'scikit-optimize' - ``pip install scikit-optimize``
987 | https://scikit-optimize.github.io/stable/
988 |
989 | - 'tune-sklearn' - ``pip install tune-sklearn ray[tune]``
990 | https://github.com/ray-project/tune-sklearn
991 |
992 | - 'optuna' - ``pip install optuna``
993 | https://optuna.org/
994 |
995 |
996 | search_algorithm: str, default = None
997 | The search algorithm depends on the ``search_library`` parameter.
998 | Some search algorithms require additional libraries to be installed.
999 | If None, will use search library-specific default algorithm.
1000 |
1001 | - 'scikit-learn' possible values:
1002 | - 'random' : random grid search (default)
1003 | - 'grid' : grid search
1004 |
1005 | - 'scikit-optimize' possible values:
1006 | - 'bayesian' : Bayesian search (default)
1007 |
1008 | - 'tune-sklearn' possible values:
1009 | - 'random' : random grid search (default)
1010 | - 'grid' : grid search
1011 | - 'bayesian' : ``pip install scikit-optimize``
1012 | - 'hyperopt' : ``pip install hyperopt``
1013 | - 'optuna' : ``pip install optuna``
1014 | - 'bohb' : ``pip install hpbandster ConfigSpace``
1015 |
1016 | - 'optuna' possible values:
1017 | - 'random' : randomized search
1018 | - 'tpe' : Tree-structured Parzen Estimator search (default)
1019 |
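| For illustration, a minimal sketch using Optuna (assuming ``optuna`` is
| installed):
|
| >>> tuned_lr = tuning_model(lr, search_library = 'optuna',
| ...     search_algorithm = 'tpe', n_iter = 20)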
1020 |
1021 | early_stopping: bool or str or object, default = False
1022 | Use early stopping to stop fitting to a hyperparameter configuration
1023 | if it performs poorly. Ignored when ``search_library`` is scikit-learn,
1024 | or if the estimator does not have a 'partial_fit' attribute. If False or
1025 | None, early stopping will not be used. Can be either an object accepted
1026 | by the search library or one of the following:
1027 |
1028 | - 'asha' for Asynchronous Successive Halving Algorithm
1029 | - 'hyperband' for Hyperband
1030 | - 'median' for Median Stopping Rule
1032 |
1033 |
1034 | early_stopping_max_iters: int, default = 10
1035 | Maximum number of epochs to run for each sampled configuration.
1036 | Ignored if ``early_stopping`` is False or None.
1037 |
1038 |
1039 | choose_better: bool, default = False
1040 | When set to True, the better performing of the original and the tuned model
1041 | is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1042 |
1043 |
1044 | fit_kwargs: dict, default = {} (empty dict)
1045 | Dictionary of arguments passed to the fit method of the tuner.
1046 |
1047 |
1048 | groups: str or array-like, with shape (n_samples,), default = None
1049 | Optional group labels when GroupKFold is used for the cross validation.
1050 | It takes an array with shape (n_samples, ) where n_samples is the number
1051 | of rows in training dataset. When string is passed, it is interpreted as
1052 | the column name in the dataset containing group labels.
1053 |
1054 |
1055 | return_tuner: bool, default = False
1056 | When set to True, will return a tuple of (model, tuner_object).
1057 |
1058 |
1059 | verbose: bool, default = True
1060 | Score grid is not printed when verbose is set to False.
1061 |
1062 |
1063 | tuner_verbose: bool or int, default = True
1064 | If True or above 0, will print messages from the tuner. Higher values
1065 | print more messages. Ignored when ``verbose`` param is False.
1066 |
1067 |
1068 | **kwargs:
1069 | Additional keyword arguments to pass to the optimizer.
1070 |
1071 |
1072 | Returns:
1073 | Trained Model and Optional Tuner Object when ``return_tuner`` is True.
1074 |
1075 |
1076 | Warnings
1077 | --------
1078 | - Using 'grid' as ``search_algorithm`` may result in very long computation times.
1079 | It is only recommended with smaller search spaces that can be defined in the
1080 | ``custom_grid`` parameter.
1081 |
1082 | - ``search_library`` 'tune-sklearn' does not support GPU models.
1083 |
1084 | """
1085 |
1086 | return pycaret.internal.tabular.tune_model_supervised(
1087 | estimator=estimator,
1088 | fold=fold,
1089 | round=round,
1090 | n_iter=n_iter,
1091 | custom_grid=custom_grid,
1092 | optimize=optimize,
1093 | custom_scorer=custom_scorer,
1094 | search_library=search_library,
1095 | search_algorithm=search_algorithm,
1096 | early_stopping=early_stopping,
1097 | early_stopping_max_iters=early_stopping_max_iters,
1098 | choose_better=choose_better,
1099 | fit_kwargs=fit_kwargs,
1100 | groups=groups,
1101 | return_tuner=return_tuner,
1102 | verbose=verbose,
1103 | tuner_verbose=tuner_verbose,
1104 | **kwargs,
1105 | )
1106 |
1107 |
1108 | def ensemble_model(
1109 | estimator,
1110 | method: str = "Bagging",
1111 | fold: Optional[Union[int, Any]] = None,
1112 | n_estimators: int = 10,
1113 | round: int = 4,
1114 | choose_better: bool = False,
1115 | optimize: str = "R2",
1116 | fit_kwargs: Optional[dict] = None,
1117 | groups: Optional[Union[str, Any]] = None,
1118 | verbose: bool = True,
1119 | ) -> Any:
1120 |
1121 | """
1122 | This function ensembles a given estimator. The output of this function is
1123 | a score grid with CV scores by fold. Metrics evaluated during CV can be
1124 | accessed using the ``get_metrics`` function. Custom metrics can be added
1125 | or removed using ``add_metric`` and ``remove_metric`` function.
1126 |
1127 |
1128 | Example
1129 | --------
1130 | >>> from PyRapidML.datasets import extract_data
1131 | >>> boston = extract_data('boston')
1132 | >>> from PyRapidML.regression import *
1133 | >>> exp_name = initializer(data = boston, target = 'medv')
1134 | >>> dt = creating_model('dt')
1135 | >>> bagged_dt = ensemble_model(dt, method = 'Bagging')
1136 | >>> boosted_dt = ensemble_model(dt, method = 'Boosting')
1137 |
1138 |
1139 | estimator: scikit-learn compatible object
1140 | Trained model object
1141 |
1142 |
1143 | method: str, default = 'Bagging'
1144 | Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'.
1145 |
1146 |
1147 | fold: int or scikit-learn compatible CV generator, default = None
1148 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1149 | parameter of the ``setup`` function is used. When an integer is passed,
1150 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1151 | ``setup`` function.
1152 |
1153 |
1154 | n_estimators: int, default = 10
1155 | The number of base estimators in the ensemble. In case of perfect fit, the
1156 | learning procedure is stopped early.
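|
| For example, a minimal sketch increasing the ensemble size:
|
| >>> bagged_dt = ensemble_model(dt, method = 'Bagging', n_estimators = 25)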
1157 |
1158 |
1159 | round: int, default = 4
1160 | Number of decimal places the metrics in the score grid will be rounded to.
1161 |
1162 |
1163 | choose_better: bool, default = False
1164 | When set to True, the better performing of the original and the ensembled model
1165 | is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1166 |
1167 |
1168 | optimize: str, default = 'R2'
1169 | Metric to compare for model selection when ``choose_better`` is True.
1170 |
1171 |
1172 | fit_kwargs: dict, default = {} (empty dict)
1173 | Dictionary of arguments passed to the fit method of the model.
1174 |
1175 |
1176 | groups: str or array-like, with shape (n_samples,), default = None
1177 | Optional group labels when GroupKFold is used for the cross validation.
1178 | It takes an array with shape (n_samples, ) where n_samples is the number
1179 | of rows in training dataset. When string is passed, it is interpreted as
1180 | the column name in the dataset containing group labels.
1181 |
1182 |
1183 | verbose: bool, default = True
1184 | Score grid is not printed when verbose is set to False.
1185 |
1186 |
1187 | Returns:
1188 | Trained Model
1189 |
1190 | """
1191 |
1192 | return pycaret.internal.tabular.ensemble_model(
1193 | estimator=estimator,
1194 | method=method,
1195 | fold=fold,
1196 | n_estimators=n_estimators,
1197 | round=round,
1198 | choose_better=choose_better,
1199 | optimize=optimize,
1200 | fit_kwargs=fit_kwargs,
1201 | groups=groups,
1202 | verbose=verbose,
1203 | )
1204 |
1205 |
1206 | def blend_models(
1207 | estimator_list: list,
1208 | fold: Optional[Union[int, Any]] = None,
1209 | round: int = 4,
1210 | choose_better: bool = False,
1211 | optimize: str = "R2",
1212 | weights: Optional[List[float]] = None,
1213 | fit_kwargs: Optional[dict] = None,
1214 | groups: Optional[Union[str, Any]] = None,
1215 | verbose: bool = True,
1216 | ):
1217 |
1218 | """
1219 | This function trains a Voting Regressor for select models passed in the
1220 | ``estimator_list`` param. The output of this function is a score grid with
1221 | CV scores by fold. Metrics evaluated during CV can be accessed using the
1222 | ``get_metrics`` function. Custom metrics can be added or removed using the
1223 | ``add_metric`` and ``remove_metric`` functions.
1224 |
1225 |
1226 | Example
1227 | --------
1228 | >>> from PyRapidML.datasets import extract_data
1229 | >>> boston = extract_data('boston')
1230 | >>> from PyRapidML.regression import *
1231 | >>> exp_name = initializer(data = boston, target = 'medv')
1232 | >>> top3 = comparing_models(n_select = 3)
1233 | >>> blender = blend_models(top3)
1234 |
1235 |
1236 | estimator_list: list of scikit-learn compatible objects
1237 | List of trained model objects
1238 |
1239 |
1240 | fold: int or scikit-learn compatible CV generator, default = None
1241 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1242 | parameter of the ``setup`` function is used. When an integer is passed,
1243 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1244 | ``setup`` function.
1245 |
1246 |
1247 | round: int, default = 4
1248 | Number of decimal places the metrics in the score grid will be rounded to.
1249 |
1250 |
1251 | choose_better: bool, default = False
1252 | When set to True, the better performing of the individual models and the blended
1253 | model is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1254 |
1255 |
1256 | optimize: str, default = 'R2'
1257 | Metric to compare for model selection when ``choose_better`` is True.
1258 |
1259 |
1260 | weights: list, default = None
1261 | Sequence of weights (float or int) used to weight the predictions of the
1262 | individual models when averaging in the Voting Regressor. Uses uniform
1263 | weights when None.
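|
| For illustration, a minimal sketch weighting three blended models:
|
| >>> blender = blend_models(top3, weights = [0.5, 0.3, 0.2])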
1264 |
1265 |
1266 | fit_kwargs: dict, default = {} (empty dict)
1267 | Dictionary of arguments passed to the fit method of the model.
1268 |
1269 |
1270 | groups: str or array-like, with shape (n_samples,), default = None
1271 | Optional group labels when GroupKFold is used for the cross validation.
1272 | It takes an array with shape (n_samples, ) where n_samples is the number
1273 | of rows in training dataset. When string is passed, it is interpreted as
1274 | the column name in the dataset containing group labels.
1275 |
1276 |
1277 | verbose: bool, default = True
1278 | Score grid is not printed when verbose is set to False.
1279 |
1280 |
1281 | Returns:
1282 | Trained Model
1283 |
1284 |
1285 | """
1286 |
1287 | return pycaret.internal.tabular.blend_models(
1288 | estimator_list=estimator_list,
1289 | fold=fold,
1290 | round=round,
1291 | choose_better=choose_better,
1292 | optimize=optimize,
1293 | method="auto",
1294 | weights=weights,
1295 | fit_kwargs=fit_kwargs,
1296 | groups=groups,
1297 | verbose=verbose,
1298 | )
1299 |
1300 |
1301 | def stack_models(
1302 | estimator_list: list,
1303 | meta_model=None,
1304 | fold: Optional[Union[int, Any]] = None,
1305 | round: int = 4,
1306 | restack: bool = True,
1307 | choose_better: bool = False,
1308 | optimize: str = "R2",
1309 | fit_kwargs: Optional[dict] = None,
1310 | groups: Optional[Union[str, Any]] = None,
1311 | verbose: bool = True,
1312 | ):
1313 |
1314 | """
1315 | This function trains a meta model over select estimators passed in
1316 | the ``estimator_list`` parameter. The output of this function is a
1317 | score grid with CV scores by fold. Metrics evaluated during CV can
1318 | be accessed using the ``get_metrics`` function. Custom metrics
1319 | can be added or removed using the ``add_metric`` and ``remove_metric``
1320 | functions.
1321 |
1322 |
1323 | Example
1324 | --------
1325 | >>> from PyRapidML.datasets import extract_data
1326 | >>> boston = extract_data('boston')
1327 | >>> from PyRapidML.regression import *
1328 | >>> exp_name = initializer(data = boston, target = 'medv')
1329 | >>> top3 = comparing_models(n_select = 3)
1330 | >>> stacker = stack_models(top3)
1331 |
1332 |
1333 | estimator_list: list of scikit-learn compatible objects
1334 | List of trained model objects
1335 |
1336 |
1337 | meta_model: scikit-learn compatible object, default = None
1338 | When None, Linear Regression is trained as a meta model.
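|
| For illustration, a minimal sketch using a Ridge meta model:
|
| >>> ridge = creating_model('ridge')
| >>> stacker = stack_models(top3, meta_model = ridge)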
1339 |
1340 |
1341 | fold: int or scikit-learn compatible CV generator, default = None
1342 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1343 | parameter of the ``setup`` function is used. When an integer is passed,
1344 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1345 | ``setup`` function.
1346 |
1347 |
1348 | round: int, default = 4
1349 | Number of decimal places the metrics in the score grid will be rounded to.
1350 |
1351 |
1352 | restack: bool, default = True
1353 | When set to False, only the predictions of estimators will be used as
1354 | training data for the ``meta_model``.
1355 |
1356 |
1357 | choose_better: bool, default = False
1358 | When set to True, the better performing of the individual models and the stacked
1359 | model is returned. The metric used for comparison is defined by the ``optimize`` parameter.
1360 |
1361 |
1362 | optimize: str, default = 'R2'
1363 | Metric to compare for model selection when ``choose_better`` is True.
1364 |
1365 |
1366 | fit_kwargs: dict, default = {} (empty dict)
1367 | Dictionary of arguments passed to the fit method of the model.
1368 |
1369 |
1370 | groups: str or array-like, with shape (n_samples,), default = None
1371 | Optional group labels when GroupKFold is used for the cross validation.
1372 | It takes an array with shape (n_samples, ) where n_samples is the number
1373 | of rows in training dataset. When string is passed, it is interpreted as
1374 | the column name in the dataset containing group labels.
1375 |
1376 |
1377 | verbose: bool, default = True
1378 | Score grid is not printed when verbose is set to False.
1379 |
1380 |
1381 | Returns:
1382 | Trained Model
1383 |
1384 | """
1385 |
1386 | return pycaret.internal.tabular.stack_models(
1387 | estimator_list=estimator_list,
1388 | meta_model=meta_model,
1389 | fold=fold,
1390 | round=round,
1391 | method="auto",
1392 | restack=restack,
1393 | choose_better=choose_better,
1394 | optimize=optimize,
1395 | fit_kwargs=fit_kwargs,
1396 | groups=groups,
1397 | verbose=verbose,
1398 | )
1399 |
1400 |
1401 | def plot_model(
1402 | estimator,
1403 | plot: str = "residuals",
1404 | scale: float = 1,
1405 | save: bool = False,
1406 | fold: Optional[Union[int, Any]] = None,
1407 | fit_kwargs: Optional[dict] = None,
1408 | groups: Optional[Union[str, Any]] = None,
1409 | use_train_data: bool = False,
1410 | verbose: bool = True,
1411 | display_format: Optional[str] = None,
1412 | ) -> str:
1413 |
1414 | """
1415 | This function analyzes the performance of a trained model on the holdout
1416 | set. It may require re-training the model in certain cases.
1417 |
1418 |
1419 | Example
1420 | --------
1421 | >>> from PyRapidML.datasets import extract_data
1422 | >>> boston = extract_data('boston')
1423 | >>> from PyRapidML.regression import *
1424 | >>> exp_name = initializer(data = boston, target = 'medv')
1425 | >>> lr = creating_model('lr')
1426 | >>> plot_model(lr, plot = 'residuals')
1427 |
1428 |
1429 | estimator: scikit-learn compatible object
1430 | Trained model object
1431 |
1432 |
1433 | plot: str, default = 'residuals'
1434 | List of available plots (ID - Name):
1435 |
1436 | * 'residuals_interactive' - Interactive Residual plots
1437 | * 'residuals' - Residuals Plot
1438 | * 'error' - Prediction Error Plot
1439 | * 'cooks' - Cooks Distance Plot
1440 | * 'rfe' - Recursive Feature Selection
1441 | * 'learning' - Learning Curve
1442 | * 'vc' - Validation Curve
1443 | * 'manifold' - Manifold Learning
1444 | * 'feature' - Feature Importance
1445 | * 'feature_all' - Feature Importance (All)
1446 | * 'parameter' - Model Hyperparameter
1447 | * 'tree' - Decision Tree
1448 |
1449 |
1450 | scale: float, default = 1
1451 | The resolution scale of the figure.
1452 |
1453 |
1454 | save: bool, default = False
1455 | When set to True, plot is saved in the current working directory.
1456 |
1457 |
1458 | fold: int or scikit-learn compatible CV generator, default = None
1459 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1460 | parameter of the ``setup`` function is used. When an integer is passed,
1461 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1462 | ``setup`` function.
1463 |
1464 |
1465 | fit_kwargs: dict, default = {} (empty dict)
1466 | Dictionary of arguments passed to the fit method of the model.
1467 |
1468 |
1469 | groups: str or array-like, with shape (n_samples,), default = None
1470 | Optional group labels when GroupKFold is used for the cross validation.
1471 | It takes an array with shape (n_samples, ) where n_samples is the number
1472 | of rows in training dataset. When string is passed, it is interpreted as
1473 | the column name in the dataset containing group labels.
1474 |
1475 |
1476 | use_train_data: bool, default = False
1477 | When set to True, train data will be used for plots, instead
1478 | of test data.
1479 |
1480 |
1481 | verbose: bool, default = True
1482 | When set to False, progress bar is not displayed.
1483 |
1484 |
1485 | display_format: str, default = None
1486 | To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'.
1487 | Currently, not all plots are supported.
1488 |
1489 |
1490 | Returns:
1491 | None
1492 |
1493 | """
1494 |
1495 | return pycaret.internal.tabular.plot_model(
1496 | estimator=estimator,
1497 | plot=plot,
1498 | scale=scale,
1499 | save=save,
1500 | fold=fold,
1501 | fit_kwargs=fit_kwargs,
1502 | groups=groups,
1503 | verbose=verbose,
1504 | use_train_data=use_train_data,
1505 | system=True,
1506 | display_format=display_format,
1507 | )
1508 |
1509 |
1510 | def evaluate_model(
1511 | estimator,
1512 | fold: Optional[Union[int, Any]] = None,
1513 | fit_kwargs: Optional[dict] = None,
1514 | groups: Optional[Union[str, Any]] = None,
1515 | use_train_data: bool = False,
1516 | ):
1517 |
1518 | """
1519 | This function displays a user interface for analyzing performance of a trained
1520 | model. It calls the ``plot_model`` function internally.
1521 |
1522 | Example
1523 | --------
1524 | >>> from PyRapidML.datasets import extract_data
1525 | >>> boston = extract_data('boston')
1526 | >>> from PyRapidML.regression import *
1527 | >>> exp_name = initializer(data = boston, target = 'medv')
1528 | >>> lr = creating_model('lr')
1529 | >>> evaluate_model(lr)
1530 |
1531 |
1532 | estimator: scikit-learn compatible object
1533 | Trained model object
1534 |
1535 |
1536 | fold: int or scikit-learn compatible CV generator, default = None
1537 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1538 | parameter of the ``setup`` function is used. When an integer is passed,
1539 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1540 | ``setup`` function.
1541 |
1542 |
1543 | fit_kwargs: dict, default = {} (empty dict)
1544 | Dictionary of arguments passed to the fit method of the model.
1545 |
1546 |
1547 | groups: str or array-like, with shape (n_samples,), default = None
1548 | Optional group labels when GroupKFold is used for the cross validation.
1549 | It takes an array with shape (n_samples, ) where n_samples is the number
1550 | of rows in training dataset. When string is passed, it is interpreted as
1551 | the column name in the dataset containing group labels.
1552 |
1553 |
1554 | use_train_data: bool, default = False
1555 | When set to True, train data will be used for plots, instead
1556 | of test data.
1557 |
1558 |
1559 | Returns:
1560 | None
1561 |
1562 |
1563 | Warnings
1564 | --------
1565 | - This function only works in IPython enabled Notebook.
1566 |
1567 | """
1568 |
1569 | return pycaret.internal.tabular.evaluate_model(
1570 | estimator=estimator,
1571 | fold=fold,
1572 | fit_kwargs=fit_kwargs,
1573 | groups=groups,
1574 | use_train_data=use_train_data,
1575 | )
1576 |
1577 |
1578 | def interpret_model(
1579 | estimator,
1580 | plot: str = "summary",
1581 | feature: Optional[str] = None,
1582 | observation: Optional[int] = None,
1583 | use_train_data: bool = False,
1584 | X_new_sample: Optional[pd.DataFrame] = None,
1585 | save: bool = False,
1586 | **kwargs,
1587 | ):
1588 |
1589 | """
1590 | This function analyzes the predictions generated from a tree-based model. It is
1591 | implemented based on SHAP (SHapley Additive exPlanations). For more info,
1592 | please see https://shap.readthedocs.io/en/latest/
1593 |
1594 |
1595 | Example
1596 | --------
1597 | >>> from PyRapidML.datasets import extract_data
1598 | >>> boston = extract_data('boston')
1599 | >>> from PyRapidML.regression import *
1600 | >>> exp_name = initializer(data = boston, target = 'medv')
1601 | >>> xgboost = creating_model('xgboost')
1602 | >>> interpret_model(xgboost)
1603 |
1604 |
1605 | estimator: scikit-learn compatible object
1606 | Trained model object
1607 |
1608 |
1609 | plot: str, default = 'summary'
1610 | Type of plot. Available options are: 'summary', 'correlation', and 'reason'.
1611 |
1612 |
1613 | feature: str, default = None
1614 | Feature to check correlation with. This parameter is only required when ``plot``
1615 | type is 'correlation'. When set to None, it uses the first column in the train
1616 | dataset.
1617 |
1618 |
1619 | observation: int, default = None
1620 | Observation index number in holdout set to explain. When ``plot`` is not
1621 | 'reason', this parameter is ignored.
1622 |
1623 |
1624 | use_train_data: bool, default = False
1625 | When set to True, train data will be used for plots, instead
1626 | of test data.
1627 |
1628 |
1629 | X_new_sample: pd.DataFrame, default = None
1630 | Row from an out-of-sample dataframe (neither train nor test data) to be plotted.
1631 | The sample must have the same columns as the raw input data, and it is transformed
1632 | by the preprocessing pipeline automatically before plotting.
1633 |
1634 |
1635 | save: bool, default = False
1636 | When set to True, the plot is saved as a 'png' file in the current working directory.
1637 |
1638 |
1639 | **kwargs:
1640 | Additional keyword arguments to pass to the plot.
1641 |
1642 |
1643 | Returns:
1644 | None
1645 |
1646 | """
1647 |
1648 | return pycaret.internal.tabular.interpret_model(
1649 | estimator=estimator,
1650 | plot=plot,
1651 | feature=feature,
1652 | observation=observation,
1653 | use_train_data=use_train_data,
1654 | X_new_sample=X_new_sample,
1655 | save=save,
1656 | **kwargs,
1657 | )
1658 |
1659 |
1660 | def predict_model(
1661 | estimator,
1662 | data: Optional[pd.DataFrame] = None,
1663 | round: int = 4,
1664 | verbose: bool = True,
1665 | ) -> pd.DataFrame:
1666 |
1667 | """
1668 | This function predicts ``Label`` using a trained model. When ``data`` is
1669 | None, it predicts label on the holdout set.
1670 |
1671 |
1672 | Example
1673 | -------
1674 | >>> from PyRapidML.datasets import extract_data
1675 | >>> boston = extract_data('boston')
1676 | >>> from PyRapidML.regression import *
1677 | >>> exp_name = initializer(data = boston, target = 'medv')
1678 | >>> lr = creating_model('lr')
1679 | >>> pred_holdout = predict_model(lr)
1680 | >>> pred_unseen = predict_model(lr, data = unseen_dataframe)
1681 |
1682 |
1683 | estimator: scikit-learn compatible object
1684 | Trained model object
1685 |
1686 |
1687 | data: pandas.DataFrame, default = None
1688 | Shape (n_samples, n_features). All features used during training must be
1689 | available in the unseen dataset. When None, predictions are made on the holdout set.
1690 |
1691 |
1692 | round: int, default = 4
1693 | Number of decimal places to round predictions to.
1694 |
1695 |
1696 | verbose: bool, default = True
1697 | When set to False, holdout score grid is not printed.
1698 |
1699 |
1700 | Returns:
1701 | pandas.DataFrame
1702 |
1703 |
1704 | Warnings
1705 | --------
1706 | - The behavior of ``predict_model`` changed in version 2.1 without backward
1707 | compatibility. As such, pipelines trained using versions <= 2.0 may not
1708 | work for inference with versions >= 2.1. You can either retrain your models with a
1709 | newer version or downgrade the version for inference.
1710 |
1711 |
1712 | """
1713 |
1714 | return pycaret.internal.tabular.predict_model(
1715 | estimator=estimator,
1716 | data=data,
1717 | probability_threshold=None,
1718 | encoded_labels=True,
1719 | round=round,
1720 | verbose=verbose,
1721 | ml_usecase=MLUsecase.REGRESSION,
1722 | )
1723 |
1724 |
1725 | def finalize_model(
1726 | estimator,
1727 | fit_kwargs: Optional[dict] = None,
1728 | groups: Optional[Union[str, Any]] = None,
1729 | model_only: bool = True,
1730 | ) -> Any:
1731 |
1732 | """
1733 | This function trains a given estimator on the entire dataset including the
1734 | holdout set.
1735 |
1736 |
1737 | Example
1738 | --------
1739 | >>> from PyRapidML.datasets import extract_data
1740 | >>> boston = extract_data('boston')
1741 | >>> from PyRapidML.regression import *
1742 | >>> exp_name = initializer(data = boston, target = 'medv')
1743 | >>> lr = creating_model('lr')
1744 | >>> final_lr = finalize_model(lr)
1745 |
1746 |
1747 | estimator: scikit-learn compatible object
1748 | Trained model object
1749 |
1750 |
1751 | fit_kwargs: dict, default = {} (empty dict)
1752 | Dictionary of arguments passed to the fit method of the model.
1753 |
1754 |
1755 | groups: str or array-like, with shape (n_samples,), default = None
1756 | Optional group labels when GroupKFold is used for the cross validation.
1757 | It takes an array with shape (n_samples, ) where n_samples is the number
1758 | of rows in training dataset. When string is passed, it is interpreted as
1759 | the column name in the dataset containing group labels.
1760 |
1761 |
1762 | model_only: bool, default = True
1763 | When set to True, only the re-trained model object is returned. When set to
1764 | False, the entire pipeline, including the transformations, is returned.
1765 |
1766 |
1767 | Returns:
1768 | Trained Model
1769 |
1770 |
1771 | """
1772 |
1773 | return pycaret.internal.tabular.finalize_model(
1774 | estimator=estimator,
1775 | fit_kwargs=fit_kwargs,
1776 | groups=groups,
1777 | model_only=model_only,
1778 | )
1779 |
1780 |
1781 | def deploy_model(
1782 | model, model_name: str, authentication: dict, platform: str = "aws",
1783 | ):
1784 |
1785 | """
1786 | This function deploys the transformation pipeline and trained model on cloud.
1787 |
1788 |
1789 | Example
1790 | -------
1791 | >>> from PyRapidML.datasets import extract_data
1792 | >>> boston = extract_data('boston')
1793 | >>> from PyRapidML.regression import *
1794 | >>> exp_name = initializer(data = boston, target = 'medv')
1795 | >>> lr = creating_model('lr')
1796 | >>> deploy_model(model = lr, model_name = 'lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})
1797 |
1798 |
1799 | Amazon Web Service (AWS) users:
1800 | To deploy a model on AWS S3 ('aws'), environment variables must be set in your
1801 | local environment. To configure AWS environment variables, type ``aws configure``
1802 | in the command line. The following information from the IAM portal of your Amazon
1803 | console account is required:
1804 |
1805 | - AWS Access Key ID
1806 | - AWS Secret Access Key
1807 | - Default Region Name (can be seen under Global settings on your AWS console)
1808 |
1809 | More info: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html
1810 |
1811 |
1812 | Google Cloud Platform (GCP) users:
1813 | To deploy a model on Google Cloud Platform ('gcp'), a project must be created
1814 | using the command line or the GCP console. Once the project is created, you must
1815 | create a service account and download the service account key as a JSON file to
1816 | set environment variables in your local environment.
1817 |
1818 | More info: https://cloud.google.com/docs/authentication/production
1819 |
1820 |
1821 | Microsoft Azure (Azure) users:
1822 | To deploy a model on Microsoft Azure ('azure'), the environment variable for the
1823 | connection string must be set in your local environment. Go to the settings of the
1824 | storage account on the Azure portal to access the required connection string.
1825 |
1826 | More info: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?toc=%2Fpython%2Fazure%2FTOC.json
1827 |
1828 |
1829 | model: scikit-learn compatible object
1830 | Trained model object
1831 |
1832 |
1833 | model_name: str
1834 | Name of model.
1835 |
1836 |
1837 | authentication: dict
1838 | Dictionary of applicable authentication tokens.
1839 |
1840 | When platform = 'aws':
1841 | {'bucket' : 'S3-bucket-name'}
1842 |
1843 | When platform = 'gcp':
1844 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
1845 |
1846 | When platform = 'azure':
1847 | {'container': 'azure-container-name'}
1848 |
1849 |
1850 | platform: str, default = 'aws'
1851 | Name of the platform. Currently supported platforms: 'aws', 'gcp' and 'azure'.
1852 |
1853 |
1854 | Returns:
1855 | None
1856 |
1857 | """
1858 |
1859 | return pycaret.internal.tabular.deploy_model(
1860 | model=model,
1861 | model_name=model_name,
1862 | authentication=authentication,
1863 | platform=platform,
1864 | )
1865 |
1866 |
1867 | def save_model(
1868 | model, model_name: str, model_only: bool = False, verbose: bool = True, **kwargs
1869 | ):
1870 |
1871 | """
1872 | This function saves the transformation pipeline and trained model object
1873 | into the current working directory as a pickle file for later use.
1874 |
1875 | Example
1876 | -------
1877 | >>> from PyRapidML.datasets import extract_data
1878 | >>> boston = extract_data('boston')
1879 | >>> from PyRapidML.regression import *
1880 | >>> exp_name = initializer(data = boston, target = 'medv')
1881 | >>> lr = creating_model('lr')
1882 | >>> save_model(lr, 'saved_lr_model')
1883 |
1884 |
1885 | model: scikit-learn compatible object
1886 | Trained model object
1887 |
1888 |
1889 | model_name: str
1890 | Name of the model.
1891 |
1892 |
1893 | model_only: bool, default = False
1894 | When set to True, only the trained model object is saved instead of the
1895 | entire pipeline.
1896 |
1897 |
1898 | **kwargs:
1899 | Additional keyword arguments to pass to joblib.dump().
1900 |
1901 |
1902 | verbose: bool, default = True
1903 | Success message is not printed when verbose is set to False.
1904 |
1905 |
1906 | Returns:
1907 | Tuple of the model object and the filename.
1908 |
1909 | """
1910 |
1911 | return pycaret.internal.tabular.save_model(
1912 | model=model,
1913 | model_name=model_name,
1914 | model_only=model_only,
1915 | verbose=verbose,
1916 | **kwargs,
1917 | )
1918 |
1919 |
1920 | def load_model(
1921 | model_name,
1922 | platform: Optional[str] = None,
1923 | authentication: Optional[Dict[str, str]] = None,
1924 | verbose: bool = True,
1925 | ):
1926 |
1927 | """
1928 | This function loads a previously saved pipeline.
1929 |
1930 | Example
1931 | -------
1932 | >>> from PyRapidML.regression import load_model
1933 | >>> saved_lr = load_model('saved_lr_model')
1934 |
1935 |
1936 | model_name: str
1937 | Name of the model.
1938 |
1939 |
1940 | platform: str, default = None
1941 | Name of the cloud platform. Currently supported platforms:
1942 | 'aws', 'gcp' and 'azure'.
1943 |
1944 |
1945 | authentication: dict, default = None
1946 | Dictionary of applicable authentication tokens.
1947 |
1948 | When platform = 'aws':
1949 | {'bucket' : 'S3-bucket-name'}
1950 |
1951 | When platform = 'gcp':
1952 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
1953 |
1954 | When platform = 'azure':
1955 | {'container': 'azure-container-name'}
1956 |
1957 |
1958 | verbose: bool, default = True
1959 | Success message is not printed when verbose is set to False.
1960 |
1961 |
1962 | Returns:
1963 | Trained Model
1964 |
1965 | """
1966 |
1967 | return pycaret.internal.tabular.load_model(
1968 | model_name=model_name,
1969 | platform=platform,
1970 | authentication=authentication,
1971 | verbose=verbose,
1972 | )
1973 |
1974 |
1975 | def automl(optimize: str = "R2", use_holdout: bool = False) -> Any:
1976 |
1977 | """
1978 | This function returns the best model out of all trained models in
1979 | the current session based on the ``optimize`` parameter. Metrics
1980 | evaluated can be accessed using the ``get_metrics`` function.
1981 |
1982 |
1983 | Example
1984 | -------
1985 | >>> from PyRapidML.datasets import extract_data
1986 | >>> boston = extract_data('boston')
1987 | >>> from PyRapidML.regression import *
1988 | >>> exp_name = initializer(data = boston, target = 'medv')
1989 | >>> top3 = comparing_models(n_select = 3)
1990 | >>> tuned_top3 = [tuning_model(i) for i in top3]
1991 | >>> blender = blend_models(tuned_top3)
1992 | >>> stacker = stack_models(tuned_top3)
1993 | >>> best_mae_model = automl(optimize = 'MAE')
1994 |
1995 |
1996 | optimize: str, default = 'R2'
1997 | Metric to use for model selection. It also accepts custom metrics
1998 | added using the ``add_metric`` function.
1999 |
2000 |
2001 | use_holdout: bool, default = False
2002 | When set to True, metrics are evaluated on holdout set instead of CV.
2003 |
2004 |
2005 | Returns:
2006 | Trained Model
2007 |
2008 |
2009 | """
2010 |
2011 | return pycaret.internal.tabular.automl(optimize=optimize, use_holdout=use_holdout)
2012 |
2013 |
2014 | def pull(pop: bool = False) -> pd.DataFrame:
2015 | """
2016 | Returns the last printed score grid. Use the ``pull`` function after
2017 | any training function to store the score grid as a pandas.DataFrame.
2018 |
2019 |
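| Example
| -------
| A minimal usage sketch, mirroring the other examples in this module:
|
| >>> from PyRapidML.datasets import extract_data
| >>> boston = extract_data('boston')
| >>> from PyRapidML.regression import *
| >>> exp_name = initializer(data = boston, target = 'medv')
| >>> best_model = comparing_models()
| >>> results = pull()
|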
2020 | pop: bool, default = False
2021 | If True, will pop (remove) the returned dataframe from the
2022 | display container.
2023 |
2024 |
2025 | Returns:
2026 | pandas.DataFrame
2027 |
2028 | """
2029 | return pycaret.internal.tabular.pull(pop=pop)
2030 |
2031 |
2032 | def models(
2033 | type: Optional[str] = None, internal: bool = False, raise_errors: bool = True,
2034 | ) -> pd.DataFrame:
2035 |
2036 | """
2037 | Returns table of models available in the model library.
2038 |
2039 | Example
2040 | -------
2041 | >>> from PyRapidML.datasets import extract_data
2042 | >>> boston = extract_data('boston')
2043 | >>> from PyRapidML.regression import *
2044 | >>> exp_name = initializer(data = boston, target = 'medv')
2045 | >>> all_models = models()
2046 |
2047 |
2048 | type: str, default = None
2049 | - linear : filters and only returns linear models
2050 | - tree : filters and only returns tree-based models
2051 | - ensemble : filters and only returns ensemble models
2052 |
2053 |
2054 | internal: bool, default = False
2055 | When True, will return extra columns and rows used internally.
2056 |
2057 |
2058 | raise_errors: bool, default = True
2059 | When False, will suppress all exceptions, ignoring models
2060 | that couldn't be created.
2061 |
2062 |
2063 | Returns:
2064 | pandas.DataFrame
2065 |
2066 | """
2067 | return pycaret.internal.tabular.models(
2068 | type=type, internal=internal, raise_errors=raise_errors
2069 | )
2070 |
2071 |
2072 | def get_metrics(
2073 | reset: bool = False, include_custom: bool = True, raise_errors: bool = True,
2074 | ) -> pd.DataFrame:
2075 |
2076 | """
2077 | Returns table of available metrics used for CV.
2078 |
2079 |
2080 | Example
2081 | -------
2082 | >>> from PyRapidML.datasets import extract_data
2083 | >>> boston = extract_data('boston')
2084 | >>> from PyRapidML.regression import *
2085 | >>> exp_name = initializer(data = boston, target = 'medv')
2086 | >>> all_metrics = get_metrics()
2087 |
2088 |
2089 | reset: bool, default = False
2090 | When True, will reset all changes made using the ``add_metric``
2091 | and ``remove_metric`` function.
2092 |
2093 |
2094 | include_custom: bool, default = True
2095 | Whether to include user added (custom) metrics or not.
2096 |
2097 |
2098 | raise_errors: bool, default = True
2099 | If False, will suppress all exceptions, ignoring models that
2100 | couldn't be created.
2101 |
2102 |
2103 | Returns:
2104 | pandas.DataFrame
2105 |
2106 | """
2107 |
2108 | return pycaret.internal.tabular.get_metrics(
2109 | reset=reset, include_custom=include_custom, raise_errors=raise_errors,
2110 | )
2111 |
2112 |
2113 | def add_metric(
2114 | id: str, name: str, score_func: type, greater_is_better: bool = True, **kwargs,
2115 | ) -> pd.Series:
2116 |
2117 | """
2118 | Adds a custom metric to be used for CV.
2119 |
2120 |
2121 | Example
2122 | -------
2123 | >>> from PyRapidML.datasets import extract_data
2124 | >>> boston = extract_data('boston')
2125 | >>> from PyRapidML.regression import *
2126 | >>> exp_name = initializer(data = boston, target = 'medv')
2127 | >>> from sklearn.metrics import explained_variance_score
2128 | >>> add_metric('evs', 'EVS', explained_variance_score)
2129 |
2130 |
2131 | id: str
2132 | Unique id for the metric.
2133 |
2134 |
2135 | name: str
2136 | Display name of the metric.
2137 |
2138 |
2139 | score_func: type
2140 | Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``.
2141 |
2142 |
2143 | greater_is_better: bool, default = True
2144 | Whether ``score_func`` is higher the better or not.
2145 |
2146 |
2147 | **kwargs:
2148 | Arguments to be passed to the score function.
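|
| For illustration, a minimal sketch adding a lower-is-better metric using
| scikit-learn's ``max_error``:
|
| >>> from sklearn.metrics import max_error
| >>> add_metric('me', 'Max Error', max_error, greater_is_better = False)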
2149 |
2150 |
2151 | Returns:
2152 | pandas.Series
2153 |
2154 | """
2155 |
2156 | return pycaret.internal.tabular.add_metric(
2157 | id=id,
2158 | name=name,
2159 | score_func=score_func,
2160 | target="pred",
2161 | greater_is_better=greater_is_better,
2162 | **kwargs,
2163 | )
2164 |
2165 |
2166 | def remove_metric(name_or_id: str):
2167 |
2168 | """
2169 | Removes a metric from CV.
2170 |
2171 |
2172 | Example
2173 | -------
2174 | >>> from PyRapidML.datasets import extract_data
2175 | >>> boston = extract_data('boston')
2176 | >>> from PyRapidML.regression import *
2177 | >>> exp_name = initializer(data = boston, target = 'medv')
2178 | >>> remove_metric('MAPE')
2179 |
2180 |
2181 | name_or_id: str
2182 | Display name or ID of the metric.
2183 |
2184 |
2185 | Returns:
2186 | None
2187 |
2188 | """
2189 | return pycaret.internal.tabular.remove_metric(name_or_id=name_or_id)
2190 |
2191 |
2192 | def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame:
2193 |
2194 | """
2195 | Returns a table of experiment logs. Only works when ``log_experiment``
2196 | is True when initializing the ``setup`` function.
2197 |
2198 |
2199 | Example
2200 | -------
2201 | >>> from PyRapidML.datasets import extract_data
2202 | >>> boston = extract_data('boston')
2203 | >>> from PyRapidML.regression import *
2204 | >>> exp_name = initializer(data = boston, target = 'medv', log_experiment = True)
2205 | >>> best = comparing_models()
2206 | >>> exp_logs = get_logs()
2207 |
2208 |
2209 | experiment_name: str, default = None
2210 | When None, the current active run is used.
2211 |
2212 |
2213 | save: bool, default = False
2214 | When set to True, a CSV file is saved in the current working directory.
2215 |
2216 |
2217 | Returns:
2218 | pandas.DataFrame
2219 |
2220 | """
2221 |
2222 | return pycaret.internal.tabular.get_logs(experiment_name=experiment_name, save=save)
2223 |
2224 |
2225 | def get_config(variable: str):
2226 |
2227 | """
2228 | This function retrieves the global variables created when initializing the
2229 | ``setup`` function. The following variables are accessible:
2230 |
2231 | - X: Transformed dataset (X)
2232 | - y: Transformed dataset (y)
2233 | - X_train: Transformed train dataset (X)
2234 | - X_test: Transformed test/holdout dataset (X)
2235 | - y_train: Transformed train dataset (y)
2236 | - y_test: Transformed test/holdout dataset (y)
2237 | - seed: random state set through session_id
2238 | - prep_pipe: Transformation pipeline
2239 | - fold_shuffle_param: shuffle parameter used in Kfolds
2240 | - n_jobs_param: n_jobs parameter used in model training
2241 | - html_param: html_param configured through setup
2242 | - create_model_container: results grid storage container
2243 | - master_model_container: model storage container
2244 | - display_container: results display container
2245 | - exp_name_log: Name of experiment
2246 | - logging_param: log_experiment param
2247 | - log_plots_param: log_plots param
2248 | - USI: Unique session ID parameter
2249 | - fix_imbalance_param: fix_imbalance param
2250 | - fix_imbalance_method_param: fix_imbalance_method param
2251 | - data_before_preprocess: data before preprocessing
2252 | - target_param: name of target variable
2253 | - gpu_param: use_gpu param configured through setup
2254 | - fold_generator: CV splitter configured in fold_strategy
2255 | - fold_param: fold params defined in the setup
2256 | - fold_groups_param: fold groups defined in the setup
2257 | - stratify_param: stratify parameter defined in the setup
2258 | - transform_target_param: transform_target_param in setup
2259 | - transform_target_method_param: transform_target_method_param in setup
2260 |
2261 |
2262 | Example
2263 | -------
2264 |     >>> from PyRapidML.datasets import extract_data
2265 | >>> boston = extract_data('boston')
2266 | >>> from PyRapidML.regression import *
2267 | >>> exp_name = initializer(data = boston, target = 'medv')
2268 | >>> X_train = get_config('X_train')
2269 |
2270 |
2271 | Returns:
2272 | Global variable
2273 |
2274 |
2275 | """
2276 |
2277 | return pycaret.internal.tabular.get_config(variable=variable)
2278 |
2279 |
2280 | def set_config(variable: str, value):
2281 |
2282 | """
2283 |     This function resets the global variables. The following variables are
2284 | accessible:
2285 |
2286 | - X: Transformed dataset (X)
2287 | - y: Transformed dataset (y)
2288 | - X_train: Transformed train dataset (X)
2289 | - X_test: Transformed test/holdout dataset (X)
2290 | - y_train: Transformed train dataset (y)
2291 | - y_test: Transformed test/holdout dataset (y)
2292 | - seed: random state set through session_id
2293 | - prep_pipe: Transformation pipeline
2294 | - fold_shuffle_param: shuffle parameter used in Kfolds
2295 | - n_jobs_param: n_jobs parameter used in model training
2296 | - html_param: html_param configured through setup
2297 | - create_model_container: results grid storage container
2298 | - master_model_container: model storage container
2299 | - display_container: results display container
2300 | - exp_name_log: Name of experiment
2301 | - logging_param: log_experiment param
2302 | - log_plots_param: log_plots param
2303 | - USI: Unique session ID parameter
2304 | - fix_imbalance_param: fix_imbalance param
2305 | - fix_imbalance_method_param: fix_imbalance_method param
2306 | - data_before_preprocess: data before preprocessing
2307 | - target_param: name of target variable
2308 | - gpu_param: use_gpu param configured through setup
2309 | - fold_generator: CV splitter configured in fold_strategy
2310 | - fold_param: fold params defined in the setup
2311 | - fold_groups_param: fold groups defined in the setup
2312 | - stratify_param: stratify parameter defined in the setup
2313 | - transform_target_param: transform_target_param in setup
2314 | - transform_target_method_param: transform_target_method_param in setup
2315 |
2316 |
2317 | Example
2318 | -------
2319 |     >>> from PyRapidML.datasets import extract_data
2320 | >>> boston = extract_data('boston')
2321 | >>> from PyRapidML.regression import *
2322 | >>> exp_name = initializer(data = boston, target = 'medv')
2323 | >>> set_config('seed', 123)
2324 |
2325 |
2326 | Returns:
2327 | None
2328 |
2329 | """
2330 |
2331 | return pycaret.internal.tabular.set_config(variable=variable, value=value)
2332 |
2333 |
2334 | def save_config(file_name: str):
2335 |
2336 | """
2337 |     This function saves all global variables to a pickle file, allowing you to
2338 |     resume a session later without rerunning the ``setup``.
2339 |
2340 |
2341 | Example
2342 | -------
2343 |     >>> from PyRapidML.datasets import extract_data
2344 | >>> boston = extract_data('boston')
2345 | >>> from PyRapidML.regression import *
2346 | >>> exp_name = initializer(data = boston, target = 'medv')
2347 | >>> save_config('myvars.pkl')
2348 |
2349 |
2350 | Returns:
2351 | None
2352 |
2353 | """
2354 |
2355 | return pycaret.internal.tabular.save_config(file_name=file_name)
2356 |
2357 |
2358 | def load_config(file_name: str):
2359 |
2360 | """
2361 | This function loads global variables from a pickle file into Python
2362 | environment.
2363 |
2364 |
2365 | Example
2366 | -------
2367 | >>> from PyRapidML.regression import load_config
2368 | >>> load_config('myvars.pkl')
2369 |
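    Once loaded, the restored variables can be inspected with ``get_config``
    (a hedged sketch, assuming the file was created earlier with ``save_config``):

    >>> from PyRapidML.regression import get_config
    >>> X_train = get_config('X_train')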
2370 |
2371 | Returns:
2372 | Global variables
2373 |
2374 | """
2375 |
2376 | return pycaret.internal.tabular.load_config(file_name=file_name)
2377 |
2378 |
2379 |
--------------------------------------------------------------------------------
/PyRapidML/classification.py:
--------------------------------------------------------------------------------
1 | # Module: Classification
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 05/06/2021
6 |
7 |
8 | import pandas as pd
9 | import numpy as np
10 |
11 | import pycaret.internal.tabular
12 | from pycaret.internal.Display import Display, is_in_colab, enable_colab
13 | from typing import List, Tuple, Any, Union, Optional, Dict
14 | import warnings
15 | from IPython.utils import io
16 | import traceback
17 |
18 | from pycaret.internal.tabular import MLUsecase
19 |
20 | warnings.filterwarnings("ignore")
21 |
22 |
23 | def initializer(
24 | data: pd.DataFrame,
25 | target: str,
26 | train_size: float = 0.7,
27 | test_data: Optional[pd.DataFrame] = None,
28 | preprocess: bool = True,
29 | imputation_type: str = "simple",
30 | iterative_imputation_iters: int = 5,
31 | categorical_features: Optional[List[str]] = None,
32 | categorical_imputation: str = "constant",
33 | categorical_iterative_imputer: Union[str, Any] = "lightgbm",
34 | ordinal_features: Optional[Dict[str, list]] = None,
35 | high_cardinality_features: Optional[List[str]] = None,
36 | high_cardinality_method: str = "frequency",
37 | numeric_features: Optional[List[str]] = None,
38 | numeric_imputation: str = "mean",
39 | numeric_iterative_imputer: Union[str, Any] = "lightgbm",
40 | date_features: Optional[List[str]] = None,
41 | ignore_features: Optional[List[str]] = None,
42 | normalize: bool = False,
43 | normalize_method: str = "zscore",
44 | transformation: bool = False,
45 | transformation_method: str = "yeo-johnson",
46 | handle_unknown_categorical: bool = True,
47 | unknown_categorical_method: str = "least_frequent",
48 | pca: bool = False,
49 | pca_method: str = "linear",
50 | pca_components: Optional[float] = None,
51 | ignore_low_variance: bool = False,
52 | combine_rare_levels: bool = False,
53 | rare_level_threshold: float = 0.10,
54 | bin_numeric_features: Optional[List[str]] = None,
55 | remove_outliers: bool = False,
56 | outliers_threshold: float = 0.05,
57 | remove_multicollinearity: bool = False,
58 | multicollinearity_threshold: float = 0.9,
59 | remove_perfect_collinearity: bool = True,
60 | create_clusters: bool = False,
61 | cluster_iter: int = 20,
62 | polynomial_features: bool = False,
63 | polynomial_degree: int = 2,
64 | trigonometry_features: bool = False,
65 | polynomial_threshold: float = 0.1,
66 | group_features: Optional[List[str]] = None,
67 | group_names: Optional[List[str]] = None,
68 | feature_selection: bool = False,
69 | feature_selection_threshold: float = 0.8,
70 | feature_selection_method: str = "classic",
71 | feature_interaction: bool = False,
72 | feature_ratio: bool = False,
73 | interaction_threshold: float = 0.01,
74 | fix_imbalance: bool = False,
75 | fix_imbalance_method: Optional[Any] = None,
76 | data_split_shuffle: bool = True,
77 | data_split_stratify: Union[bool, List[str]] = False,
78 | fold_strategy: Union[str, Any] = "stratifiedkfold",
79 | fold: int = 10,
80 | fold_shuffle: bool = False,
81 | fold_groups: Optional[Union[str, pd.DataFrame]] = None,
82 | n_jobs: Optional[int] = -1,
83 | use_gpu: bool = False,
84 | custom_pipeline: Union[
85 | Any, Tuple[str, Any], List[Any], List[Tuple[str, Any]]
86 | ] = None,
87 | html: bool = True,
88 | session_id: Optional[int] = None,
89 | log_experiment: bool = False,
90 | experiment_name: Optional[str] = None,
91 | log_plots: Union[bool, list] = False,
92 | log_profile: bool = False,
93 | log_data: bool = False,
94 | silent: bool = False,
95 | verbose: bool = True,
96 | profile: bool = False,
97 | profile_kwargs: Dict[str, Any] = None,
98 | ):
99 |
100 | """
101 | This function initializes the training environment and creates the transformation
102 |     pipeline. It must be called before executing any other function. It takes
103 | two mandatory parameters: ``data`` and ``target``. All the other parameters are
104 | optional.
105 |
106 | Example
107 | -------
108 |     >>> from PyRapidML.datasets import extract_data
109 | >>> juice = extract_data('juice')
110 | >>> from PyRapidML.classification import *
111 | >>> exp_name = initializer(data = juice, target = 'Purchase')
112 |
113 |
114 | data: pandas.DataFrame
115 | Shape (n_samples, n_features), where n_samples is the number of samples and
116 | n_features is the number of features.
117 |
118 |
119 | target: str
120 | Name of the target column to be passed in as a string. The target variable can
121 | be either binary or multiclass.
122 |
123 |
124 | train_size: float, default = 0.7
125 | Proportion of the dataset to be used for training and validation. Should be
126 | between 0.0 and 1.0.
127 |
128 |
129 | test_data: pandas.DataFrame, default = None
130 | If not None, test_data is used as a hold-out set and ``train_size`` parameter is
131 | ignored. test_data must be labelled and the shape of data and test_data must
132 | match.
133 |
134 |
135 | preprocess: bool, default = True
136 | When set to False, no transformations are applied except for train_test_split
137 | and custom transformations passed in ``custom_pipeline`` param. Data must be
138 | ready for modeling (no missing values, no dates, categorical data encoding),
139 | when preprocess is set to False.
140 |
141 |
142 | imputation_type: str, default = 'simple'
143 | The type of imputation to use. Can be either 'simple' or 'iterative'.
144 |
145 |
146 | iterative_imputation_iters: int, default = 5
147 | Number of iterations. Ignored when ``imputation_type`` is not 'iterative'.
148 |
149 |
150 | categorical_features: list of str, default = None
151 | If the inferred data types are not correct or the silent param is set to True,
152 | categorical_features param can be used to overwrite or define the data types.
153 | It takes a list of strings with column names that are categorical.
154 |
155 |
156 | categorical_imputation: str, default = 'constant'
157 | Missing values in categorical features are imputed with a constant 'not_available'
158 | value. The other available option is 'mode'.
159 |
160 |
161 | categorical_iterative_imputer: str, default = 'lightgbm'
162 | Estimator for iterative imputation of missing values in categorical features.
163 | Ignored when ``imputation_type`` is not 'iterative'.
164 |
165 |
166 | ordinal_features: dict, default = None
167 | Encode categorical features as ordinal. For example, a categorical feature with
168 | 'low', 'medium', 'high' values where low < medium < high can be passed as
169 | ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
170 |
171 |
172 | high_cardinality_features: list of str, default = None
173 |         When a categorical feature contains many levels, it can be compressed into fewer
174 | levels using this parameter. It takes a list of strings with column names that
175 | are categorical.
176 |
177 |
178 | high_cardinality_method: str, default = 'frequency'
179 | Categorical features with high cardinality are replaced with the frequency of
180 |         values in each level occurring in the training dataset. The other available method
181 | is 'clustering' which trains the K-Means clustering algorithm on the statistical
182 | attribute of the training data and replaces the original value of feature with the
183 | cluster label. The number of clusters is determined by optimizing Calinski-Harabasz
184 | and Silhouette criterion.
185 |
186 |
187 | numeric_features: list of str, default = None
188 | If the inferred data types are not correct or the silent param is set to True,
189 | numeric_features param can be used to overwrite or define the data types.
190 | It takes a list of strings with column names that are numeric.
191 |
192 |
193 | numeric_imputation: str, default = 'mean'
194 | Missing values in numeric features are imputed with 'mean' value of the feature
195 |         in the training dataset. Other available options are 'median' and 'zero'.
196 |
197 |
198 | numeric_iterative_imputer: str, default = 'lightgbm'
199 | Estimator for iterative imputation of missing values in numeric features.
200 | Ignored when ``imputation_type`` is set to 'simple'.
201 |
202 |
203 | date_features: list of str, default = None
204 | If the inferred data types are not correct or the silent param is set to True,
205 | date_features param can be used to overwrite or define the data types. It takes
206 | a list of strings with column names that are DateTime.
207 |
208 |
209 | ignore_features: list of str, default = None
210 | ignore_features param can be used to ignore features during model training.
211 | It takes a list of strings with column names that are to be ignored.
212 |
213 |
214 | normalize: bool, default = False
215 | When set to True, it transforms the numeric features by scaling them to a given
216 | range. Type of scaling is defined by the ``normalize_method`` parameter.
217 |
218 |
219 | normalize_method: str, default = 'zscore'
220 |         Defines the method for scaling. By default, the normalize method is set to 'zscore'.
221 | The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize``
222 | is not True. The other options are:
223 |
224 | - minmax: scales and translates each feature individually such that it is in
225 | the range of 0 - 1.
226 | - maxabs: scales and translates each feature individually such that the
227 | maximal absolute value of each feature will be 1.0. It does not
228 | shift/center the data, and thus does not destroy any sparsity.
229 | - robust: scales and translates each feature according to the Interquartile
230 | range. When the dataset contains outliers, robust scaler often gives
231 | better results.
232 |
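        For example, to use robust scaling (a hedged sketch reusing the setup
        example above):

        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        normalize = True, normalize_method = 'robust')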
233 |
234 | transformation: bool, default = False
235 | When set to True, it applies the power transform to make data more Gaussian-like.
236 | Type of transformation is defined by the ``transformation_method`` parameter.
237 |
238 |
239 | transformation_method: str, default = 'yeo-johnson'
240 | Defines the method for transformation. By default, the transformation method is
241 | set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
242 | Ignored when ``transformation`` is not True.
243 |
244 |
245 | handle_unknown_categorical: bool, default = True
246 | When set to True, unknown categorical levels in unseen data are replaced by the
247 | most or least frequent level as learned in the training dataset.
248 |
249 |
250 | unknown_categorical_method: str, default = 'least_frequent'
251 | Method used to replace unknown categorical levels in unseen data. Method can be
252 | set to 'least_frequent' or 'most_frequent'.
253 |
254 |
255 | pca: bool, default = False
256 | When set to True, dimensionality reduction is applied to project the data into
257 | a lower dimensional space using the method defined in ``pca_method`` parameter.
258 |
259 |
260 | pca_method: str, default = 'linear'
261 |         The 'linear' method uses Singular Value Decomposition. Other options are:
262 |
263 |         - kernel: dimensionality reduction through the use of the RBF kernel.
264 | - incremental: replacement for 'linear' pca when the dataset is too large.
265 |
266 |
267 | pca_components: int or float, default = None
268 |         Number of components to keep. If pca_components is a float, it is treated as a
269 | target percentage for information retention. When pca_components is an integer
270 | it is treated as the number of features to be kept. pca_components must be less
271 | than the original number of features. Ignored when ``pca`` is not True.
272 |
273 |
274 | ignore_low_variance: bool, default = False
275 | When set to True, all categorical features with insignificant variances are
276 | removed from the data. The variance is calculated using the ratio of unique
277 | values to the number of samples, and the ratio of the most common value to the
278 | frequency of the second most common value.
279 |
280 |
281 | combine_rare_levels: bool, default = False
282 |         When set to True, levels in categorical features whose frequency percentile is
283 |         below a certain threshold are combined into a single level.
284 |
285 |
286 | rare_level_threshold: float, default = 0.1
287 | Percentile distribution below which rare categories are combined. Ignored when
288 | ``combine_rare_levels`` is not True.
289 |
290 |
291 | bin_numeric_features: list of str, default = None
292 | To convert numeric features into categorical, bin_numeric_features parameter can
293 | be used. It takes a list of strings with column names to be discretized. It does
294 |         so by using the 'sturges' rule to determine the number of clusters and then applying
295 |         the K-Means algorithm. Original values of the feature are then replaced by the
296 | cluster label.
297 |
298 |
299 | remove_outliers: bool, default = False
300 | When set to True, outliers from the training data are removed using the Singular
301 | Value Decomposition.
302 |
303 |
304 | outliers_threshold: float, default = 0.05
305 | The percentage outliers to be removed from the training dataset. Ignored when
306 | ``remove_outliers`` is not True.
307 |
308 |
309 | remove_multicollinearity: bool, default = False
310 | When set to True, features with the inter-correlations higher than the defined
311 | threshold are removed. When two features are highly correlated with each other,
312 | the feature that is less correlated with the target variable is removed. Only
313 | considers numeric features.
314 |
315 | multicollinearity_threshold: float, default = 0.9
316 | Threshold for correlated features. Ignored when ``remove_multicollinearity``
317 | is not True.
318 |
319 |
320 | remove_perfect_collinearity: bool, default = True
321 | When set to True, perfect collinearity (features with correlation = 1) is removed
322 |         from the dataset. When two features are 100% correlated, one of them is randomly
323 |         dropped from the dataset.
324 |
325 |
326 | create_clusters: bool, default = False
327 | When set to True, an additional feature is created in training dataset where each
328 | instance is assigned to a cluster. The number of clusters is determined by
329 | optimizing Calinski-Harabasz and Silhouette criterion.
330 |
331 |
332 | cluster_iter: int, default = 20
333 |         Number of iterations for creating clusters. Each iteration represents the cluster
334 | size. Ignored when ``create_clusters`` is not True.
335 |
336 |
337 | polynomial_features: bool, default = False
338 | When set to True, new features are derived using existing numeric features.
339 |
340 |
341 | polynomial_degree: int, default = 2
342 | Degree of polynomial features. For example, if an input sample is two dimensional
343 | and of the form [a, b], the polynomial features with degree = 2 are:
344 | [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True.
345 |
346 |
347 | trigonometry_features: bool, default = False
348 | When set to True, new features are derived using existing numeric features.
349 |
350 |
351 | polynomial_threshold: float, default = 0.1
352 | When ``polynomial_features`` or ``trigonometry_features`` is True, new features
353 |         are derived from the existing numeric features. This may sometimes result in a
354 |         feature space that is too large. The polynomial_threshold parameter can be used to
355 |         deal with this problem. It does so by using a combination of Random Forest, AdaBoost
356 |         and Linear correlation. All derived features that fall within the percentile
357 |         distribution are kept and the rest are removed.
358 |
359 |
360 | group_features: list or list of list, default = None
361 | When the dataset contains features with related characteristics, group_features
362 | parameter can be used for feature extraction. It takes a list of strings with
363 | column names that are related.
364 |
365 |
366 | group_names: list, default = None
367 | Group names to be used in naming new features. When the length of group_names
368 | does not match with the length of ``group_features``, new features are named
369 | sequentially group_1, group_2, etc. It is ignored when ``group_features`` is
370 | None.
371 |
372 |
373 | feature_selection: bool, default = False
374 |         When set to True, a subset of features is selected using a combination of
375 | various permutation importance techniques including Random Forest, Adaboost
376 | and Linear correlation with target variable. The size of the subset is
377 | dependent on the ``feature_selection_threshold`` parameter.
378 |
379 |
380 | feature_selection_threshold: float, default = 0.8
381 | Threshold value used for feature selection. When ``polynomial_features`` or
382 | ``feature_interaction`` is True, it is recommended to keep the threshold low
383 | to avoid large feature spaces. Setting a very low value may be efficient but
384 | could result in under-fitting.
385 |
386 |
387 | feature_selection_method: str, default = 'classic'
388 | Algorithm for feature selection. 'classic' method uses permutation feature
389 |         importance techniques. The other possible value is 'boruta', which uses the
390 |         Boruta algorithm for feature selection.
391 |
392 |
393 | feature_interaction: bool, default = False
394 | When set to True, new features are created by interacting (a * b) all the
395 | numeric variables in the dataset. This feature is not scalable and may not
396 | work as expected on datasets with large feature space.
397 |
398 |
399 | feature_ratio: bool, default = False
400 | When set to True, new features are created by calculating the ratios (a / b)
401 | between all numeric variables in the dataset. This feature is not scalable and
402 | may not work as expected on datasets with large feature space.
403 |
404 |
405 |     interaction_threshold: float, default = 0.01
406 |         Similar to polynomial_threshold, it is used to compress a sparse matrix of newly
407 | created features through interaction. Features whose importance based on the
408 | combination of Random Forest, AdaBoost and Linear correlation falls within the
409 | percentile of the defined threshold are kept in the dataset. Remaining features
410 | are dropped before further processing.
411 |
412 |
413 | fix_imbalance: bool, default = False
414 |         When the training dataset has an unequal distribution of the target class, it can be
415 |         balanced using this parameter. When set to True, SMOTE (Synthetic Minority Over-sampling
416 |         Technique) is applied by default to create synthetic datapoints for the minority class.
417 |
418 |
419 | fix_imbalance_method: obj, default = None
420 | When ``fix_imbalance`` is True, 'imblearn' compatible object with 'fit_resample'
421 | method can be passed. When set to None, 'imblearn.over_sampling.SMOTE' is used.
422 |
423 |
424 | data_split_shuffle: bool, default = True
425 | When set to False, prevents shuffling of rows during 'train_test_split'.
426 |
427 |
428 | data_split_stratify: bool or list, default = False
429 | Controls stratification during 'train_test_split'. When set to True, will
430 | stratify by target column. To stratify on any other columns, pass a list of
431 | column names. Ignored when ``data_split_shuffle`` is False.
432 |
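        For example, to stratify on a specific column (a hedged sketch; 'StoreID'
        is an illustrative column name):

        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        data_split_stratify = ['StoreID'])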
433 |
434 | fold_strategy: str or sklearn CV generator object, default = 'stratifiedkfold'
435 | Choice of cross validation strategy. Possible values are:
436 |
437 | * 'kfold'
438 | * 'stratifiedkfold'
439 | * 'groupkfold'
440 | * 'timeseries'
441 | * a custom CV generator object compatible with scikit-learn.
442 |
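        For instance, a custom CV generator can be passed directly (a hedged
        sketch using scikit-learn's StratifiedKFold):

        >>> from sklearn.model_selection import StratifiedKFold
        >>> cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
        >>> exp_name = initializer(data = juice, target = 'Purchase', fold_strategy = cv)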
443 |
444 | fold: int, default = 10
445 | Number of folds to be used in cross validation. Must be at least 2. This is
446 | a global setting that can be over-written at function level by using ``fold``
447 | parameter. Ignored when ``fold_strategy`` is a custom object.
448 |
449 |
450 | fold_shuffle: bool, default = False
451 | Controls the shuffle parameter of CV. Only applicable when ``fold_strategy``
452 | is 'kfold' or 'stratifiedkfold'. Ignored when ``fold_strategy`` is a custom
453 | object.
454 |
455 |
456 | fold_groups: str or array-like, with shape (n_samples,), default = None
457 | Optional group labels when 'GroupKFold' is used for the cross validation.
458 | It takes an array with shape (n_samples, ) where n_samples is the number
459 | of rows in the training dataset. When string is passed, it is interpreted
460 | as the column name in the dataset containing group labels.
461 |
462 |
463 | n_jobs: int, default = -1
464 |         The number of jobs to run in parallel (for functions that support parallel
465 |         processing). -1 means using all processors. To run all functions on a single
466 | processor set n_jobs to None.
467 |
468 |
469 | use_gpu: bool or str, default = False
470 | When set to True, it will use GPU for training with algorithms that support it,
471 | and fall back to CPU if they are unavailable. When set to 'force', it will only
472 | use GPU-enabled algorithms and raise exceptions when they are unavailable. When
473 | False, all algorithms are trained using CPU only.
474 |
475 | GPU enabled algorithms:
476 |
477 | - Extreme Gradient Boosting, requires no further installation
478 |
479 | - CatBoost Classifier, requires no further installation
480 | (GPU is only enabled when data > 50,000 rows)
481 |
482 | - Light Gradient Boosting Machine, requires GPU installation
483 | https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html
484 |
485 | - Logistic Regression, Ridge Classifier, Random Forest, K Neighbors Classifier,
486 | Support Vector Machine, requires cuML >= 0.15
487 | https://github.com/rapidsai/cuml
488 |
489 |
490 | custom_pipeline: (str, transformer) or list of (str, transformer), default = None
491 |         When passed, the custom transformers are appended to the preprocessing pipeline
492 |         and applied on each CV fold separately and on the final fit. All the custom
493 | transformations are applied after 'train_test_split' and before PyRapidML's internal
494 | transformations.
495 |
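        A hedged sketch, appending a scikit-learn transformer (the step name
        'scaler' is illustrative):

        >>> from sklearn.preprocessing import StandardScaler
        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        custom_pipeline = [('scaler', StandardScaler())])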
496 |
497 | html: bool, default = True
498 | When set to False, prevents runtime display of monitor. This must be set to False
499 | when the environment does not support IPython. For example, command line terminal,
500 | Databricks Notebook, Spyder and other similar IDEs.
501 |
502 |
503 | session_id: int, default = None
504 | Controls the randomness of experiment. It is equivalent to 'random_state' in
505 | scikit-learn. When None, a pseudo random number is generated. This can be used
506 | for later reproducibility of the entire experiment.
507 |
508 |
509 | log_experiment: bool, default = False
510 | When set to True, all metrics and parameters are logged on the ``MLFlow`` server.
511 |
512 |
513 | experiment_name: str, default = None
514 | Name of the experiment for logging. Ignored when ``log_experiment`` is not True.
515 |
516 |
517 | log_plots: bool or list, default = False
518 | When set to True, certain plots are logged automatically in the ``MLFlow`` server.
519 | To change the type of plots to be logged, pass a list containing plot IDs. Refer
520 | to documentation of ``plot_model``. Ignored when ``log_experiment`` is not True.
521 |
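        For example, using plot IDs from the list documented in ``plot_model``
        (a hedged sketch):

        >>> exp_name = initializer(data = juice, target = 'Purchase',
        ...                        log_experiment = True,
        ...                        log_plots = ['auc', 'confusion_matrix'])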
522 |
523 | log_profile: bool, default = False
524 | When set to True, data profile is logged on the ``MLflow`` server as a html file.
525 | Ignored when ``log_experiment`` is not True.
526 |
527 |
528 | log_data: bool, default = False
529 | When set to True, dataset is logged on the ``MLflow`` server as a csv file.
530 | Ignored when ``log_experiment`` is not True.
531 |
532 |
533 | silent: bool, default = False
534 | Controls the confirmation input of data types when ``setup`` is executed. When
535 | executing in completely automated mode or on a remote kernel, this must be True.
536 |
537 |
538 | verbose: bool, default = True
539 | When set to False, Information grid is not printed.
540 |
541 |
542 | profile: bool, default = False
543 | When set to True, an interactive EDA report is displayed.
544 |
545 |
546 | profile_kwargs: dict, default = {} (empty dict)
547 | Dictionary of arguments passed to the ProfileReport method used
548 | to create the EDA report. Ignored if ``profile`` is False.
549 |
550 |
551 | Returns:
552 | Global variables that can be changed using the ``set_config`` function.
553 |
554 | """
555 |
556 | available_plots = {
557 | "parameter": "Hyperparameters",
558 | "auc": "AUC",
559 | "confusion_matrix": "Confusion Matrix",
560 | "threshold": "Threshold",
561 | "pr": "Precision Recall",
562 | "error": "Prediction Error",
563 | "class_report": "Class Report",
564 | "rfe": "Feature Selection",
565 | "learning": "Learning Curve",
566 | "manifold": "Manifold Learning",
567 | "calibration": "Calibration Curve",
568 | "vc": "Validation Curve",
569 | "dimension": "Dimensions",
570 | "feature": "Feature Importance",
571 | "feature_all": "Feature Importance (All)",
572 | "boundary": "Decision Boundary",
573 | "lift": "Lift Chart",
574 | "gain": "Gain Chart",
575 | "tree": "Decision Tree",
576 | }
577 |
578 |     if log_plots is True:
579 | log_plots = ["auc", "confusion_matrix", "feature"]
580 |
581 | return pycaret.internal.tabular.setup(
582 | ml_usecase="classification",
583 | available_plots=available_plots,
584 | data=data,
585 | target=target,
586 | train_size=train_size,
587 | test_data=test_data,
588 | preprocess=preprocess,
589 | imputation_type=imputation_type,
590 | iterative_imputation_iters=iterative_imputation_iters,
591 | categorical_features=categorical_features,
592 | categorical_imputation=categorical_imputation,
593 | categorical_iterative_imputer=categorical_iterative_imputer,
594 | ordinal_features=ordinal_features,
595 | high_cardinality_features=high_cardinality_features,
596 | high_cardinality_method=high_cardinality_method,
597 | numeric_features=numeric_features,
598 | numeric_imputation=numeric_imputation,
599 | numeric_iterative_imputer=numeric_iterative_imputer,
600 | date_features=date_features,
601 | ignore_features=ignore_features,
602 | normalize=normalize,
603 | normalize_method=normalize_method,
604 | transformation=transformation,
605 | transformation_method=transformation_method,
606 | handle_unknown_categorical=handle_unknown_categorical,
607 | unknown_categorical_method=unknown_categorical_method,
608 | pca=pca,
609 | pca_method=pca_method,
610 | pca_components=pca_components,
611 | ignore_low_variance=ignore_low_variance,
612 | combine_rare_levels=combine_rare_levels,
613 | rare_level_threshold=rare_level_threshold,
614 | bin_numeric_features=bin_numeric_features,
615 | remove_outliers=remove_outliers,
616 | outliers_threshold=outliers_threshold,
617 | remove_multicollinearity=remove_multicollinearity,
618 | multicollinearity_threshold=multicollinearity_threshold,
619 | remove_perfect_collinearity=remove_perfect_collinearity,
620 | create_clusters=create_clusters,
621 | cluster_iter=cluster_iter,
622 | polynomial_features=polynomial_features,
623 | polynomial_degree=polynomial_degree,
624 | trigonometry_features=trigonometry_features,
625 | polynomial_threshold=polynomial_threshold,
626 | group_features=group_features,
627 | group_names=group_names,
628 | feature_selection=feature_selection,
629 | feature_selection_threshold=feature_selection_threshold,
630 | feature_selection_method=feature_selection_method,
631 | feature_interaction=feature_interaction,
632 | feature_ratio=feature_ratio,
633 | interaction_threshold=interaction_threshold,
634 | fix_imbalance=fix_imbalance,
635 | fix_imbalance_method=fix_imbalance_method,
636 | data_split_shuffle=data_split_shuffle,
637 | data_split_stratify=data_split_stratify,
638 | fold_strategy=fold_strategy,
639 | fold=fold,
640 | fold_shuffle=fold_shuffle,
641 | fold_groups=fold_groups,
642 | n_jobs=n_jobs,
643 | use_gpu=use_gpu,
644 | custom_pipeline=custom_pipeline,
645 | html=html,
646 | session_id=session_id,
647 | log_experiment=log_experiment,
648 | experiment_name=experiment_name,
649 | log_plots=log_plots,
650 | log_profile=log_profile,
651 | log_data=log_data,
652 | silent=silent,
653 | verbose=verbose,
654 | profile=profile,
655 | profile_kwargs=profile_kwargs,
656 | )
657 |
658 |
659 | def comparing_models(
660 | include: Optional[List[Union[str, Any]]] = None,
661 | exclude: Optional[List[str]] = None,
662 | fold: Optional[Union[int, Any]] = None,
663 | round: int = 4,
664 | cross_validation: bool = True,
665 | sort: str = "Accuracy",
666 | n_select: int = 1,
667 | budget_time: Optional[float] = None,
668 | turbo: bool = True,
669 | errors: str = "ignore",
670 | fit_kwargs: Optional[dict] = None,
671 | groups: Optional[Union[str, Any]] = None,
672 | verbose: bool = True,
673 | ) -> Union[Any, List[Any]]:
674 |
675 | """
676 | This function trains and evaluates performance of all estimators available in the
677 | model library using cross validation. The output of this function is a score grid
678 | with average cross validated scores. Metrics evaluated during CV can be accessed
679 | using the ``get_metrics`` function. Custom metrics can be added or removed using
680 | ``add_metric`` and ``remove_metric`` function.
681 |
682 | Example
683 | -------
684 |     >>> from PyRapidML.datasets import extract_data
685 | >>> juice = extract_data('juice')
686 | >>> from PyRapidML.classification import *
687 | >>> exp_name = initializer(data = juice, target = 'Purchase')
688 | >>> best_model = comparing_models()
689 |
690 |
691 | include: list of str or scikit-learn compatible object, default = None
692 | To train and evaluate select models, list containing model ID or scikit-learn
693 | compatible object can be passed in include param. To see a list of all models
694 | available in the model library use the ``models`` function.
695 |
696 |
697 | exclude: list of str, default = None
698 | To omit certain models from training and evaluation, pass a list containing
699 | model id in the exclude parameter. To see a list of all models available
700 | in the model library use the ``models`` function.
701 |
702 |
703 | fold: int or scikit-learn compatible CV generator, default = None
704 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
705 | parameter of the ``setup`` function is used. When an integer is passed,
706 | it is interpreted as the 'n_splits' parameter of the CV generator in the
707 | ``setup`` function.
708 |
709 |
710 | round: int, default = 4
711 | Number of decimal places the metrics in the score grid will be rounded to.
712 |
713 |
714 | cross_validation: bool, default = True
715 | When set to False, metrics are evaluated on holdout set. ``fold`` param
716 | is ignored when cross_validation is set to False.
717 |
718 |
719 | sort: str, default = 'Accuracy'
720 | The sort order of the score grid. It also accepts custom metrics that are
721 | added through the ``add_metric`` function.
722 |
723 |
724 | n_select: int, default = 1
725 | Number of top_n models to return. For example, to select top 3 models use
726 | n_select = 3.
727 |
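        For example, to keep the three best models ranked by AUC (a hedged
        sketch reusing the setup example above):

        >>> top3 = comparing_models(sort = 'AUC', n_select = 3)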
728 |
729 | budget_time: int or float, default = None
730 | If not None, will terminate execution of the function after budget_time
731 | minutes have passed and return results up to that point.
732 |
733 |
734 | turbo: bool, default = True
735 | When set to True, it excludes estimators with longer training times. To
736 | see which algorithms are excluded use the ``models`` function.
737 |
738 |
739 | errors: str, default = 'ignore'
740 |         When set to 'ignore', will skip models that raise exceptions and continue.
741 | If 'raise', will break the function when exceptions are raised.
742 |
743 |
744 | fit_kwargs: dict, default = {} (empty dict)
745 | Dictionary of arguments passed to the fit method of the model.
746 |
747 |
748 | groups: str or array-like, with shape (n_samples,), default = None
749 | Optional group labels when 'GroupKFold' is used for the cross validation.
750 | It takes an array with shape (n_samples, ) where n_samples is the number
751 | of rows in the training dataset. When string is passed, it is interpreted
752 | as the column name in the dataset containing group labels.
753 |
754 |
755 | verbose: bool, default = True
756 | Score grid is not printed when verbose is set to False.
757 |
758 |
759 | Returns:
760 | Trained model or list of trained models, depending on the ``n_select`` param.
761 |
762 | Warnings
763 | --------
764 | - Changing turbo parameter to False may result in very high training times with
765 | datasets exceeding 10,000 rows.
766 |
767 |     - AUC for estimators that do not support 'predict_proba' is shown as 0.0000.
768 |
769 | - No models are logged in ``MLFlow`` when ``cross_validation`` parameter is False.
770 | """
771 |
772 | return pycaret.internal.tabular.compare_models(
773 | include=include,
774 | exclude=exclude,
775 | fold=fold,
776 | round=round,
777 | cross_validation=cross_validation,
778 | sort=sort,
779 | n_select=n_select,
780 | budget_time=budget_time,
781 | turbo=turbo,
782 | errors=errors,
783 | fit_kwargs=fit_kwargs,
784 | groups=groups,
785 | verbose=verbose,
786 | )
787 |
788 |
789 | def creating_model(
790 | estimator: Union[str, Any],
791 | fold: Optional[Union[int, Any]] = None,
792 | round: int = 4,
793 | cross_validation: bool = True,
794 | fit_kwargs: Optional[dict] = None,
795 | groups: Optional[Union[str, Any]] = None,
796 | verbose: bool = True,
797 | **kwargs,
798 | ) -> Any:
799 |
800 | """
801 | This function trains and evaluates the performance of a given estimator
802 | using cross validation. The output of this function is a score grid with
803 | CV scores by fold. Metrics evaluated during CV can be accessed using the
804 | ``get_metrics`` function. Custom metrics can be added or removed using
805 | ``add_metric`` and ``remove_metric`` function. All the available models
806 | can be accessed using the ``models`` function.
807 |
808 | Example
809 | -------
810 |     >>> from PyRapidML.datasets import extract_data
811 | >>> juice = extract_data('juice')
812 | >>> from PyRapidML.classification import *
813 | >>> exp_name = initializer(data = juice, target = 'Purchase')
814 | >>> lr = creating_model('lr')
815 |
816 |
817 | estimator: str or scikit-learn compatible object
818 | ID of an estimator available in model library or pass an untrained
819 | model object consistent with scikit-learn API. Estimators available
820 | in the model library (ID - Name):
821 |
822 | * 'lr' - Logistic Regression
823 | * 'knn' - K Neighbors Classifier
824 | * 'nb' - Naive Bayes
825 | * 'dt' - Decision Tree Classifier
826 | * 'svm' - SVM - Linear Kernel
827 | * 'rbfsvm' - SVM - Radial Kernel
828 | * 'gpc' - Gaussian Process Classifier
829 | * 'mlp' - MLP Classifier
830 | * 'ridge' - Ridge Classifier
831 | * 'rf' - Random Forest Classifier
832 | * 'qda' - Quadratic Discriminant Analysis
833 | * 'ada' - Ada Boost Classifier
834 | * 'gbc' - Gradient Boosting Classifier
835 | * 'lda' - Linear Discriminant Analysis
836 | * 'et' - Extra Trees Classifier
837 | * 'xgboost' - Extreme Gradient Boosting
838 | * 'lightgbm' - Light Gradient Boosting Machine
839 | * 'catboost' - CatBoost Classifier
840 |
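        An untrained scikit-learn estimator can also be passed directly (a hedged
        sketch; ``max_iter`` is just an illustrative argument):

        >>> from sklearn.linear_model import LogisticRegression
        >>> lr = creating_model(LogisticRegression(max_iter = 1000))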
841 |
842 | fold: int or scikit-learn compatible CV generator, default = None
843 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
844 | parameter of the ``setup`` function is used. When an integer is passed,
845 | it is interpreted as the 'n_splits' parameter of the CV generator in the
846 | ``setup`` function.
847 |
848 |
849 | round: int, default = 4
850 | Number of decimal places the metrics in the score grid will be rounded to.
851 |
852 |
853 | cross_validation: bool, default = True
854 | When set to False, metrics are evaluated on holdout set. ``fold`` param
855 | is ignored when cross_validation is set to False.
856 |
857 |
858 | fit_kwargs: dict, default = {} (empty dict)
859 | Dictionary of arguments passed to the fit method of the model.
860 |
861 |
862 | groups: str or array-like, with shape (n_samples,), default = None
863 | Optional group labels when GroupKFold is used for the cross validation.
864 | It takes an array with shape (n_samples, ) where n_samples is the number
865 | of rows in training dataset. When string is passed, it is interpreted as
866 | the column name in the dataset containing group labels.
867 |
868 |
869 | verbose: bool, default = True
870 | Score grid is not printed when verbose is set to False.
871 |
872 |
873 | **kwargs**:
874 | Additional keyword arguments to pass to the estimator.
875 |
876 |
877 | Returns:
878 | Trained Model
879 |
880 |
881 | Warnings
882 | --------
883 |     - AUC for estimators that do not support 'predict_proba' is shown as 0.0000.
884 |
885 | - Models are not logged on the ``MLFlow`` server when ``cross_validation`` param
886 | is set to False.
887 |
888 | """
889 |
890 | return pycaret.internal.tabular.create_model_supervised(
891 | estimator=estimator,
892 | fold=fold,
893 | round=round,
894 | cross_validation=cross_validation,
895 | fit_kwargs=fit_kwargs,
896 | groups=groups,
897 | verbose=verbose,
898 | **kwargs,
899 | )
900 |
901 |
902 | def tuning_model(
903 | estimator,
904 | fold: Optional[Union[int, Any]] = None,
905 | round: int = 4,
906 | n_iter: int = 10,
907 | custom_grid: Optional[Union[Dict[str, list], Any]] = None,
908 | optimize: str = "Accuracy",
909 | custom_scorer=None,
910 | search_library: str = "scikit-learn",
911 | search_algorithm: Optional[str] = None,
912 | early_stopping: Any = False,
913 | early_stopping_max_iters: int = 10,
914 | choose_better: bool = False,
915 | fit_kwargs: Optional[dict] = None,
916 | groups: Optional[Union[str, Any]] = None,
917 | return_tuner: bool = False,
918 | verbose: bool = True,
919 | tuner_verbose: Union[int, bool] = True,
920 | **kwargs,
921 | ) -> Any:
922 |
923 | """
924 | This function tunes the hyperparameters of a given estimator. The output of
925 | this function is a score grid with CV scores by fold of the best selected
926 | model based on ``optimize`` parameter. Metrics evaluated during CV can be
927 | accessed using the ``get_metrics`` function. Custom metrics can be added
928 | or removed using ``add_metric`` and ``remove_metric`` function.
929 |
930 | Example
931 | -------
932 |     >>> from PyRapidML.datasets import extract_data
933 | >>> juice = extract_data('juice')
934 | >>> from PyRapidML.classification import *
935 | >>> exp_name = initializer(data = juice, target = 'Purchase')
936 | >>> lr = creating_model('lr')
937 | >>> tuned_lr = tuning_model(lr)
938 |
939 |
940 | estimator: scikit-learn compatible object
941 | Trained model object
942 |
943 |
944 | fold: int or scikit-learn compatible CV generator, default = None
945 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
946 | parameter of the ``setup`` function is used. When an integer is passed,
947 | it is interpreted as the 'n_splits' parameter of the CV generator in the
948 | ``setup`` function.
949 |
950 |
951 | round: int, default = 4
952 | Number of decimal places the metrics in the score grid will be rounded to.
953 |
954 |
955 | n_iter: int, default = 10
956 | Number of iterations in the grid search. Increasing 'n_iter' may improve
957 | model performance but also increases the training time.
958 |
959 |
960 | custom_grid: dictionary, default = None
961 | To define custom search space for hyperparameters, pass a dictionary with
962 | parameter name and values to be iterated. Custom grids must be in a format
963 | supported by the defined ``search_library``.
964 |
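        A minimal sketch for the logistic regression created above ('C' is a
        standard scikit-learn LogisticRegression hyperparameter):

        >>> tuned_lr = tuning_model(lr, custom_grid = {'C': [0.1, 1.0, 10.0]})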
965 |
966 | optimize: str, default = 'Accuracy'
967 | Metric name to be evaluated for hyperparameter tuning. It also accepts custom
968 | metrics that are added through the ``add_metric`` function.
969 |
970 |
971 | custom_scorer: object, default = None
972 |         A custom scoring strategy can be passed to tune hyperparameters of the model.
973 |         It must be created using ``sklearn.make_scorer``. It is equivalent to adding a
974 |         custom metric using the ``add_metric`` function and passing the name of the
975 |         custom metric in the ``optimize`` parameter.
976 |         It will be deprecated in a future release.
977 |
978 |
979 | search_library: str, default = 'scikit-learn'
980 | The search library used for tuning hyperparameters. Possible values:
981 |
982 | - 'scikit-learn' - default, requires no further installation
983 | https://github.com/scikit-learn/scikit-learn
984 |
985 | - 'scikit-optimize' - ``pip install scikit-optimize``
986 | https://scikit-optimize.github.io/stable/
987 |
988 | - 'tune-sklearn' - ``pip install tune-sklearn ray[tune]``
989 | https://github.com/ray-project/tune-sklearn
990 |
991 | - 'optuna' - ``pip install optuna``
992 | https://optuna.org/
993 |
994 |
995 | search_algorithm: str, default = None
996 | The search algorithm depends on the ``search_library`` parameter.
997 | Some search algorithms require additional libraries to be installed.
998 | If None, will use search library-specific default algorithm.
999 |
1000 | - 'scikit-learn' possible values:
1001 | - 'random' : random grid search (default)
1002 | - 'grid' : grid search
1003 |
1004 | - 'scikit-optimize' possible values:
1005 | - 'bayesian' : Bayesian search (default)
1006 |
1007 | - 'tune-sklearn' possible values:
1008 | - 'random' : random grid search (default)
1009 | - 'grid' : grid search
1010 | - 'bayesian' : ``pip install scikit-optimize``
1011 | - 'hyperopt' : ``pip install hyperopt``
1012 | - 'optuna' : ``pip install optuna``
1013 | - 'bohb' : ``pip install hpbandster ConfigSpace``
1014 |
1015 | - 'optuna' possible values:
1016 | - 'random' : randomized search
1017 | - 'tpe' : Tree-structured Parzen Estimator search (default)
1018 |
1019 |
1020 | early_stopping: bool or str or object, default = False
1021 | Use early stopping to stop fitting to a hyperparameter configuration
1022 | if it performs poorly. Ignored when ``search_library`` is scikit-learn,
1023 | or if the estimator does not have 'partial_fit' attribute. If False or
1024 | None, early stopping will not be used. Can be either an object accepted
1025 | by the search library or one of the following:
1026 |
1027 | - 'asha' for Asynchronous Successive Halving Algorithm
1028 | - 'hyperband' for Hyperband
1029 | - 'median' for Median Stopping Rule
1031 |
1032 |
1033 | early_stopping_max_iters: int, default = 10
1034 | Maximum number of epochs to run for each sampled configuration.
1035 | Ignored if ``early_stopping`` is False or None.
1036 |
1037 |
1038 | choose_better: bool, default = False
1039 | When set to True, the returned object is always better performing. The
1040 | metric used for comparison is defined by the ``optimize`` parameter.
1041 |
1042 |
1043 | fit_kwargs: dict, default = {} (empty dict)
1044 | Dictionary of arguments passed to the fit method of the tuner.
1045 |
1046 |
1047 | groups: str or array-like, with shape (n_samples,), default = None
1048 | Optional group labels when GroupKFold is used for the cross validation.
1049 | It takes an array with shape (n_samples, ) where n_samples is the number
1050 | of rows in training dataset. When string is passed, it is interpreted as
1051 | the column name in the dataset containing group labels.
1052 |
1053 |
1054 | return_tuner: bool, default = False
1055 | When set to True, will return a tuple of (model, tuner_object).
1056 |
1057 |
1058 | verbose: bool, default = True
1059 | Score grid is not printed when verbose is set to False.
1060 |
1061 |
1062 |     tuner_verbose: bool or int, default = True
1063 | If True or above 0, will print messages from the tuner. Higher values
1064 | print more messages. Ignored when ``verbose`` param is False.
1065 |
1066 |
1067 | **kwargs**:
1068 | Additional keyword arguments to pass to the optimizer.
1069 |
1070 |
1071 | Returns:
1072 | Trained Model and Optional Tuner Object when ``return_tuner`` is True.
1073 |
1074 |
1075 | Warnings
1076 | --------
1077 | - Using 'grid' as ``search_algorithm`` may result in very long computation.
1078 | Only recommended with smaller search spaces that can be defined in the
1079 | ``custom_grid`` parameter.
1080 |
1081 | - ``search_library`` 'tune-sklearn' does not support GPU models.
1082 |
1083 | """
1084 |
1085 | return pycaret.internal.tabular.tune_model_supervised(
1086 | estimator=estimator,
1087 | fold=fold,
1088 | round=round,
1089 | n_iter=n_iter,
1090 | custom_grid=custom_grid,
1091 | optimize=optimize,
1092 | custom_scorer=custom_scorer,
1093 | search_library=search_library,
1094 | search_algorithm=search_algorithm,
1095 | early_stopping=early_stopping,
1096 | early_stopping_max_iters=early_stopping_max_iters,
1097 | choose_better=choose_better,
1098 | fit_kwargs=fit_kwargs,
1099 | groups=groups,
1100 | return_tuner=return_tuner,
1101 | verbose=verbose,
1102 | tuner_verbose=tuner_verbose,
1103 | **kwargs,
1104 | )
1105 |
1106 |
1107 | def ensemble_model(
1108 | estimator,
1109 | method: str = "Bagging",
1110 | fold: Optional[Union[int, Any]] = None,
1111 | n_estimators: int = 10,
1112 | round: int = 4,
1113 | choose_better: bool = False,
1114 | optimize: str = "Accuracy",
1115 | fit_kwargs: Optional[dict] = None,
1116 | groups: Optional[Union[str, Any]] = None,
1117 | verbose: bool = True,
1118 | ) -> Any:
1119 |
1120 | """
1121 | This function ensembles a given estimator. The output of this function is
1122 | a score grid with CV scores by fold. Metrics evaluated during CV can be
1123 | accessed using the ``get_metrics`` function. Custom metrics can be added
1124 | or removed using ``add_metric`` and ``remove_metric`` function.
1125 |
1126 |
1127 | Example
1128 | -------
1129 |     >>> from PyRapidML.datasets import extract_data
1130 | >>> juice = extract_data('juice')
1131 | >>> from PyRapidML.classification import *
1132 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1133 | >>> dt = creating_model('dt')
1134 | >>> bagged_dt = ensemble_model(dt, method = 'Bagging')
1135 |
1136 |
1137 | estimator: scikit-learn compatible object
1138 | Trained model object
1139 |
1140 |
1141 | method: str, default = 'Bagging'
1142 | Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'.
1143 |
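        For example, to boost rather than bag the decision tree from the example
        above (a hedged sketch; ``n_estimators`` is documented below):

        >>> boosted_dt = ensemble_model(dt, method = 'Boosting', n_estimators = 50)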
1144 |
1145 | fold: int or scikit-learn compatible CV generator, default = None
1146 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1147 | parameter of the ``setup`` function is used. When an integer is passed,
1148 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1149 | ``setup`` function.
1150 |
1151 |
1152 | n_estimators: int, default = 10
1153 | The number of base estimators in the ensemble. In case of perfect fit, the
1154 | learning procedure is stopped early.
1155 |
1156 |
1157 | round: int, default = 4
1158 | Number of decimal places the metrics in the score grid will be rounded to.
1159 |
1160 |
1161 | choose_better: bool, default = False
1162 | When set to True, the returned object is always better performing. The
1163 | metric used for comparison is defined by the ``optimize`` parameter.
1164 |
1165 |
1166 | optimize: str, default = 'Accuracy'
1167 | Metric to compare for model selection when ``choose_better`` is True.
1168 |
1169 |
1170 | fit_kwargs: dict, default = {} (empty dict)
1171 | Dictionary of arguments passed to the fit method of the model.
1172 |
1173 |
1174 | groups: str or array-like, with shape (n_samples,), default = None
1175 | Optional group labels when GroupKFold is used for the cross validation.
1176 | It takes an array with shape (n_samples, ) where n_samples is the number
1177 | of rows in training dataset. When string is passed, it is interpreted as
1178 | the column name in the dataset containing group labels.
1179 |
1180 |
1181 | verbose: bool, default = True
1182 | Score grid is not printed when verbose is set to False.
1183 |
1184 |
1185 | Returns:
1186 | Trained Model
1187 |
1188 |
1189 | Warnings
1190 | --------
1191 | - Method 'Boosting' is not supported for estimators that do not have 'class_weights'
1192 | or 'predict_proba' attributes.
1193 |
1194 | """
1195 |
1196 | return pycaret.internal.tabular.ensemble_model(
1197 | estimator=estimator,
1198 | method=method,
1199 | fold=fold,
1200 | n_estimators=n_estimators,
1201 | round=round,
1202 | choose_better=choose_better,
1203 | optimize=optimize,
1204 | fit_kwargs=fit_kwargs,
1205 | groups=groups,
1206 | verbose=verbose,
1207 | )
1208 |
1209 |
1210 | def blend_models(
1211 | estimator_list: list,
1212 | fold: Optional[Union[int, Any]] = None,
1213 | round: int = 4,
1214 | choose_better: bool = False,
1215 | optimize: str = "Accuracy",
1216 | method: str = "auto",
1217 | weights: Optional[List[float]] = None,
1218 | fit_kwargs: Optional[dict] = None,
1219 | groups: Optional[Union[str, Any]] = None,
1220 | verbose: bool = True,
1221 | ) -> Any:
1222 |
1223 | """
1224 | This function trains a Soft Voting / Majority Rule classifier for select
1225 | models passed in the ``estimator_list`` param. The output of this function
1226 | is a score grid with CV scores by fold. Metrics evaluated during CV can be
1227 | accessed using the ``get_metrics`` function. Custom metrics can be added
1228 | or removed using ``add_metric`` and ``remove_metric`` function.
1229 |
1230 |
1231 | Example
1232 | -------
1233 |     >>> from PyRapidML.datasets import extract_data
1234 |     >>> juice = extract_data('juice')
1235 |     >>> from PyRapidML.classification import *
1236 |     >>> exp_name = initializer(data = juice, target = 'Purchase')
1237 |     >>> top3 = comparing_models(n_select = 3)
1238 | >>> blender = blend_models(top3)
1239 |
1240 |
1241 | estimator_list: list of scikit-learn compatible objects
1242 | List of trained model objects
1243 |
1244 |
1245 | fold: int or scikit-learn compatible CV generator, default = None
1246 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1247 | parameter of the ``setup`` function is used. When an integer is passed,
1248 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1249 | ``setup`` function.
1250 |
1251 |
1252 | round: int, default = 4
1253 | Number of decimal places the metrics in the score grid will be rounded to.
1254 |
1255 |
1256 | choose_better: bool, default = False
1257 | When set to True, the returned object is always better performing. The
1258 | metric used for comparison is defined by the ``optimize`` parameter.
1259 |
1260 |
1261 | optimize: str, default = 'Accuracy'
1262 | Metric to compare for model selection when ``choose_better`` is True.
1263 |
1264 |
1265 | method: str, default = 'auto'
1266 |         'hard' uses predicted class labels for majority rule voting. 'soft' predicts
1267 | the class label based on the argmax of the sums of the predicted probabilities,
1268 | which is recommended for an ensemble of well-calibrated classifiers. Default
1269 | value, 'auto', will try to use 'soft' and fall back to 'hard' if the former is
1270 | not supported.
1271 |
1272 |
1273 | weights: list, default = None
1274 | Sequence of weights (float or int) to weight the occurrences of predicted class
1275 | labels (hard voting) or class probabilities before averaging (soft voting). Uses
1276 | uniform weights when None.
1277 |
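        For example, to weight the three blended models unequally (a hedged
        sketch; the weights are illustrative):

        >>> blender = blend_models(top3, weights = [0.5, 0.3, 0.2])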
1278 |
1279 | fit_kwargs: dict, default = {} (empty dict)
1280 | Dictionary of arguments passed to the fit method of the model.
1281 |
1282 |
1283 | groups: str or array-like, with shape (n_samples,), default = None
1284 | Optional group labels when GroupKFold is used for the cross validation.
1285 | It takes an array with shape (n_samples, ) where n_samples is the number
1286 | of rows in training dataset. When string is passed, it is interpreted as
1287 | the column name in the dataset containing group labels.
1288 |
1289 |
1290 | verbose: bool, default = True
1291 | Score grid is not printed when verbose is set to False.
1292 |
1293 |
1294 | Returns:
1295 | Trained Model
1296 |
1297 | """
1298 |
1299 | return pycaret.internal.tabular.blend_models(
1300 | estimator_list=estimator_list,
1301 | fold=fold,
1302 | round=round,
1303 | choose_better=choose_better,
1304 | optimize=optimize,
1305 | method=method,
1306 | weights=weights,
1307 | fit_kwargs=fit_kwargs,
1308 | groups=groups,
1309 | verbose=verbose,
1310 | )
1311 |
1312 |
1313 | def stack_models(
1314 | estimator_list: list,
1315 | meta_model=None,
1316 | fold: Optional[Union[int, Any]] = None,
1317 | round: int = 4,
1318 | method: str = "auto",
1319 | restack: bool = True,
1320 | choose_better: bool = False,
1321 | optimize: str = "Accuracy",
1322 | fit_kwargs: Optional[dict] = None,
1323 | groups: Optional[Union[str, Any]] = None,
1324 | verbose: bool = True,
1325 | ) -> Any:
1326 |
1327 | """
1328 | This function trains a meta model over select estimators passed in
1329 | the ``estimator_list`` parameter. The output of this function is a
1330 | score grid with CV scores by fold. Metrics evaluated during CV can
1331 | be accessed using the ``get_metrics`` function. Custom metrics
1332 | can be added or removed using the ``add_metric`` and ``remove_metric``
1333 | functions.
1334 |
1335 |
1336 | Example
1337 | -------
1338 | >>> from PyRapidML.datasets import extract_data
1339 | >>> juice = extract_data('juice')
1340 | >>> from PyRapidML.classification import *
1341 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1342 | >>> top3 = comparing_models(n_select = 3)
1343 | >>> stacker = stack_models(top3)
1344 |
1345 |
1346 | estimator_list: list of scikit-learn compatible objects
1347 | List of trained model objects
1348 |
1349 |
1350 | meta_model: scikit-learn compatible object, default = None
1351 | When None, Logistic Regression is trained as a meta model.
1352 |
1353 |
1354 | fold: int or scikit-learn compatible CV generator, default = None
1355 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1356 | parameter of the ``setup`` function is used. When an integer is passed,
1357 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1358 | ``setup`` function.
1359 |
1360 |
1361 | round: int, default = 4
1362 | Number of decimal places the metrics in the score grid will be rounded to.
1363 |
1364 |
1365 | method: str, default = 'auto'
1366 | When set to 'auto', it will invoke, for each estimator, 'predict_proba',
1367 | 'decision_function' or 'predict' in that order. Otherwise, manually pass
1368 | one of 'predict_proba', 'decision_function' or 'predict'.
1369 |
1370 |
1371 | restack: bool, default = True
1372 | When set to False, only the predictions of estimators will be used as
1373 | training data for the ``meta_model``.
1374 |
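A minimal sketch of stacking with a custom meta model and no restacking,
assuming the ``top3`` models from the example above:

>>> lr = creating_model('lr')  # meta model chosen only for illustration
>>> stacker = stack_models(top3, meta_model = lr, restack = False)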
1375 |
1376 | choose_better: bool, default = False
1377 | When set to True, the function returns the better performing of the stacked
1378 | model and the input estimators, as measured by the ``optimize`` metric.
1379 |
1380 |
1381 | optimize: str, default = 'Accuracy'
1382 | Metric to compare for model selection when ``choose_better`` is True.
1383 |
1384 |
1385 | fit_kwargs: dict, default = {} (empty dict)
1386 | Dictionary of arguments passed to the fit method of the model.
1387 |
1388 |
1389 | groups: str or array-like, with shape (n_samples,), default = None
1390 | Optional group labels when GroupKFold is used for the cross validation.
1391 | It takes an array with shape (n_samples, ) where n_samples is the number
1392 | of rows in the training dataset. When a string is passed, it is interpreted
1393 | as the name of the column in the dataset that contains the group labels.
1394 |
1395 |
1396 | verbose: bool, default = True
1397 | Score grid is not printed when verbose is set to False.
1398 |
1399 |
1400 | Returns:
1401 | Trained Model
1402 |
1403 |
1404 | Warnings
1405 | --------
1406 | - When ``method`` is not set to 'auto', it will check if the defined method
1407 | is available for all estimators passed in ``estimator_list``. If the method is
1408 | not implemented by every estimator, it will raise an error.
1409 |
1410 | """
1411 |
1412 | return pycaret.internal.tabular.stack_models(
1413 | estimator_list=estimator_list,
1414 | meta_model=meta_model,
1415 | fold=fold,
1416 | round=round,
1417 | method=method,
1418 | restack=restack,
1419 | choose_better=choose_better,
1420 | optimize=optimize,
1421 | fit_kwargs=fit_kwargs,
1422 | groups=groups,
1423 | verbose=verbose,
1424 | )
1425 |
1426 |
1427 | def plot_model(
1428 | estimator,
1429 | plot: str = "auc",
1430 | scale: float = 1,
1431 | save: bool = False,
1432 | fold: Optional[Union[int, Any]] = None,
1433 | fit_kwargs: Optional[dict] = None,
1434 | groups: Optional[Union[str, Any]] = None,
1435 | use_train_data: bool = False,
1436 | verbose: bool = True,
1437 | display_format: Optional[str] = None,
1438 | ) -> str:
1439 |
1440 | """
1441 | This function analyzes the performance of a trained model on the holdout set.
1442 | It may require re-training the model in certain cases.
1443 |
1444 | Example
1445 | -------
1446 | >>> from PyRapidML.datasets import extract_data
1447 | >>> juice = extract_data('juice')
1448 | >>> from PyRapidML.classification import *
1449 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1450 | >>> lr = creating_model('lr')
1451 | >>> plot_model(lr, plot = 'auc')
1452 |
1453 |
1454 | estimator: scikit-learn compatible object
1455 | Trained model object
1456 |
1457 |
1458 | plot: str, default = 'auc'
1459 | List of available plots (ID - Name):
1460 |
1461 | * 'auc' - Area Under the Curve
1462 | * 'threshold' - Discrimination Threshold
1463 | * 'pr' - Precision Recall Curve
1464 | * 'confusion_matrix' - Confusion Matrix
1465 | * 'error' - Class Prediction Error
1466 | * 'class_report' - Classification Report
1467 | * 'boundary' - Decision Boundary
1468 | * 'rfe' - Recursive Feature Selection
1469 | * 'learning' - Learning Curve
1470 | * 'manifold' - Manifold Learning
1471 | * 'calibration' - Calibration Curve
1472 | * 'vc' - Validation Curve
1473 | * 'dimension' - Dimension Learning
1474 | * 'feature' - Feature Importance
1475 | * 'feature_all' - Feature Importance (All)
1476 | * 'parameter' - Model Hyperparameter
1477 | * 'lift' - Lift Curve
1478 | * 'gain' - Gain Chart
1479 | * 'tree' - Decision Tree
1480 |
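A minimal sketch, assuming the ``lr`` model from the example above, of
rendering one of the plots listed above at double resolution and saving it:

>>> plot_model(lr, plot = 'confusion_matrix', scale = 2, save = True)  # writes a png to the cwd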
1481 |
1482 | scale: float, default = 1
1483 | The resolution scale of the figure.
1484 |
1485 |
1486 | save: bool, default = False
1487 | When set to True, plot is saved in the current working directory.
1488 |
1489 |
1490 | fold: int or scikit-learn compatible CV generator, default = None
1491 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1492 | parameter of the ``setup`` function is used. When an integer is passed,
1493 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1494 | ``setup`` function.
1495 |
1496 |
1497 | fit_kwargs: dict, default = {} (empty dict)
1498 | Dictionary of arguments passed to the fit method of the model.
1499 |
1500 |
1501 | groups: str or array-like, with shape (n_samples,), default = None
1502 | Optional group labels when GroupKFold is used for the cross validation.
1503 | It takes an array with shape (n_samples, ) where n_samples is the number
1504 | of rows in the training dataset. When a string is passed, it is interpreted
1505 | as the name of the column in the dataset that contains the group labels.
1506 |
1507 |
1508 | use_train_data: bool, default = False
1509 | When set to True, the training data is used for plots instead
1510 | of the test data.
1511 |
1512 |
1513 | verbose: bool, default = True
1514 | When set to False, progress bar is not displayed.
1515 |
1516 |
1517 | display_format: str, default = None
1518 | To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'.
1519 | Currently, not all plots are supported.
1520 |
1521 |
1522 | Returns:
1523 | Path to the saved plot file when ``save`` is True; otherwise None.
1524 |
1525 |
1526 | Warnings
1527 | --------
1528 | - Estimators that do not support the 'predict_proba' method cannot be used for
1529 |   the 'auc' and 'calibration' plots.
1530 |
1531 | - When the target is multiclass, 'calibration', 'threshold', 'manifold' and 'rfe'
1532 | plots are not available.
1533 |
1534 | - When the 'max_features' parameter of a trained model object is not equal to
1535 | the number of samples in the training set, the 'rfe' plot is not available.
1536 |
1537 | """
1538 |
1539 | return pycaret.internal.tabular.plot_model(
1540 | estimator=estimator,
1541 | plot=plot,
1542 | scale=scale,
1543 | save=save,
1544 | fold=fold,
1545 | fit_kwargs=fit_kwargs,
1546 | groups=groups,
1547 | verbose=verbose,
1548 | use_train_data=use_train_data,
1549 | system=True,
1550 | display_format=display_format,
1551 | )
1552 |
1553 |
1554 | def evaluate_model(
1555 | estimator,
1556 | fold: Optional[Union[int, Any]] = None,
1557 | fit_kwargs: Optional[dict] = None,
1558 | groups: Optional[Union[str, Any]] = None,
1559 | use_train_data: bool = False,
1560 | ):
1561 |
1562 | """
1563 | This function displays a user interface for analyzing the performance of a trained
1564 | model. It calls the ``plot_model`` function internally.
1565 |
1566 |
1567 | Example
1568 | -------
1569 | >>> from PyRapidML.datasets import extract_data
1570 | >>> juice = extract_data('juice')
1571 | >>> from PyRapidML.classification import *
1572 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1573 | >>> lr = creating_model('lr')
1574 | >>> evaluate_model(lr)
1575 |
1576 |
1577 | estimator: scikit-learn compatible object
1578 | Trained model object
1579 |
1580 |
1581 | fold: int or scikit-learn compatible CV generator, default = None
1582 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1583 | parameter of the ``setup`` function is used. When an integer is passed,
1584 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1585 | ``setup`` function.
1586 |
1587 |
1588 | fit_kwargs: dict, default = {} (empty dict)
1589 | Dictionary of arguments passed to the fit method of the model.
1590 |
1591 |
1592 | groups: str or array-like, with shape (n_samples,), default = None
1593 | Optional group labels when GroupKFold is used for the cross validation.
1594 | It takes an array with shape (n_samples, ) where n_samples is the number
1595 | of rows in the training dataset. When a string is passed, it is interpreted
1596 | as the name of the column in the dataset that contains the group labels.
1597 |
1598 |
1599 | use_train_data: bool, default = False
1600 | When set to True, the training data is used for plots instead
1601 | of the test data.
1602 |
1603 |
1604 | Returns:
1605 | None
1606 |
1607 |
1608 | Warnings
1609 | --------
1610 | - This function only works in an IPython-enabled notebook.
1611 |
1612 | """
1613 |
1614 | return pycaret.internal.tabular.evaluate_model(
1615 | estimator=estimator,
1616 | fold=fold,
1617 | fit_kwargs=fit_kwargs,
1618 | groups=groups,
1619 | use_train_data=use_train_data,
1620 | )
1621 |
1622 |
1623 | def interpret_model(
1624 | estimator,
1625 | plot: str = "summary",
1626 | feature: Optional[str] = None,
1627 | observation: Optional[int] = None,
1628 | use_train_data: bool = False,
1629 | X_new_sample: Optional[pd.DataFrame] = None,
1630 | save: bool = False,
1631 | **kwargs,
1632 | ):
1633 |
1634 | """
1635 | This function analyzes the predictions generated from a tree-based model. It is
1636 | implemented using SHAP (SHapley Additive exPlanations). For more information,
1637 | see https://shap.readthedocs.io/en/latest/
1638 |
1639 | Example
1640 | -------
1641 | >>> from PyRapidML.datasets import extract_data
1642 | >>> juice = extract_data('juice')
1643 | >>> from PyRapidML.classification import *
1644 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1645 | >>> xgboost = creating_model('xgboost')
1646 | >>> interpret_model(xgboost)
1647 |
1648 |
1649 | estimator: scikit-learn compatible object
1650 | Trained model object
1651 |
1652 |
1653 | plot: str, default = 'summary'
1654 | Type of plot. Available options are: 'summary', 'correlation', and 'reason'.
1655 |
1656 |
1657 | feature: str, default = None
1658 | Feature to check correlation with. This parameter is only required when ``plot``
1659 | type is 'correlation'. When set to None, it uses the first column in the train
1660 | dataset.
1661 |
1662 |
1663 | observation: int, default = None
1664 | Observation index number in holdout set to explain. When ``plot`` is not
1665 | 'reason', this parameter is ignored.
1666 |
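A minimal sketch of a per-observation explanation, assuming the ``xgboost``
model from the example above (the index 1 is arbitrary):

>>> interpret_model(xgboost, plot = 'reason', observation = 1)  # explains one holdout row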
1667 |
1668 | use_train_data: bool, default = False
1669 | When set to True, the training data is used for plots instead
1670 | of the test data.
1671 |
1672 |
1673 | X_new_sample: pd.DataFrame, default = None
1674 | Row from an out-of-sample dataframe (neither train nor test data) to be plotted.
1675 | The sample must have the same columns as the raw input data, and it is transformed
1676 | by the preprocessing pipeline automatically before plotting.
1677 |
1678 |
1679 | save: bool, default = False
1680 | When set to True, Plot is saved as a 'png' file in current working directory.
1681 |
1682 |
1683 | **kwargs**:
1684 | Additional keyword arguments to pass to the plot.
1685 |
1686 |
1687 | Returns:
1688 | None
1689 |
1690 | """
1691 |
1692 | return pycaret.internal.tabular.interpret_model(
1693 | estimator=estimator,
1694 | plot=plot,
1695 | feature=feature,
1696 | observation=observation,
1697 | use_train_data=use_train_data,
1698 | X_new_sample=X_new_sample,
1699 | save=save,
1700 | **kwargs,
1701 | )
1702 |
1703 |
1704 | def calibrate_model(
1705 | estimator,
1706 | method: str = "sigmoid",
1707 | fold: Optional[Union[int, Any]] = None,
1708 | round: int = 4,
1709 | fit_kwargs: Optional[dict] = None,
1710 | groups: Optional[Union[str, Any]] = None,
1711 | verbose: bool = True,
1712 | ) -> Any:
1713 |
1714 | """
1715 | This function calibrates the predicted probabilities of a given estimator
1716 | using isotonic or logistic regression. The output of this function is a score
1717 | grid with CV scores by fold. Metrics evaluated during CV can be accessed using
1718 | the ``get_metrics`` function. Custom metrics can be added or removed using the
1719 | ``add_metric`` and ``remove_metric`` functions.
1720 |
1721 |
1722 | Example
1723 | -------
1724 | >>> from PyRapidML.datasets import extract_data
1725 | >>> juice = extract_data('juice')
1726 | >>> from PyRapidML.classification import *
1727 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1728 | >>> dt = creating_model('dt')
1729 | >>> calibrated_dt = calibrate_model(dt)
1730 |
1731 |
1732 | estimator: scikit-learn compatible object
1733 | Trained model object
1734 |
1735 |
1736 | method: str, default = 'sigmoid'
1737 | The method to use for calibration. Can be 'sigmoid' which corresponds to
1738 | Platt's method or 'isotonic' which is a non-parametric approach.
1739 |
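A minimal sketch of the non-parametric alternative, assuming the ``dt`` model
from the example above (see the warning below on sample size):

>>> iso_dt = calibrate_model(dt, method = 'isotonic')  # isotonic instead of Platt scaling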
1740 |
1741 | fold: int or scikit-learn compatible CV generator, default = None
1742 | Controls cross-validation. If None, the CV generator in the ``fold_strategy``
1743 | parameter of the ``setup`` function is used. When an integer is passed,
1744 | it is interpreted as the 'n_splits' parameter of the CV generator in the
1745 | ``setup`` function.
1746 |
1747 |
1748 | round: int, default = 4
1749 | Number of decimal places the metrics in the score grid will be rounded to.
1750 |
1751 |
1752 | fit_kwargs: dict, default = {} (empty dict)
1753 | Dictionary of arguments passed to the fit method of the model.
1754 |
1755 |
1756 | groups: str or array-like, with shape (n_samples,), default = None
1757 | Optional group labels when GroupKFold is used for the cross validation.
1758 | It takes an array with shape (n_samples, ) where n_samples is the number
1759 | of rows in the training dataset. When a string is passed, it is interpreted
1760 | as the name of the column in the dataset that contains the group labels.
1761 |
1762 |
1763 | verbose: bool, default = True
1764 | Score grid is not printed when verbose is set to False.
1765 |
1766 |
1767 | Returns:
1768 | Trained Model
1769 |
1770 |
1771 | Warnings
1772 | --------
1773 | - Avoid isotonic calibration with too few calibration samples (< 1000) since it
1774 | tends to overfit.
1775 |
1776 | """
1777 |
1778 | return pycaret.internal.tabular.calibrate_model(
1779 | estimator=estimator,
1780 | method=method,
1781 | fold=fold,
1782 | round=round,
1783 | fit_kwargs=fit_kwargs,
1784 | groups=groups,
1785 | verbose=verbose,
1786 | )
1787 |
1788 |
1789 | def optimize_threshold(
1790 | estimator,
1791 | true_positive: int = 0,
1792 | true_negative: int = 0,
1793 | false_positive: int = 0,
1794 | false_negative: int = 0,
1795 | ):
1796 |
1797 | """
1798 | This function optimizes the probability threshold for a given estimator using a
1799 | custom cost function. The function displays a plot of the optimized cost as a
1800 | function of probability thresholds between 0.0 and 1.0 and returns the
1801 | optimized threshold value as a numpy float.
1802 |
1803 |
1804 | Example
1805 | -------
1806 | >>> from PyRapidML.datasets import extract_data
1807 | >>> juice = extract_data('juice')
1808 | >>> from PyRapidML.classification import *
1809 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1810 | >>> lr = creating_model('lr')
1811 | >>> optimize_threshold(lr, true_negative = 10, false_negative = -100)
1812 |
1813 |
1814 | estimator: scikit-learn compatible object
1815 | Trained model object
1816 |
1817 |
1818 | true_positive: int, default = 0
1819 | Cost function or returns for true positive.
1820 |
1821 |
1822 | true_negative: int, default = 0
1823 | Cost function or returns for true negative.
1824 |
1825 |
1826 | false_positive: int, default = 0
1827 | Cost function or returns for false positive.
1828 |
1829 |
1830 | false_negative: int, default = 0
1831 | Cost function or returns for false negative.
1832 |
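A minimal sketch of feeding the optimized threshold back into prediction,
assuming the ``lr`` model from the example above:

>>> best_t = optimize_threshold(lr, true_negative = 10, false_negative = -100)
>>> preds = predict_model(lr, probability_threshold = float(best_t))  # float() cast is defensive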
1833 |
1834 | Returns:
1835 | numpy.float64
1836 |
1837 |
1838 | Warnings
1839 | --------
1840 | - This function is not supported when target is multiclass.
1841 |
1842 | """
1843 |
1844 | return pycaret.internal.tabular.optimize_threshold(
1845 | estimator=estimator,
1846 | true_positive=true_positive,
1847 | true_negative=true_negative,
1848 | false_positive=false_positive,
1849 | false_negative=false_negative,
1850 | )
1851 |
1852 |
1853 | def predict_model(
1854 | estimator,
1855 | data: Optional[pd.DataFrame] = None,
1856 | probability_threshold: Optional[float] = None,
1857 | encoded_labels: bool = False,
1858 | raw_score: bool = False,
1859 | round: int = 4,
1860 | verbose: bool = True,
1861 | ) -> pd.DataFrame:
1862 |
1863 | """
1864 | This function predicts ``Label`` and ``Score`` (probability of predicted
1865 | class) using a trained model. When ``data`` is None, it predicts label and
1866 | score on the holdout set.
1867 |
1868 |
1869 | Example
1870 | -------
1871 | >>> from PyRapidML.datasets import extract_data
1872 | >>> juice = extract_data('juice')
1873 | >>> from PyRapidML.classification import *
1874 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1875 | >>> lr = creating_model('lr')
1876 | >>> pred_holdout = predict_model(lr)
1877 | >>> pred_unseen = predict_model(lr, data = unseen_dataframe)
1878 |
1879 |
1880 | estimator: scikit-learn compatible object
1881 | Trained model object
1882 |
1883 |
1884 | data: pandas.DataFrame, default = None
1885 | Shape (n_samples, n_features). All features used during training
1886 | must be available in the unseen dataset. When None, the holdout set is used.
1887 |
1888 |
1889 | probability_threshold: float, default = None
1890 | Threshold for converting predicted probability to class label.
1891 | It defaults to 0.5 for all classifiers unless explicitly defined
1892 | in this parameter.
1893 |
1894 |
1895 | encoded_labels: bool, default = False
1896 | When set to True, will return labels encoded as an integer.
1897 |
1898 |
1899 | raw_score: bool, default = False
1900 | When set to True, scores for all labels will be returned.
1901 |
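A minimal sketch combining a custom threshold with per-class scores, assuming
the ``lr`` model and ``unseen_dataframe`` from the example above (the 0.25
threshold is illustrative):

>>> pred = predict_model(lr, data = unseen_dataframe, probability_threshold = 0.25, raw_score = True)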
1902 |
1903 | round: int, default = 4
1904 | Number of decimal places the metrics in the score grid will be rounded to.
1905 |
1906 |
1907 | verbose: bool, default = True
1908 | When set to False, holdout score grid is not printed.
1909 |
1910 |
1911 | Returns:
1912 | pandas.DataFrame
1913 |
1914 |
1915 | Warnings
1916 | --------
1917 | - The behavior of ``predict_model`` changed in version 2.1 without backward
1918 |   compatibility. As such, pipelines trained with version <= 2.0 may not
1919 |   work for inference with version >= 2.1. You can either retrain your models with a
1920 | newer version or downgrade the version for inference.
1921 |
1922 | """
1923 |
1924 | return pycaret.internal.tabular.predict_model(
1925 | estimator=estimator,
1926 | data=data,
1927 | probability_threshold=probability_threshold,
1928 | encoded_labels=encoded_labels,
1929 | raw_score=raw_score,
1930 | round=round,
1931 | verbose=verbose,
1932 | ml_usecase=MLUsecase.CLASSIFICATION,
1933 | )
1934 |
1935 |
1936 | def finalize_model(
1937 | estimator,
1938 | fit_kwargs: Optional[dict] = None,
1939 | groups: Optional[Union[str, Any]] = None,
1940 | model_only: bool = True,
1941 | ) -> Any:
1942 |
1943 | """
1944 | This function trains a given estimator on the entire dataset including the
1945 | holdout set.
1946 |
1947 |
1948 | Example
1949 | -------
1950 | >>> from PyRapidML.datasets import extract_data
1951 | >>> juice = extract_data('juice')
1952 | >>> from PyRapidML.classification import *
1953 | >>> exp_name = initializer(data = juice, target = 'Purchase')
1954 | >>> lr = creating_model('lr')
1955 | >>> final_lr = finalize_model(lr)
1956 |
1957 |
1958 | estimator: scikit-learn compatible object
1959 | Trained model object
1960 |
1961 |
1962 | fit_kwargs: dict, default = {} (empty dict)
1963 | Dictionary of arguments passed to the fit method of the model.
1964 |
1965 |
1966 | groups: str or array-like, with shape (n_samples,), default = None
1967 | Optional group labels when GroupKFold is used for the cross validation.
1968 | It takes an array with shape (n_samples, ) where n_samples is the number
1969 | of rows in the training dataset. When a string is passed, it is interpreted
1970 | as the name of the column in the dataset that contains the group labels.
1971 |
1972 |
1973 | model_only: bool, default = True
1974 | When set to True, only the model object is re-trained on the entire dataset and
1975 | the transformations in the pipeline are ignored. When False, the full pipeline is re-trained and returned.
1976 |
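A minimal sketch of keeping the full pipeline at finalization, assuming the
``lr`` model from the example above:

>>> final_pipeline = finalize_model(lr, model_only = False)  # retains the preprocessing steps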
1977 |
1978 | Returns:
1979 | Trained Model
1980 |
1981 | """
1982 |
1983 | return pycaret.internal.tabular.finalize_model(
1984 | estimator=estimator,
1985 | fit_kwargs=fit_kwargs,
1986 | groups=groups,
1987 | model_only=model_only,
1988 | )
1989 |
1990 |
1991 | def deploy_model(
1992 | model, model_name: str, authentication: dict, platform: str = "aws",
1993 | ):
1994 |
1995 | """
1996 | This function deploys the transformation pipeline and trained model on cloud.
1997 |
1998 |
1999 | Example
2000 | -------
2001 | >>> from PyRapidML.datasets import extract_data
2002 | >>> juice = extract_data('juice')
2003 | >>> from PyRapidML.classification import *
2004 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2005 | >>> lr = creating_model('lr')
2006 | >>> deploy_model(model = lr, model_name = 'lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})
2007 |
2008 |
2009 | Amazon Web Service (AWS) users:
2010 | To deploy a model on AWS S3 ('aws'), environment variables must be set in your
2011 | local environment. To configure AWS environment variables, type ``aws configure``
2012 | in the command line. The following information from the IAM portal of your
2013 | Amazon console account is required:
2014 |
2015 | - AWS Access Key ID
2016 | - AWS Secret Access Key
2017 | - Default Region Name (can be seen under Global settings on your AWS console)
2018 |
2019 | More info: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html
2020 |
2021 |
2022 | Google Cloud Platform (GCP) users:
2023 | To deploy a model on Google Cloud Platform ('gcp'), a project must be created
2024 | using the command line or the GCP console. Once the project is created, you must create
2025 | a service account and download the service account key as a JSON file to set
2026 | environment variables in your local environment.
2027 |
2028 | More info: https://cloud.google.com/docs/authentication/production
2029 |
2030 |
2031 | Microsoft Azure (Azure) users:
2032 | To deploy a model on Microsoft Azure ('azure'), the environment variable for the
2033 | connection string must be set in your local environment. Go to the settings of the
2034 | storage account on the Azure portal to access the required connection string.
2035 |
2036 | More info: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?toc=%2Fpython%2Fazure%2FTOC.json
2037 |
2038 |
2039 | model: scikit-learn compatible object
2040 | Trained model object
2041 |
2042 |
2043 | model_name: str
2044 | Name of model.
2045 |
2046 |
2047 | authentication: dict
2048 | Dictionary of applicable authentication tokens.
2049 |
2050 | When platform = 'aws':
2051 | {'bucket' : 'S3-bucket-name'}
2052 |
2053 | When platform = 'gcp':
2054 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
2055 |
2056 | When platform = 'azure':
2057 | {'container': 'azure-container-name'}
2058 |
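Minimal sketches for the other platforms, assuming the ``lr`` model from the
example above (bucket and container names are placeholders):

>>> deploy_model(lr, 'lr-gcp', platform = 'gcp', authentication = {'project': 'gcp-project-name', 'bucket': 'gcp-bucket-name'})
>>> deploy_model(lr, 'lr-azure', platform = 'azure', authentication = {'container': 'azure-container-name'})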
2059 |
2060 | platform: str, default = 'aws'
2061 | Name of the cloud platform. Currently supported platforms: 'aws', 'gcp' and 'azure'.
2062 |
2063 |
2064 | Returns:
2065 | None
2066 |
2067 | """
2068 |
2069 | return pycaret.internal.tabular.deploy_model(
2070 | model=model,
2071 | model_name=model_name,
2072 | authentication=authentication,
2073 | platform=platform,
2074 | )
2075 |
2076 |
2077 | def save_model(
2078 | model, model_name: str, model_only: bool = False, verbose: bool = True, **kwargs
2079 | ):
2080 |
2081 | """
2082 | This function saves the transformation pipeline and trained model object
2083 | into the current working directory as a pickle file for later use.
2084 |
2085 | Example
2086 | -------
2087 | >>> from PyRapidML.datasets import extract_data
2088 | >>> juice = extract_data('juice')
2089 | >>> from PyRapidML.classification import *
2090 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2091 | >>> lr = creating_model('lr')
2092 | >>> save_model(lr, 'saved_lr_model')
2093 |
2094 |
2095 | model: scikit-learn compatible object
2096 | Trained model object
2097 |
2098 |
2099 | model_name: str
2100 | Name of the model.
2101 |
2102 |
2103 | model_only: bool, default = False
2104 | When set to True, only trained model object is saved instead of the
2105 | entire pipeline.
2106 |
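A minimal sketch of saving only the bare estimator, assuming the ``lr`` model
from the example above:

>>> model, path = save_model(lr, 'bare_lr', model_only = True)  # returns (model, filename)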
2107 |
2108 | verbose: bool, default = True
2109 | Success message is not printed when verbose is set to False.
2110 |
2111 |
2112 | **kwargs**:
2113 | Additional keyword arguments to pass to joblib.dump().
2114 |
2115 |
2116 | Returns:
2117 | Tuple of the model object and the filename.
2118 |
2119 | """
2120 |
2121 | return pycaret.internal.tabular.save_model(
2122 | model=model,
2123 | model_name=model_name,
2124 | model_only=model_only,
2125 | verbose=verbose,
2126 | **kwargs,
2127 | )
2128 |
2129 |
2130 | def load_model(
2131 | model_name,
2132 | platform: Optional[str] = None,
2133 | authentication: Optional[Dict[str, str]] = None,
2134 | verbose: bool = True,
2135 | ):
2136 |
2137 | """
2138 | This function loads a previously saved pipeline.
2139 |
2140 |
2141 | Example
2142 | -------
2143 | >>> from PyRapidML.classification import load_model
2144 | >>> saved_lr = load_model('saved_lr_model')
2145 |
2146 |
2147 | model_name: str
2148 | Name of the model.
2149 |
2150 |
2151 | platform: str, default = None
2152 | Name of the cloud platform. Currently supported platforms:
2153 | 'aws', 'gcp' and 'azure'.
2154 |
2155 |
2156 | authentication: dict, default = None
2157 | Dictionary of applicable authentication tokens.
2158 |
2159 | When platform = 'aws':
2160 | {'bucket' : 'S3-bucket-name'}
2161 |
2162 | When platform = 'gcp':
2163 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}
2164 |
2165 | When platform = 'azure':
2166 | {'container': 'azure-container-name'}
2167 |
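A minimal sketch of loading directly from cloud storage (the bucket name is a
placeholder for a model previously saved with ``deploy_model``):

>>> saved_lr = load_model('lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})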
2168 |
2169 | verbose: bool, default = True
2170 | Success message is not printed when verbose is set to False.
2171 |
2172 |
2173 | Returns:
2174 | Trained Model
2175 |
2176 | """
2177 |
2178 | return pycaret.internal.tabular.load_model(
2179 | model_name=model_name,
2180 | platform=platform,
2181 | authentication=authentication,
2182 | verbose=verbose,
2183 | )
2184 |
2185 |
2186 | def automl(optimize: str = "Accuracy", use_holdout: bool = False) -> Any:
2187 |
2188 | """
2189 | This function returns the best model out of all trained models in
2190 | the current session based on the ``optimize`` parameter. Metrics
2191 | evaluated can be accessed using the ``get_metrics`` function.
2192 |
2193 |
2194 | Example
2195 | -------
2196 | >>> from PyRapidML.datasets import extract_data
2197 | >>> juice = extract_data('juice')
2198 | >>> from PyRapidML.classification import *
2199 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2200 | >>> top3 = comparing_models(n_select = 3)
2201 | >>> tuned_top3 = [tuning_model(i) for i in top3]
2202 | >>> blender = blend_models(tuned_top3)
2203 | >>> stacker = stack_models(tuned_top3)
2204 | >>> best_auc_model = automl(optimize = 'AUC')
2205 |
2206 |
2207 | optimize: str, default = 'Accuracy'
2208 | Metric to use for model selection. It also accepts custom metrics
2209 | added using the ``add_metric`` function.
2210 |
2211 |
2212 | use_holdout: bool, default = False
2213 | When set to True, metrics are evaluated on holdout set instead of CV.
2214 |
2215 |
2216 | Returns:
2217 | Trained Model
2218 |
2219 | """
2220 | return pycaret.internal.tabular.automl(optimize=optimize, use_holdout=use_holdout)
2221 |
2222 |
2223 | def pull(pop: bool = False) -> pd.DataFrame:
2224 |
2225 | """
2226 | Returns the last printed score grid. Use the ``pull`` function after
2227 | any training function to store the score grid in a pandas.DataFrame.
2228 |
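Example
-------
A minimal sketch, assuming a session initialized as in the other examples:

>>> best = comparing_models()
>>> results = pull()  # score grid of the last comparison as a pandas.DataFrame
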
2229 |
2230 | pop: bool, default = False
2231 | If True, will pop (remove) the returned dataframe from the
2232 | display container.
2233 |
2234 |
2235 | Returns:
2236 | pandas.DataFrame
2237 |
2238 | """
2239 | return pycaret.internal.tabular.pull(pop=pop)
2240 |
2241 |
2242 | def models(
2243 | type: Optional[str] = None, internal: bool = False, raise_errors: bool = True,
2244 | ) -> pd.DataFrame:
2245 |
2246 | """
2247 | Returns a table of models available in the model library.
2248 |
2249 | Example
2250 | -------
2251 | >>> from PyRapidML.datasets import extract_data
2252 | >>> juice = extract_data('juice')
2253 | >>> from PyRapidML.classification import *
2254 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2255 | >>> all_models = models()
2256 |
2257 |
2258 | type: str, default = None
2259 | - linear : filters and only returns linear models
2260 | - tree : filters and only returns tree based models
2261 | - ensemble : filters and only returns ensemble models
2262 |
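For instance, a minimal sketch of filtering to tree based models only:

>>> tree_models = models(type = 'tree')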
2263 |
2264 | internal: bool, default = False
2265 | When True, will return extra columns and rows used internally.
2266 |
2267 |
2268 | raise_errors: bool, default = True
2269 | When False, will suppress all exceptions, ignoring models
2270 | that couldn't be created.
2271 |
2272 |
2273 | Returns:
2274 | pandas.DataFrame
2275 |
2276 | """
2277 | return pycaret.internal.tabular.models(
2278 | type=type, internal=internal, raise_errors=raise_errors
2279 | )
2280 |
2281 |
2282 | def get_metrics(
2283 | reset: bool = False, include_custom: bool = True, raise_errors: bool = True,
2284 | ) -> pd.DataFrame:
2285 |
2286 | """
2287 | Returns a table of available metrics used for CV.
2288 |
2289 |
2290 | Example
2291 | -------
2292 | >>> from PyRapidML.datasets import extract_data
2293 | >>> juice = extract_data('juice')
2294 | >>> from PyRapidML.classification import *
2295 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2296 | >>> all_metrics = get_metrics()
2297 |
2298 |
2299 | reset: bool, default = False
2300 | When True, will reset all changes made using the ``add_metric``
2301 | and ``remove_metric`` functions.
2302 |
2303 |
2304 | include_custom: bool, default = True
2305 | Whether to include user added (custom) metrics or not.
2306 |
2307 |
2308 | raise_errors: bool, default = True
2309 | If False, will suppress all exceptions, ignoring models that
2310 | couldn't be created.
2311 |
2312 |
2313 | Returns:
2314 | pandas.DataFrame
2315 |
2316 | """
2317 |
2318 | return pycaret.internal.tabular.get_metrics(
2319 | reset=reset, include_custom=include_custom, raise_errors=raise_errors,
2320 | )
2321 |
2322 |
2323 | def add_metric(
2324 | id: str,
2325 | name: str,
2326 | score_func: type,
2327 | target: str = "pred",
2328 | greater_is_better: bool = True,
2329 | multiclass: bool = True,
2330 | **kwargs,
2331 | ) -> pd.Series:
2332 |
2333 | """
2334 | Adds a custom metric to be used for CV.
2335 |
2336 |
2337 | Example
2338 | -------
2339 | >>> from PyRapidML.datasets import extract_data
2340 | >>> juice = extract_data('juice')
2341 | >>> from PyRapidML.classification import *
2342 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2343 | >>> from sklearn.metrics import log_loss
2344 | >>> add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
2345 |
2346 |
2347 | id: str
2348 | Unique id for the metric.
2349 |
2350 |
2351 | name: str
2352 | Display name of the metric.
2353 |
2354 |
2355 | score_func: type
2356 | Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``.
2357 |
2358 |
2359 | target: str, default = 'pred'
2360 | The target of the score function.
2361 |
2362 | - 'pred' for the prediction table
2363 | - 'pred_proba' for pred_proba
2364 | - 'threshold' for decision_function or predict_proba
2365 |
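A minimal sketch of a probability-based metric; ``log_loss`` consumes predicted
probabilities, hence target = 'pred_proba' (the id 'logloss2' is arbitrary):

>>> from sklearn.metrics import log_loss
>>> add_metric('logloss2', 'Log Loss 2', log_loss, target = 'pred_proba', greater_is_better = False)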
2366 |
2367 | greater_is_better: bool, default = True
2368 | Whether a higher ``score_func`` value indicates a better result.
2369 |
2370 |
2371 | multiclass: bool, default = True
2372 | Whether the metric supports multiclass target.
2373 |
2374 |
2375 | **kwargs**:
2376 | Arguments to be passed to score function.
2377 |
2378 |
2379 | Returns:
2380 | pandas.Series
2381 |
2382 | """
2383 |
2384 | return pycaret.internal.tabular.add_metric(
2385 | id=id,
2386 | name=name,
2387 | score_func=score_func,
2388 | target=target,
2389 | greater_is_better=greater_is_better,
2390 | multiclass=multiclass,
2391 | **kwargs,
2392 | )
2393 |
2394 |
2395 | def remove_metric(name_or_id: str):
2396 |
2397 | """
2398 | Removes a metric from CV.
2399 |
2400 |
2401 | Example
2402 | -------
2403 | >>> from PyRapidML.datasets import extract_data
2404 | >>> juice = extract_data('juice')
2405 | >>> from PyRapidML.classification import *
2406 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2407 | >>> remove_metric('MCC')
2408 |
2409 |
2410 | name_or_id: str
2411 | Display name or ID of the metric.
2412 |
2413 |
2414 | Returns:
2415 | None
2416 |
2417 | """
2418 | return pycaret.internal.tabular.remove_metric(name_or_id=name_or_id)
2419 |
2420 |
2421 | def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame:
2422 |
2423 | """
2424 | Returns a table of experiment logs. Only works when ``log_experiment``
2425 | is set to True when initializing the ``setup`` function.
2426 |
2427 |
2428 | Example
2429 | -------
2430 | >>> from PyRapidML.datasets import extract_data
2431 | >>> juice = extract_data('juice')
2432 | >>> from PyRapidML.classification import *
2433 | >>> exp_name = initializer(data = juice, target = 'Purchase', log_experiment = True)
2434 | >>> best = comparing_models()
2435 | >>> exp_logs = get_logs()
2436 |
2437 |
2438 | experiment_name: str, default = None
2439 | When None, the current active run is used.
2440 |
2441 |
2442 | save: bool, default = False
2443 | When set to True, a csv file is saved in the current working directory.
2444 |
2445 |
2446 | Returns:
2447 | pandas.DataFrame
2448 |
2449 | """
2450 |
2451 | return pycaret.internal.tabular.get_logs(experiment_name=experiment_name, save=save)
2452 |
2453 |
2454 | def get_config(variable: str):
2455 |
2456 | """
2457 | This function retrieves the global variables created when initializing the
2458 | ``setup`` function. The following variables are accessible:
2459 |
2460 | - X: Transformed dataset (X)
2461 | - y: Transformed dataset (y)
2462 | - X_train: Transformed train dataset (X)
2463 | - X_test: Transformed test/holdout dataset (X)
2464 | - y_train: Transformed train dataset (y)
2465 | - y_test: Transformed test/holdout dataset (y)
2466 | - seed: random state set through session_id
2467 | - prep_pipe: Transformation pipeline
2468 | - fold_shuffle_param: shuffle parameter used in Kfolds
2469 | - n_jobs_param: n_jobs parameter used in model training
2470 | - html_param: html_param configured through setup
2471 | - create_model_container: results grid storage container
2472 | - master_model_container: model storage container
2473 | - display_container: results display container
2474 | - exp_name_log: Name of experiment
2475 | - logging_param: log_experiment param
2476 | - log_plots_param: log_plots param
2477 | - USI: Unique session ID parameter
2478 | - fix_imbalance_param: fix_imbalance param
2479 | - fix_imbalance_method_param: fix_imbalance_method param
2480 | - data_before_preprocess: data before preprocessing
2481 | - target_param: name of target variable
2482 | - gpu_param: use_gpu param configured through setup
2483 | - fold_generator: CV splitter configured in fold_strategy
2484 | - fold_param: fold params defined in the setup
2485 | - fold_groups_param: fold groups defined in the setup
2486 | - stratify_param: stratify parameter defined in the setup
2487 |
2488 |
2489 | Example
2490 | -------
2491 | >>> from PyRapidML.datasets import extract_data
2492 | >>> juice = extract_data('juice')
2493 | >>> from PyRapidML.classification import *
2494 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2495 | >>> X_train = get_config('X_train')
2496 |
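Similarly, a minimal sketch of retrieving the fitted preprocessing pipeline
(``prep_pipe`` is listed above):

>>> prep_pipe = get_config('prep_pipe')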
2497 |
2498 | Returns:
2499 | Global variable
2500 |
2501 | """
2502 |
2503 | return pycaret.internal.tabular.get_config(variable=variable)
2504 |
2505 |
2506 | def set_config(variable: str, value):
2507 |
2508 | """
2509 | This function resets the global variables. The following variables are
2510 | accessible:
2511 |
2512 | - X: Transformed dataset (X)
2513 | - y: Transformed dataset (y)
2514 | - X_train: Transformed train dataset (X)
2515 | - X_test: Transformed test/holdout dataset (X)
2516 | - y_train: Transformed train dataset (y)
2517 | - y_test: Transformed test/holdout dataset (y)
2518 | - seed: random state set through session_id
2519 | - prep_pipe: Transformation pipeline
2520 | - fold_shuffle_param: shuffle parameter used in Kfolds
2521 | - n_jobs_param: n_jobs parameter used in model training
2522 | - html_param: html_param configured through setup
2523 | - create_model_container: results grid storage container
2524 | - master_model_container: model storage container
2525 | - display_container: results display container
2526 | - exp_name_log: Name of experiment
2527 | - logging_param: log_experiment param
2528 | - log_plots_param: log_plots param
2529 | - USI: Unique session ID parameter
2530 | - fix_imbalance_param: fix_imbalance param
2531 | - fix_imbalance_method_param: fix_imbalance_method param
2532 | - data_before_preprocess: data before preprocessing
2533 | - target_param: name of target variable
2534 | - gpu_param: use_gpu param configured through setup
2535 | - fold_generator: CV splitter configured in fold_strategy
2536 | - fold_param: fold params defined in the setup
2537 | - fold_groups_param: fold groups defined in the setup
2538 | - stratify_param: stratify parameter defined in the setup
2539 |
2540 | Example
2541 | -------
2542 | >>> from PyRapidML.datasets import extract_data
2543 | >>> juice = extract_data('juice')
2544 | >>> from PyRapidML.classification import *
2545 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2546 | >>> set_config('seed', 123)
2547 |
2548 |
2549 | Returns:
2550 | None
2551 |
2552 | """
2553 |
2554 | return pycaret.internal.tabular.set_config(variable=variable, value=value)
2555 |
2556 |
2557 | def save_config(file_name: str):
2558 |
2559 | """
2560 | This function saves all global variables to a pickle file, allowing you to
2561 | resume later without rerunning the ``setup``.
2562 |
2563 |
2564 | Example
2565 | -------
2566 | >>> from PyRapidML.datasets import extract_data
2567 | >>> juice = extract_data('juice')
2568 | >>> from PyRapidML.classification import *
2569 | >>> exp_name = initializer(data = juice, target = 'Purchase')
2570 | >>> save_config('myvars.pkl')
2571 |
2572 |
2573 | Returns:
2574 | None
2575 |
2576 | """
2577 |
2578 | return pycaret.internal.tabular.save_config(file_name=file_name)
2579 |
2580 |
2581 | def load_config(file_name: str):
2582 |
2583 | """
2584 | This function loads global variables from a pickle file into the Python
2585 | environment.
2586 |
2587 |
2588 | Example
2589 | -------
2590 | >>> from PyRapidML.classification import load_config
2591 | >>> load_config('myvars.pkl')
2592 |
2593 |
2594 | Returns:
2595 | Global variables
2596 |
2597 | """
2598 |
2599 | return pycaret.internal.tabular.load_config(file_name=file_name)
2600 |
--------------------------------------------------------------------------------