├── .idea ├── .name ├── .gitignore ├── vcs.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml └── PyRapidML.iml ├── PyRapidML ├── __init__.py ├── .DS_Store ├── datasets.py ├── eda.py ├── utils.py ├── regression.py └── classification.py ├── .DS_Store ├── docs ├── .DS_Store ├── source │ ├── .DS_Store │ ├── api │ │ ├── .DS_Store │ │ ├── datasets.rst │ │ ├── regression.rst │ │ ├── classification.rst │ │ ├── eda.rst │ │ └── natural_language_processing.rst │ ├── _static │ │ ├── .DS_Store │ │ └── css │ │ │ └── custom.css │ ├── requirements.txt │ ├── index.rst │ └── conf.py ├── Makefile └── make.bat ├── tests └── .DS_Store ├── Tutorials ├── .DS_Store ├── Regression │ ├── .DS_Store │ └── Final ET Model 30May2021.pkl └── Classification │ ├── .DS_Store │ └── Final RF Model 11Nov2020.pkl ├── .readthedocs.yaml.swp ├── .readthedocs.yaml ├── setup.py ├── LICENSE ├── .gitignore └── README.md /.idea/.name: -------------------------------------------------------------------------------- 1 | index.rst -------------------------------------------------------------------------------- /PyRapidML/__init__.py: -------------------------------------------------------------------------------- 1 | from PyRapidML.utils import __version__ 2 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/.DS_Store -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/.DS_Store -------------------------------------------------------------------------------- /tests/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/tests/.DS_Store -------------------------------------------------------------------------------- /PyRapidML/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/PyRapidML/.DS_Store -------------------------------------------------------------------------------- /Tutorials/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/.DS_Store -------------------------------------------------------------------------------- /.readthedocs.yaml.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/.readthedocs.yaml.swp -------------------------------------------------------------------------------- /docs/source/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/.DS_Store -------------------------------------------------------------------------------- /docs/source/api/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/api/.DS_Store -------------------------------------------------------------------------------- /docs/source/_static/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/docs/source/_static/.DS_Store -------------------------------------------------------------------------------- /Tutorials/Regression/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Regression/.DS_Store -------------------------------------------------------------------------------- /docs/source/api/datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | =================== 3 | .. automodule:: PyRapidML.datasets 4 | :members: -------------------------------------------------------------------------------- /Tutorials/Classification/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Classification/.DS_Store -------------------------------------------------------------------------------- /docs/source/api/regression.rst: -------------------------------------------------------------------------------- 1 | Regression 2 | =================== 3 | .. automodule:: PyRapidML.regression 4 | :members: -------------------------------------------------------------------------------- /docs/source/api/classification.rst: -------------------------------------------------------------------------------- 1 | Classification 2 | =================== 3 | .. automodule:: PyRapidML.classification 4 | :members: -------------------------------------------------------------------------------- /docs/source/api/eda.rst: -------------------------------------------------------------------------------- 1 | Exploratory Data Analysis 2 | ===================================== 3 | .. automodule:: PyRapidML.eda 4 | :members: -------------------------------------------------------------------------------- /Tutorials/Regression/Final ET Model 30May2021.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Regression/Final ET Model 30May2021.pkl -------------------------------------------------------------------------------- /docs/source/api/natural_language_processing.rst: -------------------------------------------------------------------------------- 1 | NLP 2 | =================== 3 | .. 
automodule:: PyRapidML.natural_language_processing 4 | :members: -------------------------------------------------------------------------------- /Tutorials/Classification/Final RF Model 11Nov2020.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zainali5/PyRapidML/HEAD/Tutorials/Classification/Final RF Model 11Nov2020.pkl -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/PyRapidML.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=3.0.0 2 | sphinx-rtd-theme>=0.5.0 3 | pandas 4 | scipy<=1.5.4 5 | numpy==1.19.5 6 | seaborn 7 | matplotlib 8 | IPython 9 | joblib 10 | scikit-learn==0.23.2 11 | ipywidgets 12 | yellowbrick>=1.0.1 13 | lightgbm>=2.3.1 14 | plotly>=4.4.1 15 | wordcloud 16 | textblob 17 | cufflinks>=0.17.0 18 | umap-learn 19 | pyLDAvis 20 | gensim<4.0.0 21 | spacy<2.4.0 22 | nltk 23 | mlxtend>=0.17.0 24 | pyod 25 | pandas-profiling>=2.8.0 26 | kmodes>=0.10.1 27 | mlflow 28 | imbalanced-learn==0.7.0 29 | scikit-plot #for lift and gain charts 30 | Boruta 31 | pycaret 32 | typing 33 | 34 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: 14 | - pdf 15 | 16 | # Optionally set the version of Python and requirements required to build your docs 17 | python: 18 | version: 3.7 19 | install: 20 | - requirements: docs/source/requirements.txt 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r") as f:
4 |     long_description = f.read()
5 |
6 | setup(
7 |     name="PyRapidML",
8 |     version="1.0.13",
9 |     author="Zain Ali",
10 |     author_email="zainbalouch3@gmail.com",
11 |     description="An open source and low code machine learning library for quick and robust analysis",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/Zainali5/PyRapidML",
15 |     packages=find_packages(),
16 |     classifiers=[
17 |         "Programming Language :: Python :: 3",
18 |         "License :: OSI Approved :: MIT License",
19 |         "Operating System :: OS Independent",
20 |     ]
21 | )
22 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Zainali5
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/docs/source/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | .rst-content dl:not(.docutils) dt:first-child {
2 |     margin-top: 0;
3 | }
4 |
5 | .rst-content dl:not(.docutils) dl dt {
6 |     margin-bottom: 4px;
7 |     border: none;
8 |     border-left: solid 3px #ccc;
9 |     background: #f0f0f0;
10 |     color: #555;
11 | }
12 |
13 | .rst-content dl table,
14 | .rst-content dl ul,
15 | .rst-content dl ol,
16 | .rst-content dl p {
17 |     margin-bottom: 8px !important;
18 | }
19 |
20 | .rst-content dl:not(.docutils) dt {
21 |     display: table;
22 |     margin: 6px 0;
23 |     font-size: 90%;
24 |     line-height: normal;
25 |     background: #e7f2fa;
26 |     color: #2980b9;
27 |     border-top: solid 3px #6ab0de;
28 |     padding: 6px;
29 |     position: relative;
30 | }
31 |
32 | html.writer-html5 .rst-content dl.field-list {
33 |     display: initial;
34 | }
35 |
36 | html.writer-html5 .rst-content dl.field-list > dd,
37 | html.writer-html5 .rst-content dl.field-list > dt {
38 |     margin-bottom: 4px;
39 |     padding-left: 6px;
40 | }
41 |
42 | p {
43 |     line-height: 20px;
44 |     font-size: 14px;
45 | }
46 |
47 | html.writer-html5 .rst-content dl.field-list > dt:after {
48 |     content: initial;
49 | }
50 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. PyRapidML documentation master file, created by
2 |    sphinx-quickstart on Tue Jun  8 22:08:54 2021.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | PyRapidML Homepage!
7 | =====================================
8 |
9 | PyRapidML is an open source Python library that not only helps in automating machine learning workflows but also helps in building end-to-end ML solutions.
10 | PyRapidML is essentially a Python wrapper around several machine learning libraries and frameworks such as PyCaret, scikit-learn, XGBoost, LightGBM, CatBoost, spaCy, Optuna, Hyperopt, Ray, and many more.
11 |
12 | PyRapidML is a low-code library: with only a few lines of code, you can achieve high accuracy in your machine learning models.
13 | There is no need to write hefty amounts of code, as PyRapidML compares all applicable machine learning algorithms for your problem in a single line of code.
14 | Once PyRapidML identifies the best algorithm, you can tune it further in a single additional line of code.
15 |
16 | - Are you tired of writing hefty amounts of code for your data science problems?
17 | - Are you having difficulty figuring out which algorithm performs the best?
18 | - Is it hard for you to compare multiple algorithms and see which one has the best accuracy?
19 | - Do you face issues in hyperparameter tuning?
20 | - Do you want easy model deployment?
21 | - Do you dream of AutoML?
22 | - Are you facing problems in exploratory data analysis?
23 | - Do you want a library that can automatically perform all steps of the data science lifecycle?
24 | - Do you want a library that can do exploratory data analysis, feature engineering, feature selection, comparison of multiple machine learning algorithms, hyperparameter tuning, and model deployment?
25 |
26 | If the answer is Yes to the above questions, then PyRapidML is the library for you.
27 |
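28 |
29 | Quick example
30 | =====================================
31 |
32 | A minimal sketch of the intended workflow, assembled from the examples in the
33 | module docstrings (the dataset name and target column are illustrative, and
34 | ``extract_data`` needs an internet connection to download sample data):
35 |
36 | .. code-block:: python
37 |
38 |    >>> from PyRapidML.datasets import extract_data
39 |    >>> boston = extract_data('boston')
40 |    >>> from PyRapidML.regression import *
41 |    >>> exp_name = initializer(data = boston, target = 'medv')
42 |    >>> best_model = comparing_models()
43 |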
44 |
45 | .. toctree::
46 |    :maxdepth: 2
47 |    :hidden:
48 |    :caption: Getting Started
49 |
50 |    self
51 |
52 | .. toctree::
53 |    :maxdepth: 2
54 |    :hidden:
55 |    :caption: Documentation
56 |
57 |    api/classification
58 |    api/regression
59 |    api/natural_language_processing
60 |    api/datasets
61 |    api/eda
--------------------------------------------------------------------------------
/PyRapidML/datasets.py:
--------------------------------------------------------------------------------
1 | # Module: Datasets
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 30/05/2021
6 |
7 |
8 | def extract_data(dataset="index", save_copy=False, profile=False, verbose=True):
9 |
10 |     """
11 |     This function loads sample datasets from a git repository. The list of available
12 |     datasets can be checked using ``extract_data('index')``.
13 |
14 |
15 |     Example
16 |     -------
17 |     >>> from PyRapidML.datasets import extract_data
18 |     >>> all_datasets = extract_data('index')
19 |     >>> juice = extract_data('juice')
20 |
21 |
22 |     dataset: str, default = 'index'
23 |         Index value of dataset.
24 |
25 |
26 |     save_copy: bool, default = False
27 |         When set to True, it saves a copy in the current working directory.
28 |
29 |
30 |     profile: bool, default = False
31 |         When set to True, an interactive EDA report is displayed.
32 |
33 |
34 |     verbose: bool, default = True
35 |         When set to False, the head of the data is not displayed.
36 |
37 |
38 |     Returns:
39 |         pandas.DataFrame
40 |
41 |
42 |     Warnings
43 |     --------
44 |     - Use of ``extract_data`` requires an internet connection.
45 | 46 | """ 47 | 48 | import pandas as pd 49 | import os.path 50 | from IPython.display import display, HTML, clear_output, update_display 51 | 52 | address = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/" 53 | extension = ".csv" 54 | filename = str(dataset) + extension 55 | 56 | complete_address = address + filename 57 | 58 | if os.path.isfile(filename): 59 | data = pd.read_csv(filename) 60 | else: 61 | data = pd.read_csv(complete_address) 62 | 63 | # create a copy for pandas profiler 64 | data_for_profiling = data.copy() 65 | 66 | if save_copy: 67 | save_name = filename 68 | data.to_csv(save_name, index=False) 69 | 70 | if dataset == "index": 71 | display(data) 72 | 73 | else: 74 | if profile: 75 | import pandas_profiling 76 | 77 | pf = pandas_profiling.ProfileReport(data_for_profiling) 78 | display(pf) 79 | 80 | else: 81 | if verbose: 82 | display(data.head()) 83 | 84 | return data 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../..")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "PyRapidML" 22 | copyright = "2021, Zain Ali" 23 | author = "Zain Ali" 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = "1.0.13" 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | 35 | extensions = [ 36 | "sphinx_rtd_theme", 37 | "sphinx.ext.autodoc", 38 | "sphinx.ext.napoleon", 39 | ] 40 | 41 | napoleon_google_docstring = True 42 | napoleon_numpy_docstring = True 43 | 44 | autodoc_mock_imports = ["setup"] 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ["_templates"] 47 | 48 | # List of patterns, relative to source directory, that match files and 49 | # directories to ignore when looking for source files. 50 | # This pattern also affects html_static_path and html_extra_path. 51 | exclude_patterns = [] 52 | 53 | # Sort methods by the order they are found in the source files 54 | autodoc_member_order = "bysource" 55 | 56 | 57 | # -- Options for HTML output ------------------------------------------------- 58 | 59 | # The theme to use for HTML and HTML Help pages. See the documentation for 60 | # a list of builtin themes. 61 | # 62 | html_theme = "sphinx_rtd_theme" 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 
67 | html_static_path = ["_static"]
68 |
69 | html_css_files = ["css/custom.css"]
70 |
71 | master_doc = "index"
--------------------------------------------------------------------------------
/PyRapidML/eda.py:
--------------------------------------------------------------------------------
1 |
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | def check_na(dataset):
11 |     """
12 |
13 |     This function checks for missing values and prints the percentage of missing values in each feature.
14 |
15 |
16 |     Example
17 |     -------
18 |     >>> from PyRapidML.eda import check_na
19 |     >>> na_perc = check_na(df)
20 |
21 |     df: dataframe
22 |
23 |
24 |     """
25 |     # Step 1: list the features that contain missing values
26 |     features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]
27 |     # Step 2: print each feature name and its percentage of missing values
28 |     if len(features_with_na) > 0:
29 |         for feature in features_with_na:
30 |             # isnull().mean() is a fraction, so multiply by 100 to report a percentage
31 |             print(feature, np.round(dataset[feature].isnull().mean() * 100, 4), '% missing values')
32 |     else:
33 |         print("No Missing Values")
34 |
35 |
36 |
37 |
38 | def numerical_features(dataset):
39 |     # list of numerical variables
40 |     """
41 |
42 |     This function reports the total number of numerical features and how many of
43 |     them are discrete and how many are continuous.
44 |
45 |     Example
46 |     -------
47 |     >>> from PyRapidML.eda import numerical_features
48 |     >>> num_fea = numerical_features(df)
49 |
50 |     df: dataframe
51 |
52 |
53 |     """
54 |     numerical_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
55 |
56 |     print('Number of numerical variables: ', len(numerical_features))
57 |
58 |     # Numerical variables are usually of 2 types:
59 |     # 1. continuous variables and 2. discrete variables
60 |     discrete_feature = [feature for feature in numerical_features if len(dataset[feature].unique()) < 25]
61 |     print("Discrete Variables Count: {}".format(len(discrete_feature)))
62 |
63 |     continuous_feature = [feature for feature in numerical_features if feature not in discrete_feature]
64 |     print("Continuous feature Count {}".format(len(continuous_feature)))
65 |
66 |
67 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyRapidML
2 | ## Introduction
3 | - Are you tired of writing hefty amounts of code for your data science problems?
4 | - Are you having difficulty figuring out which algorithm performs the best?
5 | - Is it hard for you to compare multiple algorithms and see which one has the best accuracy?
6 | - Do you face issues in hyperparameter tuning?
7 | - Do you want easy model deployment?
8 | - Do you dream of AutoML?
9 | - Are you facing problems in exploratory data analysis?
10 | - Do you want a library that can automatically perform all steps of the data science lifecycle?
11 | - Do you want a library that can do exploratory data analysis, feature engineering, feature selection, comparison of multiple machine learning algorithms, hyperparameter tuning, and model deployment?
12 |
13 | If the answer is Yes to the above questions, then PyRapidML is the library for you.
14 |
15 | PyRapidML is an open source Python machine learning library that not only helps in automating machine learning workflows but also helps in building end-to-end ML solutions.
16 | PyRapidML is essentially a Python wrapper around several machine learning libraries and frameworks such as PyCaret, scikit-learn, XGBoost, LightGBM, CatBoost, spaCy, Optuna, Hyperopt, Ray, and many more.
17 |
18 | PyRapidML is a low-code library: with only a few lines of code, you can achieve high accuracy in your machine learning models.
19 | There is no need to write hefty amounts of code, as PyRapidML compares all applicable machine learning algorithms for your problem in a single line of code.
20 | Once PyRapidML identifies the best algorithm, you can tune it further in a single additional line of code.
21 |
22 | The initial idea for PyRapidML was inspired by the PyCaret library.
23 |
24 | ## What data science problems can PyRapidML cater to?
25 | - Regression
26 | - Classification
27 | - Natural Language Processing
28 |
29 | ## What does PyRapidML currently offer?
30 | - Data preparation
31 | - Exploratory data analysis
32 | - Model training
33 | - Finding the best ML model
34 | - Hyperparameter tuning
35 | - Model deployment
36 | - Analysis and interpretability
37 |
38 | ## Who is this library for?
39 | This library is for:
40 | - Data scientists
41 | - Citizen data scientists
42 | - Data science students
43 | - Data analysts
44 | - Data professionals who want to build end-to-end data science solutions
45 |
46 | ## How to install this library?
47 | `pip install PyRapidML`
48 |
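49 | ## Quickstart
50 |
51 | A minimal sketch of the intended workflow, assembled from the examples in the module docstrings (the dataset name and target column are illustrative, and `extract_data` needs an internet connection to download sample data):
52 |
53 | ```python
54 | from PyRapidML.datasets import extract_data
55 | from PyRapidML.regression import *
56 |
57 | boston = extract_data('boston')                      # load a sample dataset
58 | exp_name = initializer(data=boston, target='medv')   # set up the experiment
59 | best_model = comparing_models()                      # cross-validate and rank all estimators
60 | ```
61 |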
62 | ## Important Links
63 | - Docs: https://pyrapidml.readthedocs.io/en/latest/
64 | - GitHub link: https://github.com/Zainali5/PyRapidML
65 | - PyPI link: https://pypi.org/project/PyRapidML/1.0.13/
66 |
67 | ## Current Release
68 | PyRapidML 1.0.13 is now available. The easiest way to install PyRapidML is using pip.
69 |
--------------------------------------------------------------------------------
/PyRapidML/utils.py:
--------------------------------------------------------------------------------
1 | # Module: Utility
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 |
9 | version_ = "1.0.13"
10 | nightly_version_ = "1.0.13"
11 |
12 | __version__ = version_
13 |
14 |
15 | def version():
16 |     return version_
17 |
18 |
19 | def nightly_version():
20 |     return nightly_version_
21 |
22 |
23 | def check_metric(actual: pd.Series, prediction: pd.Series, metric: str, round: int = 4):
24 |
25 |     """
26 |     Function to evaluate classification and regression metrics.
27 |
28 |
29 |     actual : pandas.Series
30 |         Actual values of the target variable.
31 |
32 |
33 |     prediction : pandas.Series
34 |         Predicted values of the target variable.
35 |
36 |
37 |     metric : str
38 |         Metric to use.
39 |
40 |
41 |     round: integer, default = 4
42 |         Number of decimal places the metrics will be rounded to.
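43 |
44 |     Example
45 |     -------
46 |     A minimal sketch; ``y_test`` and ``predictions`` stand in for any pair of
47 |     actual and predicted target series:
48 |
49 |     >>> from PyRapidML.utils import check_metric
50 |     >>> check_metric(y_test, predictions, metric = 'R2')
51 |
52 |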
53 |     Returns:
54 |         float
55 |
56 |     """
57 |
58 |     # general dependencies
59 |     import pycaret.containers.metrics.classification
60 |     import pycaret.containers.metrics.regression
61 |
62 |     globals_dict = {"y": prediction}
63 |     metric_containers = {
64 |         **pycaret.containers.metrics.classification.get_all_metric_containers(
65 |             globals_dict
66 |         ),
67 |         **pycaret.containers.metrics.regression.get_all_metric_containers(globals_dict),
68 |     }
69 |     metrics = {v.name: v.score_func for k, v in metric_containers.items()}
70 |
71 |     # metric calculation starts here
72 |
73 |     if metric in metrics:
74 |         try:
75 |             result = metrics[metric](actual, prediction)
76 |         except:
77 |             # fall back to label-encoding string targets before scoring
78 |             from sklearn.preprocessing import LabelEncoder
79 |
80 |             le = LabelEncoder()
81 |             actual = le.fit_transform(actual)
82 |             prediction = le.transform(prediction)
83 |             result = metrics[metric](actual, prediction)
84 |         result = result.round(round)
85 |         return float(result)
86 |     else:
87 |         raise ValueError(
88 |             f"Couldn't find metric '{metric}'. Possible metrics are: {', '.join(metrics.keys())}."
89 |         )
90 |
91 |
92 | def enable_colab():
93 |
94 |     """
95 |     Function to render plotly visuals in colab.
96 |     """
97 |
98 |     from IPython.display import display, HTML, clear_output, update_display
99 |
100 |     def configure_plotly_browser_state():
101 |
102 |         import IPython
103 |
104 |         display(
105 |             IPython.core.display.HTML(
106 |                 """
107 |         <script src="/static/components/requirejs/require.js"></script>
108 |         <script>
109 |           requirejs.config({
110 |             paths: {
111 |               base: '/static/base',
112 |               plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
113 |             },
114 |           });
115 |         </script>
116 |                 """
117 |             )
118 |         )
119 |
120 |     import IPython
121 |
122 |     IPython.get_ipython().events.register(
123 |         "pre_run_cell", configure_plotly_browser_state
124 |     )
125 |     print("Colab mode enabled.")
126 |
127 |
128 | def get_system_logs():
129 |
130 |     """
131 |     Read and print the 'logs.log' file from the current working directory.
132 |     """
133 |
134 |     with open("logs.log", "r") as file:
135 |         lines = file.read().splitlines()
136 |
137 |     for line in lines:
138 |         if not line:
139 |             continue
140 |
141 |         columns = [col.strip() for col in line.split(":") if col]
142 |         print(columns)
143 |
--------------------------------------------------------------------------------
/PyRapidML/regression.py:
--------------------------------------------------------------------------------
1 | # Module: Regression
2 | # Author: Zain Ali
3 | # License: MIT
4 | # Release: PyRapidML
5 | # Last modified : 31/05/2021
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | import pycaret.internal.tabular
11 | from pycaret.internal.Display import Display, is_in_colab, enable_colab
12 | from typing import List, Tuple, Any, Union, Optional, Dict
13 | import warnings
14 | from IPython.utils import io
15 |
16 | from pycaret.internal.tabular import MLUsecase
17 |
18 | warnings.filterwarnings("ignore")
19 |
20 |
21 | def initializer(
22 |     data: pd.DataFrame,
23 |     target: str,
24 |     train_size: float = 0.7,
25 |     test_data: Optional[pd.DataFrame] = None,
26 |     preprocess: bool = True,
27 |     imputation_type: str = "simple",
28 |     iterative_imputation_iters: int = 5,
29 |     categorical_features: Optional[List[str]] = None,
30 |     categorical_imputation: str = "constant",
31 |     categorical_iterative_imputer: Union[str, Any] = "lightgbm",
32 |     ordinal_features: Optional[Dict[str, list]] = None,
33 |     high_cardinality_features: Optional[List[str]] = None,
34 |     high_cardinality_method: str = "frequency",
35 |     numeric_features: Optional[List[str]] = None,
36 |     numeric_imputation: str = "mean",
37 |     numeric_iterative_imputer: Union[str, Any] = "lightgbm",
38 |     date_features: Optional[List[str]] = None,
39 |     ignore_features: Optional[List[str]] = None,
40 |     normalize:
bool = False,
41 |     normalize_method: str = "zscore",
42 |     transformation: bool = False,
43 |     transformation_method: str = "yeo-johnson",
44 |     handle_unknown_categorical: bool = True,
45 |     unknown_categorical_method: str = "least_frequent",
46 |     pca: bool = False,
47 |     pca_method: str = "linear",
48 |     pca_components: Optional[float] = None,
49 |     ignore_low_variance: bool = False,
50 |     combine_rare_levels: bool = False,
51 |     rare_level_threshold: float = 0.10,
52 |     bin_numeric_features: Optional[List[str]] = None,
53 |     remove_outliers: bool = False,
54 |     outliers_threshold: float = 0.05,
55 |     remove_multicollinearity: bool = False,
56 |     multicollinearity_threshold: float = 0.9,
57 |     remove_perfect_collinearity: bool = True,
58 |     create_clusters: bool = False,
59 |     cluster_iter: int = 20,
60 |     polynomial_features: bool = False,
61 |     polynomial_degree: int = 2,
62 |     trigonometry_features: bool = False,
63 |     polynomial_threshold: float = 0.1,
64 |     group_features: Optional[List[str]] = None,
65 |     group_names: Optional[List[str]] = None,
66 |     feature_selection: bool = False,
67 |     feature_selection_threshold: float = 0.8,
68 |     feature_selection_method: str = "classic",
69 |     feature_interaction: bool = False,
70 |     feature_ratio: bool = False,
71 |     interaction_threshold: float = 0.01,
72 |     transform_target: bool = False,
73 |     transform_target_method: str = "box-cox",
74 |     data_split_shuffle: bool = True,
75 |     data_split_stratify: Union[bool, List[str]] = False,
76 |     fold_strategy: Union[str, Any] = "kfold",
77 |     fold: int = 10,
78 |     fold_shuffle: bool = False,
79 |     fold_groups: Optional[Union[str, pd.DataFrame]] = None,
80 |     n_jobs: Optional[int] = -1,
81 |     use_gpu: bool = False,
82 |     custom_pipeline: Union[
83 |         Any, Tuple[str, Any], List[Any], List[Tuple[str, Any]]
84 |     ] = None,
85 |     html: bool = True,
86 |     session_id: Optional[int] = None,
87 |     log_experiment: bool = False,
88 |     experiment_name: Optional[str] = None,
89 |     log_plots: Union[bool, list] = False,
90 |     log_profile: bool = False,
91 |     log_data: bool = False,
92 |     silent: bool = False,
93 |     verbose: bool = True,
94 |     profile: bool = False,
95 |     profile_kwargs: Dict[str, Any] = None,
96 | ):
97 |     """
98 |     This function initializes the training environment and creates the transformation
99 |     pipeline. The ``initializer`` function must be called before executing any other
100 |     function. It takes two mandatory parameters: ``data`` and ``target``. All the
101 |     other parameters are optional.
102 |
103 |     Example
104 |     -------
105 |     >>> from PyRapidML.datasets import extract_data
106 |     >>> boston = extract_data('boston')
107 |     >>> from PyRapidML.regression import *
108 |     >>> exp_name = initializer(data = boston, target = 'medv')
109 |
110 |
111 |     data : pandas.DataFrame
112 |         Shape (n_samples, n_features), where n_samples is the number of samples and
113 |         n_features is the number of features.
114 |
115 |
116 |     target: str
117 |         Name of the target column to be passed in as a string. For regression, the
118 |         target variable should be continuous.
119 |
120 |
121 |     train_size: float, default = 0.7
122 |         Proportion of the dataset to be used for training and validation. Should be
123 |         between 0.0 and 1.0.
124 |
125 |
126 |     test_data: pandas.DataFrame, default = None
127 |         If not None, test_data is used as a hold-out set and ``train_size`` parameter is
128 |         ignored. test_data must be labelled and the shape of data and test_data must
129 |         match.
130 |
131 |
132 |     preprocess: bool, default = True
133 |         When set to False, no transformations are applied except for train_test_split
134 |         and custom transformations passed in the ``custom_pipeline`` param. Data must be
135 |         ready for modeling (no missing values, no dates, categorical data encoded)
136 |         when preprocess is set to False.
137 |
138 |
139 |     imputation_type: str, default = 'simple'
140 |         The type of imputation to use. Can be either 'simple' or 'iterative'.
141 |
142 |
143 |     iterative_imputation_iters: int, default = 5
144 |         Number of iterations. Ignored when ``imputation_type`` is not 'iterative'.
145 |
146 |
147 |     categorical_features: list of str, default = None
148 |         If the inferred data types are not correct or the silent param is set to True,
149 |         the categorical_features param can be used to overwrite or define the data types.
150 |         It takes a list of strings with column names that are categorical.
151 |
152 |
153 |     categorical_imputation: str, default = 'constant'
154 |         Missing values in categorical features are imputed with a constant 'not_available'
155 |         value. The other available option is 'mode'.
156 |
157 |
158 |     categorical_iterative_imputer: str, default = 'lightgbm'
159 |         Estimator for iterative imputation of missing values in categorical features.
160 |         Ignored when ``imputation_type`` is not 'iterative'.
161 |
162 |
163 |     ordinal_features: dict, default = None
164 |         Encode categorical features as ordinal. For example, a categorical feature with
165 |         'low', 'medium', 'high' values where low < medium < high can be passed as
166 |         ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
167 |
168 |
169 |     high_cardinality_features: list of str, default = None
170 |         When categorical features contain many levels, they can be compressed into fewer
171 |         levels using this parameter. It takes a list of strings with column names that
172 |         are categorical.
173 |
174 |
175 |     high_cardinality_method: str, default = 'frequency'
176 |         Categorical features with high cardinality are replaced with the frequency of
177 |         values in each level occurring in the training dataset. The other available method
178 |         is 'clustering', which trains the K-Means clustering algorithm on the statistical
179 |         attributes of the training data and replaces the original value of the feature with
180 |         the cluster label. The number of clusters is determined by optimizing the
181 |         Calinski-Harabasz and Silhouette criteria.
182 |
183 |
184 |     numeric_features: list of str, default = None
185 |         If the inferred data types are not correct or the silent param is set to True,
186 |         the numeric_features param can be used to overwrite or define the data types.
187 |         It takes a list of strings with column names that are numeric.
188 |
189 |
190 |     numeric_imputation: str, default = 'mean'
191 |         Missing values in numeric features are imputed with the 'mean' value of the feature
192 |         in the training dataset. The other available options are 'median' and 'zero'.
193 |
194 |
195 |     numeric_iterative_imputer: str, default = 'lightgbm'
196 |         Estimator for iterative imputation of missing values in numeric features.
197 |         Ignored when ``imputation_type`` is set to 'simple'.
198 |
199 |
200 |     date_features: list of str, default = None
201 |         If the inferred data types are not correct or the silent param is set to True,
202 |         the date_features param can be used to overwrite or define the data types. It takes
203 |         a list of strings with column names that are DateTime.
204 |
205 |
206 |     ignore_features: list of str, default = None
207 |         The ignore_features param can be used to ignore features during model training.
208 |         It takes a list of strings with column names that are to be ignored.
209 |
210 |
211 |     normalize: bool, default = False
212 |         When set to True, it transforms the numeric features by scaling them to a given
213 |         range. The type of scaling is defined by the ``normalize_method`` parameter.
214 |
215 |
216 |     normalize_method: str, default = 'zscore'
217 |         Defines the method for scaling. By default, the normalize method is set to 'zscore'.
218 |         The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize``
219 |         is not True. The other options are:
220 |
221 |         - minmax: scales and translates each feature individually such that it is in
222 |           the range of 0 - 1.
223 |         - maxabs: scales and translates each feature individually such that the
224 |           maximal absolute value of each feature will be 1.0. It does not
225 |           shift/center the data, and thus does not destroy any sparsity.
226 |         - robust: scales and translates each feature according to the Interquartile
227 |           range. When the dataset contains outliers, the robust scaler often gives
228 |           better results.
229 |
230 |
231 |     transformation: bool, default = False
232 |         When set to True, it applies the power transform to make data more Gaussian-like.
233 |         The type of transformation is defined by the ``transformation_method`` parameter.
234 |
235 |
236 |     transformation_method: str, default = 'yeo-johnson'
237 |         Defines the method for transformation. By default, the transformation method is
238 |         set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
239 |         Ignored when ``transformation`` is not True.
240 |
241 |
242 |     handle_unknown_categorical: bool, default = True
243 |         When set to True, unknown categorical levels in unseen data are replaced by the
244 |         most or least frequent level as learned in the training dataset.
245 |
246 |
247 |     unknown_categorical_method: str, default = 'least_frequent'
248 |         Method used to replace unknown categorical levels in unseen data. Method can be
249 |         set to 'least_frequent' or 'most_frequent'.
250 |
251 |
252 |     pca: bool, default = False
253 |         When set to True, dimensionality reduction is applied to project the data into
254 |         a lower dimensional space using the method defined in the ``pca_method`` parameter.
255 |
256 |
257 |     pca_method: str, default = 'linear'
258 |         The 'linear' method uses Singular Value Decomposition. Other options are:
259 |
260 |         - kernel: dimensionality reduction through the use of the RBF kernel.
261 |         - incremental: replacement for 'linear' pca when the dataset is too large.
262 |
263 |
264 |     pca_components: int or float, default = None
265 |         Number of components to keep. If pca_components is a float, it is treated as a
266 |         target percentage for information retention. When pca_components is an integer,
267 |         it is treated as the number of features to be kept. pca_components must be less
268 |         than the original number of features. Ignored when ``pca`` is not True.
269 |
270 |
271 |     ignore_low_variance: bool, default = False
272 |         When set to True, all categorical features with insignificant variances are
273 |         removed from the data. The variance is calculated using the ratio of unique
274 |         values to the number of samples, and the ratio of the most common value to the
275 |         frequency of the second most common value.
276 |
277 |
278 |     combine_rare_levels: bool, default = False
279 |         When set to True, levels in categorical features whose frequency percentile falls
280 |         below a certain threshold are combined into a single level.
281 |
282 |
283 |     rare_level_threshold: float, default = 0.1
284 |         Percentile distribution below which rare categories are combined. Ignored when
285 |         ``combine_rare_levels`` is not True.
286 |
287 |
288 |     bin_numeric_features: list of str, default = None
289 |         To convert numeric features into categorical, the bin_numeric_features parameter can
290 |         be used. It takes a list of strings with column names to be discretized. It does
291 |         so by using the 'sturges' rule to determine the number of clusters and then applying
292 |         the KMeans algorithm. Original values of the feature are then replaced by the
293 |         cluster label.
294 |
295 |
296 |     remove_outliers: bool, default = False
297 |         When set to True, outliers from the training data are removed using the Singular
298 |         Value Decomposition.
299 |
300 |
301 |     outliers_threshold: float, default = 0.05
302 |         The percentage of outliers to be removed from the training dataset. Ignored when
303 |         ``remove_outliers`` is not True.
304 |
305 |
306 |     remove_multicollinearity: bool, default = False
307 |         When set to True, features with inter-correlations higher than the defined
308 |         threshold are removed. When two features are highly correlated with each other,
309 |         the feature that is less correlated with the target variable is removed. Only
310 |         considers numeric features.
311 |
312 |
313 |     multicollinearity_threshold: float, default = 0.9
314 |         Threshold for correlated features. Ignored when ``remove_multicollinearity``
315 |         is not True.
316 |
317 |
318 |     remove_perfect_collinearity: bool, default = True
319 |         When set to True, perfect collinearity (features with correlation = 1) is removed
320 |         from the dataset; when two features are 100% correlated, one of them is randomly
321 |         removed from the dataset.
322 |
323 |
324 |     create_clusters: bool, default = False
325 |         When set to True, an additional feature is created in the training dataset where each
326 |         instance is assigned to a cluster. The number of clusters is determined by
327 |         optimizing the Calinski-Harabasz and Silhouette criteria.
328 |
329 |
330 |     cluster_iter: int, default = 20
331 |         Number of iterations used to create clusters. Each iteration represents a cluster
332 |         size. Ignored when ``create_clusters`` is not True.
333 |
334 |
335 |     polynomial_features: bool, default = False
336 |         When set to True, new features are derived using existing numeric features.
337 |
338 |
339 |     polynomial_degree: int, default = 2
340 |         Degree of polynomial features. For example, if an input sample is two dimensional
341 |         and of the form [a, b], the polynomial features with degree = 2 are:
342 |         [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True.
343 |
344 |
345 |     trigonometry_features: bool, default = False
346 |         When set to True, new features are derived using existing numeric features.
347 |
348 |
349 |     polynomial_threshold: float, default = 0.1
350 |         When ``polynomial_features`` or ``trigonometry_features`` is True, new features
351 |         are derived from the existing numeric features. This may sometimes result in too
352 |         large a feature space. The polynomial_threshold parameter can be used to deal with this
353 |         problem. It does so by using a combination of Random Forest, AdaBoost and Linear
354 |         correlation. All derived features that fall within the percentile distribution
355 |         are kept and the rest of the features are removed.
356 |
357 |
358 |     group_features: list or list of list, default = None
359 |         When the dataset contains features with related characteristics, the group_features
360 |         parameter can be used for feature extraction. It takes a list of strings with
361 |         column names that are related.
362 |
363 |
364 |     group_names: list, default = None
365 |         Group names to be used in naming new features. When the length of group_names
366 |         does not match the length of ``group_features``, new features are named
367 |         sequentially group_1, group_2, etc. It is ignored when ``group_features`` is
368 |         None.
369 |
370 |
371 |     feature_selection: bool, default = False
372 |         When set to True, a subset of features is selected using a combination of
373 |         various permutation importance techniques including Random Forest, AdaBoost
374 |         and Linear correlation with the target variable. The size of the subset is
375 |         dependent on the ``feature_selection_threshold`` parameter.
376 |
377 |
378 |     feature_selection_threshold: float, default = 0.8
379 |         Threshold value used for feature selection. When ``polynomial_features`` or
380 |         ``feature_interaction`` is True, it is recommended to keep the threshold low
381 |         to avoid large feature spaces. Setting a very low value may be efficient but
382 |         could result in under-fitting.
383 |
384 |
385 |     feature_selection_method: str, default = 'classic'
386 |         Algorithm for feature selection. The 'classic' method uses permutation feature
387 |         importance techniques. The other possible value is 'boruta', which uses the Boruta
388 |         algorithm for feature selection.
389 |
390 |
391 |     feature_interaction: bool, default = False
392 |         When set to True, new features are created by interacting (a * b) all the
393 |         numeric variables in the dataset. This feature is not scalable and may not
394 |         work as expected on datasets with a large feature space.
395 |
396 |
397 |     feature_ratio: bool, default = False
398 |         When set to True, new features are created by calculating the ratios (a / b)
399 |         between all numeric variables in the dataset. This feature is not scalable and
400 |         may not work as expected on datasets with a large feature space.
401 |
402 |
403 |     interaction_threshold: float, default = 0.01
404 |         Similar to polynomial_threshold, it is used to compress a sparse matrix of newly
405 |         created features through interaction. Features whose importance based on the
406 |         combination of Random Forest, AdaBoost and Linear correlation falls within the
407 |         percentile of the defined threshold are kept in the dataset. Remaining features
408 |         are dropped before further processing.
409 |
410 |
411 |     transform_target: bool, default = False
412 |         When set to True, the target variable is transformed using the method defined in
413 |         the ``transform_target_method`` param. Target transformation is applied separately
414 |         from feature transformations.
415 |
416 |
417 |     transform_target_method: str, default = 'box-cox'
418 |         'Box-cox' and 'yeo-johnson' methods are supported. Box-Cox requires input data to
419 |         be strictly positive, while Yeo-Johnson supports both positive and negative data.
420 |         When transform_target_method is 'box-cox' and the target variable contains negative
421 |         values, the method is internally forced to 'yeo-johnson' to avoid exceptions.
422 |
423 |
424 |     data_split_shuffle: bool, default = True
425 |         When set to False, prevents shuffling of rows during 'train_test_split'.
426 |
427 |
428 |     data_split_stratify: bool or list, default = False
429 |         Controls stratification during 'train_test_split'. When set to True, will
430 |         stratify by target column. To stratify on any other columns, pass a list of
431 |         column names. Ignored when ``data_split_shuffle`` is False.
432 |
433 |
434 |     fold_strategy: str or sklearn CV generator object, default = 'kfold'
435 |         Choice of cross validation strategy. Possible values are:
436 |
437 |         * 'kfold'
438 |         * 'stratifiedkfold'
439 |         * 'groupkfold'
440 |         * 'timeseries'
441 |         * a custom CV generator object compatible with scikit-learn.
442 |
443 |
444 |     fold: int, default = 10
445 |         Number of folds to be used in cross validation. Must be at least 2. This is
446 |         a global setting that can be over-written at function level by using the ``fold``
447 |         parameter. Ignored when ``fold_strategy`` is a custom object.
448 |
449 |
450 |     fold_shuffle: bool, default = False
451 |         Controls the shuffle parameter of CV. Only applicable when ``fold_strategy``
452 |         is 'kfold' or 'stratifiedkfold'. Ignored when ``fold_strategy`` is a custom
453 |         object.
454 |
455 |
456 |     fold_groups: str or array-like, with shape (n_samples,), default = None
457 |         Optional group labels when 'GroupKFold' is used for the cross validation.
458 |         It takes an array with shape (n_samples, ) where n_samples is the number
459 |         of rows in the training dataset. When a string is passed, it is interpreted
460 |         as the column name in the dataset containing group labels.
461 |
462 |
463 |     n_jobs: int, default = -1
464 |         The number of jobs to run in parallel (for functions that support parallel
465 |         processing). -1 means using all processors. To run all functions on a single
466 |         processor, set n_jobs to None.
467 |
468 |
469 |     use_gpu: bool or str, default = False
470 |         When set to True, it will use GPU for training with algorithms that support it,
471 |         and fall back to CPU if they are unavailable. When set to 'force', it will only
472 |         use GPU-enabled algorithms and raise exceptions when they are unavailable. When
473 |         False, all algorithms are trained using CPU only.
474 |
475 |         GPU enabled algorithms:
476 |
477 |         - Extreme Gradient Boosting, requires no further installation
478 |
479 |         - CatBoost Regressor, requires no further installation
480 |           (GPU is only enabled when data > 50,000 rows)
481 |
482 |         - Light Gradient Boosting Machine, requires GPU installation
483 |           https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html
484 |
485 |         - Linear Regression, Lasso Regression, Ridge Regression, K Neighbors Regressor,
486 |           Random Forest, Support Vector Regression, Elastic Net require cuML >= 0.15
487 |           https://github.com/rapidsai/cuml
488 |
489 |
490 |     custom_pipeline: (str, transformer) or list of (str, transformer), default = None
491 |         When passed, will append the custom transformers to the preprocessing pipeline;
492 |         they are applied on each CV fold separately and on the final fit. All the custom
493 |         transformations are applied after 'train_test_split' and before PyRapidML's internal
494 |         transformations.
495 |
496 |
497 |     html: bool, default = True
498 |         When set to False, prevents runtime display of the monitor. This must be set to False
499 |         when the environment does not support IPython. For example, command line terminal,
500 |         Databricks Notebook, Spyder and other similar IDEs.
501 |
502 |
503 |     session_id: int, default = None
504 |         Controls the randomness of the experiment. It is equivalent to 'random_state' in
505 |         scikit-learn. When None, a pseudo random number is generated. This can be used
506 |         for later reproducibility of the entire experiment.
507 |
508 |
509 |     log_experiment: bool, default = False
510 |         When set to True, all metrics and parameters are logged on the ``MLflow`` server.
511 |
512 |
513 |     experiment_name: str, default = None
514 |         Name of the experiment for logging. Ignored when ``log_experiment`` is not True.
515 |
516 |
517 |     log_plots: bool or list, default = False
518 |         When set to True, certain plots are logged automatically on the ``MLflow`` server.
519 |         To change the type of plots to be logged, pass a list containing plot IDs. Refer
520 |         to the documentation of ``plot_model``. Ignored when ``log_experiment`` is not True.
521 |
522 |
523 |     log_profile: bool, default = False
524 |         When set to True, the data profile is logged on the ``MLflow`` server as an HTML file.
525 |         Ignored when ``log_experiment`` is not True.
526 |
527 |
528 |     log_data: bool, default = False
529 |         When set to True, the dataset is logged on the ``MLflow`` server as a CSV file.
530 |         Ignored when ``log_experiment`` is not True.
531 |
532 |
533 |     silent: bool, default = False
534 |         Controls the confirmation input of data types when ``initializer`` is executed. When
535 |         executing in completely automated mode or on a remote kernel, this must be True.
536 |
537 |
538 |     verbose: bool, default = True
539 |         When set to False, the information grid is not printed.
540 |
541 |
542 |     profile: bool, default = False
543 |         When set to True, an interactive EDA report is displayed.
544 |
545 |
546 |     profile_kwargs: dict, default = {} (empty dict)
547 |         Dictionary of arguments passed to the ProfileReport method used
548 |         to create the EDA report. Ignored if ``profile`` is False.
549 |
550 |
551 |     Returns:
552 |         Global variables that can be changed using the ``set_config`` function.

    """
    available_plots = {
        "parameter": "Hyperparameters",
        "residuals": "Residuals",
        "error": "Prediction Error",
        "cooks": "Cooks Distance",
        "rfe": "Feature Selection",
        "learning": "Learning Curve",
        "manifold": "Manifold Learning",
        "vc": "Validation Curve",
        "feature": "Feature Importance",
        "feature_all": "Feature Importance (All)",
        "tree": "Decision Tree",
        "residuals_interactive": "Interactive Residuals",
    }

    # When log_plots is passed as a bare True, expand it to the default plot IDs.
    if log_plots is True:
        log_plots = ["residuals", "error", "feature"]

    return pycaret.internal.tabular.setup(
        ml_usecase="regression",
        available_plots=available_plots,
        data=data,
        target=target,
        train_size=train_size,
        test_data=test_data,
        preprocess=preprocess,
        imputation_type=imputation_type,
        iterative_imputation_iters=iterative_imputation_iters,
        categorical_features=categorical_features,
        categorical_imputation=categorical_imputation,
        categorical_iterative_imputer=categorical_iterative_imputer,
        ordinal_features=ordinal_features,
        high_cardinality_features=high_cardinality_features,
        high_cardinality_method=high_cardinality_method,
        numeric_features=numeric_features,
        numeric_imputation=numeric_imputation,
        numeric_iterative_imputer=numeric_iterative_imputer,
        date_features=date_features,
        ignore_features=ignore_features,
        normalize=normalize,
        normalize_method=normalize_method,
        transformation=transformation,
        transformation_method=transformation_method,
        handle_unknown_categorical=handle_unknown_categorical,
        unknown_categorical_method=unknown_categorical_method,
        pca=pca,
        pca_method=pca_method,
        pca_components=pca_components,
        ignore_low_variance=ignore_low_variance,
        combine_rare_levels=combine_rare_levels,
        rare_level_threshold=rare_level_threshold,
        bin_numeric_features=bin_numeric_features,
        remove_outliers=remove_outliers,
        outliers_threshold=outliers_threshold,
        remove_multicollinearity=remove_multicollinearity,
        multicollinearity_threshold=multicollinearity_threshold,
        remove_perfect_collinearity=remove_perfect_collinearity,
        create_clusters=create_clusters,
        cluster_iter=cluster_iter,
        polynomial_features=polynomial_features,
        polynomial_degree=polynomial_degree,
        trigonometry_features=trigonometry_features,
        polynomial_threshold=polynomial_threshold,
        group_features=group_features,
        group_names=group_names,
        feature_selection=feature_selection,
        feature_selection_threshold=feature_selection_threshold,
        feature_selection_method=feature_selection_method,
        feature_interaction=feature_interaction,
        feature_ratio=feature_ratio,
        interaction_threshold=interaction_threshold,
        transform_target=transform_target,
        transform_target_method=transform_target_method,
        data_split_shuffle=data_split_shuffle,
        data_split_stratify=data_split_stratify,
        fold_strategy=fold_strategy,
        fold=fold,
        fold_shuffle=fold_shuffle,
        fold_groups=fold_groups,
        n_jobs=n_jobs,
        use_gpu=use_gpu,
        custom_pipeline=custom_pipeline,
        html=html,
        session_id=session_id,
        log_experiment=log_experiment,
        experiment_name=experiment_name,
        log_plots=log_plots,
        log_profile=log_profile,
        log_data=log_data,
        silent=silent,
        verbose=verbose,
        profile=profile,
        profile_kwargs=profile_kwargs,
    )
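
# Note (editorial sketch): as the shortcut above shows, ``log_plots = True``
# expands to the default ['residuals', 'error', 'feature'] plots; passing a
# list of plot IDs (see ``plot_model``) logs specific plots instead, e.g.
# (illustrative values only):
# >>> initializer(data = boston, target = 'medv',
# ...             log_experiment = True, log_plots = ['residuals', 'vc'])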

def comparing_models(
    include: Optional[List[Union[str, Any]]] = None,
    exclude: Optional[List[str]] = None,
    fold: Optional[Union[int, Any]] = None,
    round: int = 4,
    cross_validation: bool = True,
    sort: str = "R2",
    n_select: int = 1,
    budget_time: Optional[float] = None,
    turbo: bool = True,
    errors: str = "ignore",
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Union[str, Any]] = None,
    verbose: bool = True,
):

    """
    This function trains and evaluates the performance of all estimators available
    in the model library using cross validation. The output of this function is a
    score grid with average cross validated scores. Metrics evaluated during CV
    can be accessed using the ``get_metrics`` function. Custom metrics can be
    added or removed using the ``add_metric`` and ``remove_metric`` functions.


    Example
    --------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> best_model = comparing_models()


    include: list of str or scikit-learn compatible object, default = None
        To train and evaluate select models, a list containing model IDs or
        scikit-learn compatible objects can be passed in the include param. To see
        a list of all models available in the model library, use the ``models``
        function.


    exclude: list of str, default = None
        To omit certain models from training and evaluation, pass a list containing
        model IDs in the exclude parameter. To see a list of all models available
        in the model library, use the ``models`` function.


    fold: int or scikit-learn compatible CV generator, default = None
        Controls cross-validation. If None, the CV generator in the ``fold_strategy``
        parameter of the ``setup`` function is used. When an integer is passed,
        it is interpreted as the 'n_splits' parameter of the CV generator in the
        ``setup`` function.


    round: int, default = 4
        Number of decimal places the metrics in the score grid will be rounded to.


    cross_validation: bool, default = True
        When set to False, metrics are evaluated on the holdout set. The ``fold``
        param is ignored when cross_validation is set to False.


    sort: str, default = 'R2'
        The sort order of the score grid. It also accepts custom metrics that are
        added through the ``add_metric`` function.


    n_select: int, default = 1
        Number of top_n models to return. For example, to select the top 3 models,
        use n_select = 3.


    budget_time: int or float, default = None
        If not None, will terminate execution of the function after budget_time
        minutes have passed and return results up to that point.


    turbo: bool, default = True
        When set to True, it excludes estimators with longer training times. To
        see which algorithms are excluded, use the ``models`` function.


    errors: str, default = 'ignore'
        When set to 'ignore', will skip models that raise exceptions and continue.
        If 'raise', will break the function when exceptions are raised.
735 | 736 | 737 | fit_kwargs: dict, default = {} (empty dict) 738 | Dictionary of arguments passed to the fit method of the model. 739 | 740 | 741 | groups: str or array-like, with shape (n_samples,), default = None 742 | Optional group labels when 'GroupKFold' is used for the cross validation. 743 | It takes an array with shape (n_samples, ) where n_samples is the number 744 | of rows in the training dataset. When string is passed, it is interpreted 745 | as the column name in the dataset containing group labels. 746 | 747 | 748 | verbose: bool, default = True 749 | Score grid is not printed when verbose is set to False. 750 | 751 | 752 | Returns: 753 | Trained model or list of trained models, depending on the ``n_select`` param. 754 | 755 | 756 | Warnings 757 | -------- 758 | - Changing turbo parameter to False may result in very high training times with 759 | datasets exceeding 10,000 rows. 760 | 761 | - No models are logged in ``MLFlow`` when ``cross_validation`` parameter is False. 762 | 763 | """ 764 | 765 | return pycaret.internal.tabular.compare_models( 766 | include=include, 767 | exclude=exclude, 768 | fold=fold, 769 | round=round, 770 | cross_validation=cross_validation, 771 | sort=sort, 772 | n_select=n_select, 773 | budget_time=budget_time, 774 | turbo=turbo, 775 | errors=errors, 776 | fit_kwargs=fit_kwargs, 777 | groups=groups, 778 | verbose=verbose, 779 | ) 780 | 781 | 782 | def creating_model( 783 | estimator: Union[str, Any], 784 | fold: Optional[Union[int, Any]] = None, 785 | round: int = 4, 786 | cross_validation: bool = True, 787 | fit_kwargs: Optional[dict] = None, 788 | groups: Optional[Union[str, Any]] = None, 789 | verbose: bool = True, 790 | **kwargs, 791 | ): 792 | 793 | """ 794 | This function trains and evaluates the performance of a given estimator 795 | using cross validation. The output of this function is a score grid with 796 | CV scores by fold. Metrics evaluated during CV can be accessed using the 797 | ``get_metrics`` function. Custom metrics can be added or removed using 798 | ``add_metric`` and ``remove_metric`` function. All the available models 799 | can be accessed using the ``models`` function. 800 | 801 | 802 | Example 803 | ------- 804 | >>> from PyRapidML.datasets import get_data 805 | >>> boston = extract_data('boston') 806 | >>> from PyRapidML.regression import * 807 | >>> exp_name = initializer(data = boston, target = 'medv') 808 | >>> lr = creating_model('lr') 809 | 810 | 811 | 812 | estimator: str or scikit-learn compatible object 813 | ID of an estimator available in model library or pass an untrained 814 | model object consistent with scikit-learn API. 
Estimators available 815 | in the model library (ID - Name): 816 | 817 | * 'lr' - Linear Regression 818 | * 'lasso' - Lasso Regression 819 | * 'ridge' - Ridge Regression 820 | * 'en' - Elastic Net 821 | * 'lar' - Least Angle Regression 822 | * 'llar' - Lasso Least Angle Regression 823 | * 'omp' - Orthogonal Matching Pursuit 824 | * 'br' - Bayesian Ridge 825 | * 'ard' - Automatic Relevance Determination 826 | * 'par' - Passive Aggressive Regressor 827 | * 'ransac' - Random Sample Consensus 828 | * 'tr' - TheilSen Regressor 829 | * 'huber' - Huber Regressor 830 | * 'kr' - Kernel Ridge 831 | * 'svm' - Support Vector Regression 832 | * 'knn' - K Neighbors Regressor 833 | * 'dt' - Decision Tree Regressor 834 | * 'rf' - Random Forest Regressor 835 | * 'et' - Extra Trees Regressor 836 | * 'ada' - AdaBoost Regressor 837 | * 'gbr' - Gradient Boosting Regressor 838 | * 'mlp' - MLP Regressor 839 | * 'xgboost' - Extreme Gradient Boosting 840 | * 'lightgbm' - Light Gradient Boosting Machine 841 | * 'catboost' - CatBoost Regressor 842 | 843 | 844 | fold: int or scikit-learn compatible CV generator, default = None 845 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 846 | parameter of the ``setup`` function is used. When an integer is passed, 847 | it is interpreted as the 'n_splits' parameter of the CV generator in the 848 | ``setup`` function. 849 | 850 | 851 | round: int, default = 4 852 | Number of decimal places the metrics in the score grid will be rounded to. 853 | 854 | 855 | cross_validation: bool, default = True 856 | When set to False, metrics are evaluated on holdout set. ``fold`` param 857 | is ignored when cross_validation is set to False. 858 | 859 | 860 | fit_kwargs: dict, default = {} (empty dict) 861 | Dictionary of arguments passed to the fit method of the model. 862 | 863 | 864 | groups: str or array-like, with shape (n_samples,), default = None 865 | Optional group labels when GroupKFold is used for the cross validation. 866 | It takes an array with shape (n_samples, ) where n_samples is the number 867 | of rows in training dataset. When string is passed, it is interpreted as 868 | the column name in the dataset containing group labels. 869 | 870 | 871 | verbose: bool, default = True 872 | Score grid is not printed when verbose is set to False. 873 | 874 | 875 | **kwargs: 876 | Additional keyword arguments to pass to the estimator. 877 | 878 | 879 | Returns: 880 | Trained Model 881 | 882 | 883 | Warnings 884 | -------- 885 | - Models are not logged on the ``MLFlow`` server when ``cross_validation`` param 886 | is set to False. 
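
    As an additional sketch, estimator-specific keyword arguments are passed
    through ``**kwargs`` to the underlying estimator; the values here are
    illustrative only:

    >>> rf = creating_model('rf', n_estimators = 200, max_depth = 5)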

    """

    return pycaret.internal.tabular.create_model_supervised(
        estimator=estimator,
        fold=fold,
        round=round,
        cross_validation=cross_validation,
        fit_kwargs=fit_kwargs,
        groups=groups,
        verbose=verbose,
        **kwargs,
    )


def tuning_model(
    estimator,
    fold: Optional[Union[int, Any]] = None,
    round: int = 4,
    n_iter: int = 10,
    custom_grid: Optional[Union[Dict[str, list], Any]] = None,
    optimize: str = "R2",
    custom_scorer=None,
    search_library: str = "scikit-learn",
    search_algorithm: Optional[str] = None,
    early_stopping: Any = False,
    early_stopping_max_iters: int = 10,
    choose_better: bool = False,
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Union[str, Any]] = None,
    return_tuner: bool = False,
    verbose: bool = True,
    tuner_verbose: Union[int, bool] = True,
    **kwargs,
):

    """
    This function tunes the hyperparameters of a given estimator. The output of
    this function is a score grid with CV scores by fold of the best selected
    model based on the ``optimize`` parameter. Metrics evaluated during CV can be
    accessed using the ``get_metrics`` function. Custom metrics can be added
    or removed using the ``add_metric`` and ``remove_metric`` functions.


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> lr = creating_model('lr')
    >>> tuned_lr = tuning_model(lr)


    estimator: scikit-learn compatible object
        Trained model object


    fold: int or scikit-learn compatible CV generator, default = None
        Controls cross-validation. If None, the CV generator in the ``fold_strategy``
        parameter of the ``setup`` function is used. When an integer is passed,
        it is interpreted as the 'n_splits' parameter of the CV generator in the
        ``setup`` function.


    round: int, default = 4
        Number of decimal places the metrics in the score grid will be rounded to.


    n_iter: int, default = 10
        Number of iterations in the grid search. Increasing 'n_iter' may improve
        model performance but also increases the training time.


    custom_grid: dictionary, default = None
        To define a custom search space for hyperparameters, pass a dictionary
        with parameter names and the values to be iterated. Custom grids must be
        in a format supported by the defined ``search_library``.


    optimize: str, default = 'R2'
        Metric name to be evaluated for hyperparameter tuning. It also accepts
        custom metrics that are added through the ``add_metric`` function.


    custom_scorer: object, default = None
        A custom scoring strategy can be passed to tune the hyperparameters of the
        model. It must be created using ``sklearn.make_scorer``. It is equivalent
        to adding a custom metric using the ``add_metric`` function and passing
        the name of the custom metric in the ``optimize`` parameter. This
        parameter will be deprecated in a future release.


    search_library: str, default = 'scikit-learn'
        The search library used for tuning hyperparameters.
        Possible values:

        - 'scikit-learn' - default, requires no further installation
          https://github.com/scikit-learn/scikit-learn

        - 'scikit-optimize' - ``pip install scikit-optimize``
          https://scikit-optimize.github.io/stable/

        - 'tune-sklearn' - ``pip install tune-sklearn ray[tune]``
          https://github.com/ray-project/tune-sklearn

        - 'optuna' - ``pip install optuna``
          https://optuna.org/


    search_algorithm: str, default = None
        The search algorithm depends on the ``search_library`` parameter.
        Some search algorithms require additional libraries to be installed.
        If None, will use the search library's default algorithm.

        - 'scikit-learn' possible values:
            - 'random' : random grid search (default)
            - 'grid' : grid search

        - 'scikit-optimize' possible values:
            - 'bayesian' : Bayesian search (default)

        - 'tune-sklearn' possible values:
            - 'random' : random grid search (default)
            - 'grid' : grid search
            - 'bayesian' : ``pip install scikit-optimize``
            - 'hyperopt' : ``pip install hyperopt``
            - 'optuna' : ``pip install optuna``
            - 'bohb' : ``pip install hpbandster ConfigSpace``

        - 'optuna' possible values:
            - 'random' : randomized search
            - 'tpe' : Tree-structured Parzen Estimator search (default)


    early_stopping: bool or str or object, default = False
        Use early stopping to stop fitting to a hyperparameter configuration
        if it performs poorly. Ignored when ``search_library`` is scikit-learn,
        or if the estimator does not have a 'partial_fit' attribute. If False or
        None, early stopping will not be used. Can be either an object accepted
        by the search library or one of the following:

        - 'asha' for Asynchronous Successive Halving Algorithm
        - 'hyperband' for Hyperband
        - 'median' for Median Stopping Rule


    early_stopping_max_iters: int, default = 10
        Maximum number of epochs to run for each sampled configuration.
        Ignored if ``early_stopping`` is False or None.


    choose_better: bool, default = False
        When set to True, the returned object is always better performing. The
        metric used for comparison is defined by the ``optimize`` parameter.


    fit_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the fit method of the tuner.


    groups: str or array-like, with shape (n_samples,), default = None
        Optional group labels when GroupKFold is used for the cross validation.
        It takes an array with shape (n_samples, ) where n_samples is the number
        of rows in the training dataset. When a string is passed, it is interpreted
        as the column name in the dataset containing group labels.


    return_tuner: bool, default = False
        When set to True, will return a tuple of (model, tuner_object).


    verbose: bool, default = True
        Score grid is not printed when verbose is set to False.


    tuner_verbose: bool or int, default = True
        If True or above 0, will print messages from the tuner. Higher values
        print more messages. Ignored when the ``verbose`` param is False.


    **kwargs:
        Additional keyword arguments to pass to the optimizer.
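
    As a brief sketch of a custom search space (the values are illustrative, and
    the grid format must match the chosen ``search_library``, here the default
    scikit-learn random search):

    >>> lasso = creating_model('lasso')
    >>> tuned_lasso = tuning_model(lasso, custom_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]})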


    Returns:
        Trained Model and Optional Tuner Object when ``return_tuner`` is True.


    Warnings
    --------
    - Using 'grid' as ``search_algorithm`` may result in very long computation.
      It is only recommended with smaller search spaces that can be defined in
      the ``custom_grid`` parameter.

    - ``search_library`` 'tune-sklearn' does not support GPU models.

    """

    return pycaret.internal.tabular.tune_model_supervised(
        estimator=estimator,
        fold=fold,
        round=round,
        n_iter=n_iter,
        custom_grid=custom_grid,
        optimize=optimize,
        custom_scorer=custom_scorer,
        search_library=search_library,
        search_algorithm=search_algorithm,
        early_stopping=early_stopping,
        early_stopping_max_iters=early_stopping_max_iters,
        choose_better=choose_better,
        fit_kwargs=fit_kwargs,
        groups=groups,
        return_tuner=return_tuner,
        verbose=verbose,
        tuner_verbose=tuner_verbose,
        **kwargs,
    )


def ensemble_model(
    estimator,
    method: str = "Bagging",
    fold: Optional[Union[int, Any]] = None,
    n_estimators: int = 10,
    round: int = 4,
    choose_better: bool = False,
    optimize: str = "R2",
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Union[str, Any]] = None,
    verbose: bool = True,
) -> Any:

    """
    This function ensembles a given estimator. The output of this function is
    a score grid with CV scores by fold. Metrics evaluated during CV can be
    accessed using the ``get_metrics`` function. Custom metrics can be added
    or removed using the ``add_metric`` and ``remove_metric`` functions.


    Example
    --------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> dt = creating_model('dt')
    >>> bagged_dt = ensemble_model(dt, method = 'Bagging')


    estimator: scikit-learn compatible object
        Trained model object


    method: str, default = 'Bagging'
        Method for ensembling the base estimator. It can be 'Bagging' or
        'Boosting'.


    fold: int or scikit-learn compatible CV generator, default = None
        Controls cross-validation. If None, the CV generator in the ``fold_strategy``
        parameter of the ``setup`` function is used. When an integer is passed,
        it is interpreted as the 'n_splits' parameter of the CV generator in the
        ``setup`` function.


    n_estimators: int, default = 10
        The number of base estimators in the ensemble. In case of perfect fit, the
        learning procedure is stopped early.


    round: int, default = 4
        Number of decimal places the metrics in the score grid will be rounded to.


    choose_better: bool, default = False
        When set to True, the returned object is always better performing. The
        metric used for comparison is defined by the ``optimize`` parameter.


    optimize: str, default = 'R2'
        Metric to compare for model selection when ``choose_better`` is True.


    fit_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the fit method of the model.


    groups: str or array-like, with shape (n_samples,), default = None
        Optional group labels when GroupKFold is used for the cross validation.
        It takes an array with shape (n_samples, ) where n_samples is the number
        of rows in the training dataset. When a string is passed, it is interpreted
        as the column name in the dataset containing group labels.


    verbose: bool, default = True
        Score grid is not printed when verbose is set to False.


    Returns:
        Trained Model

    """

    return pycaret.internal.tabular.ensemble_model(
        estimator=estimator,
        method=method,
        fold=fold,
        n_estimators=n_estimators,
        round=round,
        choose_better=choose_better,
        optimize=optimize,
        fit_kwargs=fit_kwargs,
        groups=groups,
        verbose=verbose,
    )


def blend_models(
    estimator_list: list,
    fold: Optional[Union[int, Any]] = None,
    round: int = 4,
    choose_better: bool = False,
    optimize: str = "R2",
    weights: Optional[List[float]] = None,
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Union[str, Any]] = None,
    verbose: bool = True,
):

    """
    This function trains a Voting Regressor for select models passed in the
    ``estimator_list`` param. The output of this function is a score grid with
    CV scores by fold. Metrics evaluated during CV can be accessed using the
    ``get_metrics`` function. Custom metrics can be added or removed using the
    ``add_metric`` and ``remove_metric`` functions.


    Example
    --------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> top3 = comparing_models(n_select = 3)
    >>> blender = blend_models(top3)


    estimator_list: list of scikit-learn compatible objects
        List of trained model objects


    fold: int or scikit-learn compatible CV generator, default = None
        Controls cross-validation. If None, the CV generator in the ``fold_strategy``
        parameter of the ``setup`` function is used. When an integer is passed,
        it is interpreted as the 'n_splits' parameter of the CV generator in the
        ``setup`` function.


    round: int, default = 4
        Number of decimal places the metrics in the score grid will be rounded to.


    choose_better: bool, default = False
        When set to True, the returned object is always better performing. The
        metric used for comparison is defined by the ``optimize`` parameter.


    optimize: str, default = 'R2'
        Metric to compare for model selection when ``choose_better`` is True.


    weights: list, default = None
        Sequence of weights (float or int) used to weight the predictions of the
        individual estimators before averaging. Uses uniform weights when None.


    fit_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the fit method of the model.
1268 | 1269 | 1270 | groups: str or array-like, with shape (n_samples,), default = None 1271 | Optional group labels when GroupKFold is used for the cross validation. 1272 | It takes an array with shape (n_samples, ) where n_samples is the number 1273 | of rows in training dataset. When string is passed, it is interpreted as 1274 | the column name in the dataset containing group labels. 1275 | 1276 | 1277 | verbose: bool, default = True 1278 | Score grid is not printed when verbose is set to False. 1279 | 1280 | 1281 | Returns: 1282 | Trained Model 1283 | 1284 | 1285 | """ 1286 | 1287 | return pycaret.internal.tabular.blend_models( 1288 | estimator_list=estimator_list, 1289 | fold=fold, 1290 | round=round, 1291 | choose_better=choose_better, 1292 | optimize=optimize, 1293 | method="auto", 1294 | weights=weights, 1295 | fit_kwargs=fit_kwargs, 1296 | groups=groups, 1297 | verbose=verbose, 1298 | ) 1299 | 1300 | 1301 | def stack_models( 1302 | estimator_list: list, 1303 | meta_model=None, 1304 | fold: Optional[Union[int, Any]] = None, 1305 | round: int = 4, 1306 | restack: bool = True, 1307 | choose_better: bool = False, 1308 | optimize: str = "R2", 1309 | fit_kwargs: Optional[dict] = None, 1310 | groups: Optional[Union[str, Any]] = None, 1311 | verbose: bool = True, 1312 | ): 1313 | 1314 | """ 1315 | This function trains a meta model over select estimators passed in 1316 | the ``estimator_list`` parameter. The output of this function is a 1317 | score grid with CV scores by fold. Metrics evaluated during CV can 1318 | be accessed using the ``get_metrics`` function. Custom metrics 1319 | can be added or removed using ``add_metric`` and ``remove_metric`` 1320 | function. 1321 | 1322 | 1323 | Example 1324 | -------- 1325 | >>> from PyRapidML.datasets import get_data 1326 | >>> boston = extract_data('boston') 1327 | >>> from PyRapidML.regression import * 1328 | >>> exp_name = initializer(data = boston, target = 'medv') 1329 | >>> top3 = comparing_models(n_select = 3) 1330 | >>> stacker = stack_models(top3) 1331 | 1332 | 1333 | estimator_list: list of scikit-learn compatible objects 1334 | List of trained model objects 1335 | 1336 | 1337 | meta_model: scikit-learn compatible object, default = None 1338 | When None, Linear Regression is trained as a meta model. 1339 | 1340 | 1341 | fold: int or scikit-learn compatible CV generator, default = None 1342 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1343 | parameter of the ``setup`` function is used. When an integer is passed, 1344 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1345 | ``setup`` function. 1346 | 1347 | 1348 | round: int, default = 4 1349 | Number of decimal places the metrics in the score grid will be rounded to. 1350 | 1351 | 1352 | restack: bool, default = True 1353 | When set to False, only the predictions of estimators will be used as 1354 | training data for the ``meta_model``. 1355 | 1356 | 1357 | choose_better: bool, default = False 1358 | When set to True, the returned object is always better performing. The 1359 | metric used for comparison is defined by the ``optimize`` parameter. 1360 | 1361 | 1362 | optimize: str, default = 'R2' 1363 | Metric to compare for model selection when ``choose_better`` is True. 1364 | 1365 | 1366 | fit_kwargs: dict, default = {} (empty dict) 1367 | Dictionary of arguments passed to the fit method of the model. 


    groups: str or array-like, with shape (n_samples,), default = None
        Optional group labels when GroupKFold is used for the cross validation.
        It takes an array with shape (n_samples, ) where n_samples is the number
        of rows in the training dataset. When a string is passed, it is interpreted
        as the column name in the dataset containing group labels.


    verbose: bool, default = True
        Score grid is not printed when verbose is set to False.


    Returns:
        Trained Model

    """

    return pycaret.internal.tabular.stack_models(
        estimator_list=estimator_list,
        meta_model=meta_model,
        fold=fold,
        round=round,
        method="auto",
        restack=restack,
        choose_better=choose_better,
        optimize=optimize,
        fit_kwargs=fit_kwargs,
        groups=groups,
        verbose=verbose,
    )


def plot_model(
    estimator,
    plot: str = "residuals",
    scale: float = 1,
    save: bool = False,
    fold: Optional[Union[int, Any]] = None,
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Union[str, Any]] = None,
    use_train_data: bool = False,
    verbose: bool = True,
    display_format: Optional[str] = None,
) -> str:

    """
    This function analyzes the performance of a trained model on the holdout set.
    It may require re-training the model in certain cases.


    Example
    --------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> lr = creating_model('lr')
    >>> plot_model(lr, plot = 'residuals')


    estimator: scikit-learn compatible object
        Trained model object


    plot: str, default = 'residuals'
        List of available plots (ID - Name):

        * 'residuals_interactive' - Interactive Residuals Plot
        * 'residuals' - Residuals Plot
        * 'error' - Prediction Error Plot
        * 'cooks' - Cooks Distance Plot
        * 'rfe' - Recursive Feature Selection
        * 'learning' - Learning Curve
        * 'vc' - Validation Curve
        * 'manifold' - Manifold Learning
        * 'feature' - Feature Importance
        * 'feature_all' - Feature Importance (All)
        * 'parameter' - Model Hyperparameter
        * 'tree' - Decision Tree


    scale: float, default = 1
        The resolution scale of the figure.


    save: bool, default = False
        When set to True, the plot is saved in the current working directory.


    fold: int or scikit-learn compatible CV generator, default = None
        Controls cross-validation. If None, the CV generator in the ``fold_strategy``
        parameter of the ``setup`` function is used. When an integer is passed,
        it is interpreted as the 'n_splits' parameter of the CV generator in the
        ``setup`` function.


    fit_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the fit method of the model.


    groups: str or array-like, with shape (n_samples,), default = None
        Optional group labels when GroupKFold is used for the cross validation.
        It takes an array with shape (n_samples, ) where n_samples is the number
        of rows in the training dataset.
When string is passed, it is interpreted as 1473 | the column name in the dataset containing group labels. 1474 | 1475 | 1476 | use_train_data: bool, default = False 1477 | When set to true, train data will be used for plots, instead 1478 | of test data. 1479 | 1480 | 1481 | verbose: bool, default = True 1482 | When set to False, progress bar is not displayed. 1483 | 1484 | 1485 | display_format: str, default = None 1486 | To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'. 1487 | Currently, not all plots are supported. 1488 | 1489 | 1490 | Returns: 1491 | None 1492 | 1493 | """ 1494 | 1495 | return pycaret.internal.tabular.plot_model( 1496 | estimator=estimator, 1497 | plot=plot, 1498 | scale=scale, 1499 | save=save, 1500 | fold=fold, 1501 | fit_kwargs=fit_kwargs, 1502 | groups=groups, 1503 | verbose=verbose, 1504 | use_train_data=use_train_data, 1505 | system=True, 1506 | display_format=display_format, 1507 | ) 1508 | 1509 | 1510 | def evaluate_model( 1511 | estimator, 1512 | fold: Optional[Union[int, Any]] = None, 1513 | fit_kwargs: Optional[dict] = None, 1514 | groups: Optional[Union[str, Any]] = None, 1515 | use_train_data: bool = False, 1516 | ): 1517 | 1518 | """ 1519 | This function displays a user interface for analyzing performance of a trained 1520 | model. It calls the ``plot_model`` function internally. 1521 | 1522 | Example 1523 | -------- 1524 | >>> from PyRapidML.datasets import get_data 1525 | >>> boston = extract_data('boston') 1526 | >>> from PyRapidML.regression import * 1527 | >>> exp_name = initializer(data = boston, target = 'medv') 1528 | >>> lr = creating_model('lr') 1529 | >>> evaluate_model(lr) 1530 | 1531 | 1532 | estimator: scikit-learn compatible object 1533 | Trained model object 1534 | 1535 | 1536 | fold: int or scikit-learn compatible CV generator, default = None 1537 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1538 | parameter of the ``setup`` function is used. When an integer is passed, 1539 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1540 | ``setup`` function. 1541 | 1542 | 1543 | fit_kwargs: dict, default = {} (empty dict) 1544 | Dictionary of arguments passed to the fit method of the model. 1545 | 1546 | 1547 | groups: str or array-like, with shape (n_samples,), default = None 1548 | Optional group labels when GroupKFold is used for the cross validation. 1549 | It takes an array with shape (n_samples, ) where n_samples is the number 1550 | of rows in training dataset. When string is passed, it is interpreted as 1551 | the column name in the dataset containing group labels. 1552 | 1553 | 1554 | use_train_data: bool, default = False 1555 | When set to true, train data will be used for plots, instead 1556 | of test data. 1557 | 1558 | 1559 | Returns: 1560 | None 1561 | 1562 | 1563 | Warnings 1564 | -------- 1565 | - This function only works in IPython enabled Notebook. 
1566 | 1567 | """ 1568 | 1569 | return pycaret.internal.tabular.evaluate_model( 1570 | estimator=estimator, 1571 | fold=fold, 1572 | fit_kwargs=fit_kwargs, 1573 | groups=groups, 1574 | use_train_data=use_train_data, 1575 | ) 1576 | 1577 | 1578 | def interpret_model( 1579 | estimator, 1580 | plot: str = "summary", 1581 | feature: Optional[str] = None, 1582 | observation: Optional[int] = None, 1583 | use_train_data: bool = False, 1584 | X_new_sample: Optional[pd.DataFrame] = None, 1585 | save: bool = False, 1586 | **kwargs, 1587 | ): 1588 | 1589 | """ 1590 | This function analyzes the predictions generated from a tree-based model. It is 1591 | implemented based on the SHAP (SHapley Additive exPlanations). For more info on 1592 | this, please see https://shap.readthedocs.io/en/latest/ 1593 | 1594 | 1595 | Example 1596 | -------- 1597 | >>> from PyRapidML.datasets import get_data 1598 | >>> boston = extract_data('boston') 1599 | >>> from PyRapidML.regression import * 1600 | >>> exp_name = initializer(data = boston, target = 'medv') 1601 | >>> xgboost = creating_model('xgboost') 1602 | >>> interpret_model(xgboost) 1603 | 1604 | 1605 | estimator: scikit-learn compatible object 1606 | Trained model object 1607 | 1608 | 1609 | plot: str, default = 'summary' 1610 | Type of plot. Available options are: 'summary', 'correlation', and 'reason'. 1611 | 1612 | 1613 | feature: str, default = None 1614 | Feature to check correlation with. This parameter is only required when ``plot`` 1615 | type is 'correlation'. When set to None, it uses the first column in the train 1616 | dataset. 1617 | 1618 | 1619 | observation: int, default = None 1620 | Observation index number in holdout set to explain. When ``plot`` is not 1621 | 'reason', this parameter is ignored. 1622 | 1623 | 1624 | use_train_data: bool, default = False 1625 | When set to true, train data will be used for plots, instead 1626 | of test data. 1627 | 1628 | 1629 | X_new_sample: pd.DataFrame, default = None 1630 | Row from an out-of-sample dataframe (neither train nor test data) to be plotted. 1631 | The sample must have the same columns as the raw input data, and it is transformed 1632 | by the preprocessing pipeline automatically before plotting. 1633 | 1634 | 1635 | save: bool, default = False 1636 | When set to True, Plot is saved as a 'png' file in current working directory. 1637 | 1638 | 1639 | **kwargs: 1640 | Additional keyword arguments to pass to the plot. 1641 | 1642 | 1643 | Returns: 1644 | None 1645 | 1646 | """ 1647 | 1648 | return pycaret.internal.tabular.interpret_model( 1649 | estimator=estimator, 1650 | plot=plot, 1651 | feature=feature, 1652 | observation=observation, 1653 | use_train_data=use_train_data, 1654 | X_new_sample=X_new_sample, 1655 | save=save, 1656 | **kwargs, 1657 | ) 1658 | 1659 | 1660 | def predict_model( 1661 | estimator, 1662 | data: Optional[pd.DataFrame] = None, 1663 | round: int = 4, 1664 | verbose: bool = True, 1665 | ) -> pd.DataFrame: 1666 | 1667 | """ 1668 | This function predicts ``Label`` using a trained model. When ``data`` is 1669 | None, it predicts label on the holdout set. 


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> lr = creating_model('lr')
    >>> pred_holdout = predict_model(lr)
    >>> pred_unseen = predict_model(lr, data = unseen_dataframe)


    estimator: scikit-learn compatible object
        Trained model object


    data: pandas.DataFrame, default = None
        Shape (n_samples, n_features). All features used during training
        must be available in the unseen dataset. When None, predictions
        are made on the holdout set.


    round: int, default = 4
        Number of decimal places to round predictions to.


    verbose: bool, default = True
        When set to False, the holdout score grid is not printed.


    Returns:
        pandas.DataFrame


    Warnings
    --------
    - The behavior of ``predict_model`` changed in version 2.1 without backward
      compatibility. As such, pipelines trained with versions <= 2.0 may not work
      for inference with versions >= 2.1. You can either retrain your models with
      a newer version or downgrade the version for inference.


    """

    return pycaret.internal.tabular.predict_model(
        estimator=estimator,
        data=data,
        probability_threshold=None,
        encoded_labels=True,
        round=round,
        verbose=verbose,
        ml_usecase=MLUsecase.REGRESSION,
    )


def finalize_model(
    estimator,
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Union[str, Any]] = None,
    model_only: bool = True,
) -> Any:

    """
    This function trains a given estimator on the entire dataset including the
    holdout set.


    Example
    --------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> lr = creating_model('lr')
    >>> final_lr = finalize_model(lr)


    estimator: scikit-learn compatible object
        Trained model object


    fit_kwargs: dict, default = {} (empty dict)
        Dictionary of arguments passed to the fit method of the model.


    groups: str or array-like, with shape (n_samples,), default = None
        Optional group labels when GroupKFold is used for the cross validation.
        It takes an array with shape (n_samples, ) where n_samples is the number
        of rows in the training dataset. When a string is passed, it is interpreted
        as the column name in the dataset containing group labels.


    model_only: bool, default = True
        When set to True, only the model object is re-trained on the entire
        dataset and the transformations in the pipeline are not re-fit. When set
        to False, the entire pipeline is re-fit.


    Returns:
        Trained Model


    """

    return pycaret.internal.tabular.finalize_model(
        estimator=estimator,
        fit_kwargs=fit_kwargs,
        groups=groups,
        model_only=model_only,
    )


def deploy_model(
    model, model_name: str, authentication: dict, platform: str = "aws",
):

    """
    This function deploys the transformation pipeline and trained model on the
    cloud.


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> lr = creating_model('lr')
    >>> deploy_model(model = lr, model_name = 'lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'})


    Amazon Web Services (AWS) users:
        To deploy a model on AWS S3 ('aws'), environment variables must be set in
        your local environment. To configure AWS environment variables, type
        ``aws configure`` in the command line. The following information from the
        IAM portal of your Amazon console account is required:

        - AWS Access Key ID
        - AWS Secret Access Key
        - Default Region Name (can be seen under Global settings on your AWS
          console)

        More info: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html


    Google Cloud Platform (GCP) users:
        To deploy a model on Google Cloud Platform ('gcp'), a project must first
        be created using the command line or the GCP console. Once the project is
        created, you must create a service account and download the service
        account key as a JSON file to set environment variables in your local
        environment.

        More info: https://cloud.google.com/docs/authentication/production


    Microsoft Azure (Azure) users:
        To deploy a model on Microsoft Azure ('azure'), the environment variable
        for the connection string must be set in your local environment. Go to
        the settings of the storage account on the Azure portal to access the
        required connection string.

        More info: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?toc=%2Fpython%2Fazure%2FTOC.json


    model: scikit-learn compatible object
        Trained model object


    model_name: str
        Name of model.


    authentication: dict
        Dictionary of applicable authentication tokens.

        When platform = 'aws':
        {'bucket' : 'S3-bucket-name'}

        When platform = 'gcp':
        {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}

        When platform = 'azure':
        {'container': 'azure-container-name'}


    platform: str, default = 'aws'
        Name of the platform. Currently supported platforms: 'aws', 'gcp' and
        'azure'.


    Returns:
        None

    """

    return pycaret.internal.tabular.deploy_model(
        model=model,
        model_name=model_name,
        authentication=authentication,
        platform=platform,
    )


def save_model(
    model, model_name: str, model_only: bool = False, verbose: bool = True, **kwargs
):

    """
    This function saves the transformation pipeline and trained model object
    into the current working directory as a pickle file for later use.

    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> lr = creating_model('lr')
    >>> save_model(lr, 'saved_lr_model')


    model: scikit-learn compatible object
        Trained model object


    model_name: str
        Name of the model.


    model_only: bool, default = False
        When set to True, only the trained model object is saved instead of the
        entire pipeline.


    verbose: bool, default = True
        Success message is not printed when verbose is set to False.


    **kwargs:
        Additional keyword arguments to pass to joblib.dump().


    Returns:
        Tuple of the model object and the filename.

    """

    return pycaret.internal.tabular.save_model(
        model=model,
        model_name=model_name,
        model_only=model_only,
        verbose=verbose,
        **kwargs,
    )


def load_model(
    model_name,
    platform: Optional[str] = None,
    authentication: Optional[Dict[str, str]] = None,
    verbose: bool = True,
):

    """
    This function loads a previously saved pipeline.

    Example
    -------
    >>> from PyRapidML.regression import load_model
    >>> saved_lr = load_model('saved_lr_model')


    model_name: str
        Name of the model.


    platform: str, default = None
        Name of the cloud platform. Currently supported platforms:
        'aws', 'gcp' and 'azure'.


    authentication: dict, default = None
        Dictionary of applicable authentication tokens.

        When platform = 'aws':
        {'bucket' : 'S3-bucket-name'}

        When platform = 'gcp':
        {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'}

        When platform = 'azure':
        {'container': 'azure-container-name'}


    verbose: bool, default = True
        Success message is not printed when verbose is set to False.


    Returns:
        Trained Model

    """

    return pycaret.internal.tabular.load_model(
        model_name=model_name,
        platform=platform,
        authentication=authentication,
        verbose=verbose,
    )


def automl(optimize: str = "R2", use_holdout: bool = False) -> Any:

    """
    This function returns the best model out of all trained models in the
    current session based on the ``optimize`` parameter. Metrics
    evaluated can be accessed using the ``get_metrics`` function.


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> top3 = comparing_models(n_select = 3)
    >>> tuned_top3 = [tuning_model(i) for i in top3]
    >>> blender = blend_models(tuned_top3)
    >>> stacker = stack_models(tuned_top3)
    >>> best_mae_model = automl(optimize = 'MAE')


    optimize: str, default = 'R2'
        Metric to use for model selection. It also accepts custom metrics
        added using the ``add_metric`` function.
1999 | 2000 | 2001 | use_holdout: bool, default = False 2002 | When set to True, metrics are evaluated on holdout set instead of CV. 2003 | 2004 | 2005 | Returns: 2006 | Trained Model 2007 | 2008 | 2009 | """ 2010 | 2011 | return pycaret.internal.tabular.automl(optimize=optimize, use_holdout=use_holdout) 2012 | 2013 | 2014 | def pull(pop: bool = False) -> pd.DataFrame: 2015 | """ 2016 | Returns last printed score grid. Use ``pull`` function after 2017 | any training function to store the score grid in pandas.DataFrame. 2018 | 2019 | 2020 | pop: bool, default = False 2021 | If True, will pop (remove) the returned dataframe from the 2022 | display container. 2023 | 2024 | 2025 | Returns: 2026 | pandas.DataFrame 2027 | 2028 | """ 2029 | return pycaret.internal.tabular.pull(pop=pop) 2030 | 2031 | 2032 | def models( 2033 | type: Optional[str] = None, internal: bool = False, raise_errors: bool = True, 2034 | ) -> pd.DataFrame: 2035 | 2036 | """ 2037 | Returns table of models available in the model library. 2038 | 2039 | Example 2040 | ------- 2041 | >>> from PyRapidML.datasets import get_data 2042 | >>> boston = extract_data('boston') 2043 | >>> from PyRapidML.regression import * 2044 | >>> exp_name = initializer(data = boston, target = 'medv') 2045 | >>> all_models = models() 2046 | 2047 | 2048 | type: str, default = None 2049 | - linear : filters and only return linear models 2050 | - tree : filters and only return tree based models 2051 | - ensemble : filters and only return ensemble models 2052 | 2053 | 2054 | internal: bool, default = False 2055 | When True, will return extra columns and rows used internally. 2056 | 2057 | 2058 | raise_errors: bool, default = True 2059 | When False, will suppress all exceptions, ignoring models 2060 | that couldn't be created. 2061 | 2062 | 2063 | Returns: 2064 | pandas.DataFrame 2065 | 2066 | """ 2067 | return pycaret.internal.tabular.models( 2068 | type=type, internal=internal, raise_errors=raise_errors 2069 | ) 2070 | 2071 | 2072 | def get_metrics( 2073 | reset: bool = False, include_custom: bool = True, raise_errors: bool = True, 2074 | ) -> pd.DataFrame: 2075 | 2076 | """ 2077 | Returns table of available metrics used for CV. 2078 | 2079 | 2080 | Example 2081 | ------- 2082 | >>> from PyRapidML.datasets import get_data 2083 | >>> boston = extract_data('boston') 2084 | >>> from PyRapidML.regression import * 2085 | >>> exp_name = initializer(data = boston, target = 'medv') 2086 | >>> all_metrics = get_metrics() 2087 | 2088 | 2089 | reset: bool, default = False 2090 | When True, will reset all changes made using the ``add_metric`` 2091 | and ``remove_metric`` function. 2092 | 2093 | 2094 | include_custom: bool, default = True 2095 | Whether to include user added (custom) metrics or not. 2096 | 2097 | 2098 | raise_errors: bool, default = True 2099 | If False, will suppress all exceptions, ignoring models that 2100 | couldn't be created. 2101 | 2102 | 2103 | Returns: 2104 | pandas.DataFrame 2105 | 2106 | """ 2107 | 2108 | return pycaret.internal.tabular.get_metrics( 2109 | reset=reset, include_custom=include_custom, raise_errors=raise_errors, 2110 | ) 2111 | 2112 | 2113 | def add_metric( 2114 | id: str, name: str, score_func: type, greater_is_better: bool = True, **kwargs, 2115 | ) -> pd.Series: 2116 | 2117 | """ 2118 | Adds a custom metric to be used for CV. 
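
    For error metrics where a lower value is better, pass
    ``greater_is_better = False`` so that the ranking of models is inverted
    accordingly. A brief sketch using scikit-learn's ``max_error`` (an
    illustrative choice, not a recommendation):

    >>> from sklearn.metrics import max_error
    >>> add_metric('maxerr', 'Max Error', max_error, greater_is_better = False)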


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> from sklearn.metrics import explained_variance_score
    >>> add_metric('evs', 'EVS', explained_variance_score)


    id: str
        Unique id for the metric.


    name: str
        Display name of the metric.


    score_func: type
        Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``.


    greater_is_better: bool, default = True
        Whether a higher ``score_func`` value means better performance.


    **kwargs:
        Arguments to be passed to the score function.


    Returns:
        pandas.Series

    """

    return pycaret.internal.tabular.add_metric(
        id=id,
        name=name,
        score_func=score_func,
        target="pred",
        greater_is_better=greater_is_better,
        **kwargs,
    )


def remove_metric(name_or_id: str):

    """
    Removes a metric from CV.


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv')
    >>> remove_metric('MAPE')


    name_or_id: str
        Display name or ID of the metric.


    Returns:
        None

    """
    return pycaret.internal.tabular.remove_metric(name_or_id=name_or_id)


def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame:

    """
    Returns a table of experiment logs. Only works when ``log_experiment``
    is set to True when initializing the ``setup`` function.


    Example
    -------
    >>> from PyRapidML.datasets import extract_data
    >>> boston = extract_data('boston')
    >>> from PyRapidML.regression import *
    >>> exp_name = initializer(data = boston, target = 'medv', log_experiment = True)
    >>> best = comparing_models()
    >>> exp_logs = get_logs()


    experiment_name: str, default = None
        When None, the current active run is used.


    save: bool, default = False
        When set to True, a CSV file is saved in the current working directory.


    Returns:
        pandas.DataFrame

    """

    return pycaret.internal.tabular.get_logs(experiment_name=experiment_name, save=save)


def get_config(variable: str):

    """
    This function retrieves the global variables created when initializing the
    ``setup`` function.
Following variables are accessible: 2230 | 2231 | - X: Transformed dataset (X) 2232 | - y: Transformed dataset (y) 2233 | - X_train: Transformed train dataset (X) 2234 | - X_test: Transformed test/holdout dataset (X) 2235 | - y_train: Transformed train dataset (y) 2236 | - y_test: Transformed test/holdout dataset (y) 2237 | - seed: random state set through session_id 2238 | - prep_pipe: Transformation pipeline 2239 | - fold_shuffle_param: shuffle parameter used in Kfolds 2240 | - n_jobs_param: n_jobs parameter used in model training 2241 | - html_param: html_param configured through setup 2242 | - create_model_container: results grid storage container 2243 | - master_model_container: model storage container 2244 | - display_container: results display container 2245 | - exp_name_log: Name of experiment 2246 | - logging_param: log_experiment param 2247 | - log_plots_param: log_plots param 2248 | - USI: Unique session ID parameter 2249 | - fix_imbalance_param: fix_imbalance param 2250 | - fix_imbalance_method_param: fix_imbalance_method param 2251 | - data_before_preprocess: data before preprocessing 2252 | - target_param: name of target variable 2253 | - gpu_param: use_gpu param configured through setup 2254 | - fold_generator: CV splitter configured in fold_strategy 2255 | - fold_param: fold params defined in the setup 2256 | - fold_groups_param: fold groups defined in the setup 2257 | - stratify_param: stratify parameter defined in the setup 2258 | - transform_target_param: transform_target_param in setup 2259 | - transform_target_method_param: transform_target_method_param in setup 2260 | 2261 | 2262 | Example 2263 | ------- 2264 | >>> from PyRapidML.datasets import get_data 2265 | >>> boston = extract_data('boston') 2266 | >>> from PyRapidML.regression import * 2267 | >>> exp_name = initializer(data = boston, target = 'medv') 2268 | >>> X_train = get_config('X_train') 2269 | 2270 | 2271 | Returns: 2272 | Global variable 2273 | 2274 | 2275 | """ 2276 | 2277 | return pycaret.internal.tabular.get_config(variable=variable) 2278 | 2279 | 2280 | def set_config(variable: str, value): 2281 | 2282 | """ 2283 | This function resets the global variables. 
Following variables are 2284 | accessible: 2285 | 2286 | - X: Transformed dataset (X) 2287 | - y: Transformed dataset (y) 2288 | - X_train: Transformed train dataset (X) 2289 | - X_test: Transformed test/holdout dataset (X) 2290 | - y_train: Transformed train dataset (y) 2291 | - y_test: Transformed test/holdout dataset (y) 2292 | - seed: random state set through session_id 2293 | - prep_pipe: Transformation pipeline 2294 | - fold_shuffle_param: shuffle parameter used in Kfolds 2295 | - n_jobs_param: n_jobs parameter used in model training 2296 | - html_param: html_param configured through setup 2297 | - create_model_container: results grid storage container 2298 | - master_model_container: model storage container 2299 | - display_container: results display container 2300 | - exp_name_log: Name of experiment 2301 | - logging_param: log_experiment param 2302 | - log_plots_param: log_plots param 2303 | - USI: Unique session ID parameter 2304 | - fix_imbalance_param: fix_imbalance param 2305 | - fix_imbalance_method_param: fix_imbalance_method param 2306 | - data_before_preprocess: data before preprocessing 2307 | - target_param: name of target variable 2308 | - gpu_param: use_gpu param configured through setup 2309 | - fold_generator: CV splitter configured in fold_strategy 2310 | - fold_param: fold params defined in the setup 2311 | - fold_groups_param: fold groups defined in the setup 2312 | - stratify_param: stratify parameter defined in the setup 2313 | - transform_target_param: transform_target_param in setup 2314 | - transform_target_method_param: transform_target_method_param in setup 2315 | 2316 | 2317 | Example 2318 | ------- 2319 | >>> from PyRapidML.datasets import extract_data 2320 | >>> boston = extract_data('boston') 2321 | >>> from PyRapidML.regression import * 2322 | >>> exp_name = initializer(data = boston, target = 'medv') 2323 | >>> set_config('seed', 123) 2324 | 2325 | 2326 | Returns: 2327 | None 2328 | 2329 | """ 2330 | 2331 | return pycaret.internal.tabular.set_config(variable=variable, value=value) 2332 | 2333 | 2334 | def save_config(file_name: str): 2335 | 2336 | """ 2337 | This function saves all global variables to a pickle file, allowing you to 2338 | resume later without rerunning the ``setup``. 2339 | 2340 | 2341 | Example 2342 | ------- 2343 | >>> from PyRapidML.datasets import extract_data 2344 | >>> boston = extract_data('boston') 2345 | >>> from PyRapidML.regression import * 2346 | >>> exp_name = initializer(data = boston, target = 'medv') 2347 | >>> save_config('myvars.pkl') 2348 | 2349 | 2350 | Returns: 2351 | None 2352 | 2353 | """ 2354 | 2355 | return pycaret.internal.tabular.save_config(file_name=file_name) 2356 | 2357 | 2358 | def load_config(file_name: str): 2359 | 2360 | """ 2361 | This function loads global variables from a pickle file into Python 2362 | environment.
2363 | 2364 | 2365 | Example 2366 | ------- 2367 | >>> from PyRapidML.regression import load_config 2368 | >>> load_config('myvars.pkl') 2369 | 2370 | 2371 | Returns: 2372 | Global variables 2373 | 2374 | """ 2375 | 2376 | return pycaret.internal.tabular.load_config(file_name=file_name) 2377 | 2378 | 2379 | -------------------------------------------------------------------------------- /PyRapidML/classification.py: -------------------------------------------------------------------------------- 1 | # Module: Classification 2 | # Author: Zain Ali 3 | # License: MIT 4 | # Release: PyRapidML 5 | # Last modified : 05/06/2021 6 | 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | import pycaret.internal.tabular 12 | from pycaret.internal.Display import Display, is_in_colab, enable_colab 13 | from typing import List, Tuple, Any, Union, Optional, Dict 14 | import warnings 15 | from IPython.utils import io 16 | import traceback 17 | 18 | from pycaret.internal.tabular import MLUsecase 19 | 20 | warnings.filterwarnings("ignore") 21 | 22 | 23 | def initializer( 24 | data: pd.DataFrame, 25 | target: str, 26 | train_size: float = 0.7, 27 | test_data: Optional[pd.DataFrame] = None, 28 | preprocess: bool = True, 29 | imputation_type: str = "simple", 30 | iterative_imputation_iters: int = 5, 31 | categorical_features: Optional[List[str]] = None, 32 | categorical_imputation: str = "constant", 33 | categorical_iterative_imputer: Union[str, Any] = "lightgbm", 34 | ordinal_features: Optional[Dict[str, list]] = None, 35 | high_cardinality_features: Optional[List[str]] = None, 36 | high_cardinality_method: str = "frequency", 37 | numeric_features: Optional[List[str]] = None, 38 | numeric_imputation: str = "mean", 39 | numeric_iterative_imputer: Union[str, Any] = "lightgbm", 40 | date_features: Optional[List[str]] = None, 41 | ignore_features: Optional[List[str]] = None, 42 | normalize: bool = False, 43 | normalize_method: str = "zscore", 44 | transformation: bool = False, 45 | transformation_method: str = "yeo-johnson", 46 | handle_unknown_categorical: bool = True, 47 | unknown_categorical_method: str = "least_frequent", 48 | pca: bool = False, 49 | pca_method: str = "linear", 50 | pca_components: Optional[float] = None, 51 | ignore_low_variance: bool = False, 52 | combine_rare_levels: bool = False, 53 | rare_level_threshold: float = 0.10, 54 | bin_numeric_features: Optional[List[str]] = None, 55 | remove_outliers: bool = False, 56 | outliers_threshold: float = 0.05, 57 | remove_multicollinearity: bool = False, 58 | multicollinearity_threshold: float = 0.9, 59 | remove_perfect_collinearity: bool = True, 60 | create_clusters: bool = False, 61 | cluster_iter: int = 20, 62 | polynomial_features: bool = False, 63 | polynomial_degree: int = 2, 64 | trigonometry_features: bool = False, 65 | polynomial_threshold: float = 0.1, 66 | group_features: Optional[List[str]] = None, 67 | group_names: Optional[List[str]] = None, 68 | feature_selection: bool = False, 69 | feature_selection_threshold: float = 0.8, 70 | feature_selection_method: str = "classic", 71 | feature_interaction: bool = False, 72 | feature_ratio: bool = False, 73 | interaction_threshold: float = 0.01, 74 | fix_imbalance: bool = False, 75 | fix_imbalance_method: Optional[Any] = None, 76 | data_split_shuffle: bool = True, 77 | data_split_stratify: Union[bool, List[str]] = False, 78 | fold_strategy: Union[str, Any] = "stratifiedkfold", 79 | fold: int = 10, 80 | fold_shuffle: bool = False, 81 | fold_groups: Optional[Union[str, pd.DataFrame]] = None, 
82 | n_jobs: Optional[int] = -1, 83 | use_gpu: bool = False, 84 | custom_pipeline: Union[ 85 | Any, Tuple[str, Any], List[Any], List[Tuple[str, Any]] 86 | ] = None, 87 | html: bool = True, 88 | session_id: Optional[int] = None, 89 | log_experiment: bool = False, 90 | experiment_name: Optional[str] = None, 91 | log_plots: Union[bool, list] = False, 92 | log_profile: bool = False, 93 | log_data: bool = False, 94 | silent: bool = False, 95 | verbose: bool = True, 96 | profile: bool = False, 97 | profile_kwargs: Dict[str, Any] = None, 98 | ): 99 | 100 | """ 101 | This function initializes the training environment and creates the transformation 102 | pipeline. This function must be called before executing any other function. It takes 103 | two mandatory parameters: ``data`` and ``target``. All the other parameters are 104 | optional. 105 | 106 | Example 107 | ------- 108 | >>> from PyRapidML.datasets import extract_data 109 | >>> juice = extract_data('juice') 110 | >>> from PyRapidML.classification import * 111 | >>> exp_name = initializer(data = juice, target = 'Purchase') 112 | 113 | 114 | data: pandas.DataFrame 115 | Shape (n_samples, n_features), where n_samples is the number of samples and 116 | n_features is the number of features. 117 | 118 | 119 | target: str 120 | Name of the target column to be passed in as a string. The target variable can 121 | be either binary or multiclass. 122 | 123 | 124 | train_size: float, default = 0.7 125 | Proportion of the dataset to be used for training and validation. Should be 126 | between 0.0 and 1.0. 127 | 128 | 129 | test_data: pandas.DataFrame, default = None 130 | If not None, test_data is used as a hold-out set and ``train_size`` parameter is 131 | ignored. test_data must be labelled and the shape of data and test_data must 132 | match. 133 | 134 | 135 | preprocess: bool, default = True 136 | When set to False, no transformations are applied except for train_test_split 137 | and custom transformations passed in ``custom_pipeline`` param. Data must be 138 | ready for modeling (no missing values, no dates, categorical data encoding), 139 | when preprocess is set to False. 140 | 141 | 142 | imputation_type: str, default = 'simple' 143 | The type of imputation to use. Can be either 'simple' or 'iterative'. 144 | 145 | 146 | iterative_imputation_iters: int, default = 5 147 | Number of iterations. Ignored when ``imputation_type`` is not 'iterative'. 148 | 149 | 150 | categorical_features: list of str, default = None 151 | If the inferred data types are not correct or the silent param is set to True, 152 | categorical_features param can be used to overwrite or define the data types. 153 | It takes a list of strings with column names that are categorical. 154 | 155 | 156 | categorical_imputation: str, default = 'constant' 157 | Missing values in categorical features are imputed with a constant 'not_available' 158 | value. The other available option is 'mode'. 159 | 160 | 161 | categorical_iterative_imputer: str, default = 'lightgbm' 162 | Estimator for iterative imputation of missing values in categorical features. 163 | Ignored when ``imputation_type`` is not 'iterative'. 164 | 165 | 166 | ordinal_features: dict, default = None 167 | Encode categorical features as ordinal. For example, a categorical feature with 168 | 'low', 'medium', 'high' values where low < medium < high can be passed as 169 | ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
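As an illustration, a minimal sketch of passing an ordinal mapping to ``initializer`` (the 'size' column and its levels are hypothetical, not part of the 'juice' dataset):
>>> exp_name = initializer(data = juice, target = 'Purchase',
...                        ordinal_features = {'size': ['small', 'medium', 'large']})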
170 | 171 | 172 | high_cardinality_features: list of str, default = None 173 | When a categorical feature contains many levels, it can be compressed into fewer 174 | levels using this parameter. It takes a list of strings with column names that 175 | are categorical. 176 | 177 | 178 | high_cardinality_method: str, default = 'frequency' 179 | Categorical features with high cardinality are replaced with the frequency of 180 | values in each level occurring in the training dataset. The other available method 181 | is 'clustering', which trains the K-Means clustering algorithm on the statistical 182 | attribute of the training data and replaces the original value of the feature with the 183 | cluster label. The number of clusters is determined by optimizing Calinski-Harabasz 184 | and Silhouette criterion. 185 | 186 | 187 | numeric_features: list of str, default = None 188 | If the inferred data types are not correct or the silent param is set to True, 189 | numeric_features param can be used to overwrite or define the data types. 190 | It takes a list of strings with column names that are numeric. 191 | 192 | 193 | numeric_imputation: str, default = 'mean' 194 | Missing values in numeric features are imputed with the 'mean' value of the feature 195 | in the training dataset. Other available options are 'median' and 'zero'. 196 | 197 | 198 | numeric_iterative_imputer: str, default = 'lightgbm' 199 | Estimator for iterative imputation of missing values in numeric features. 200 | Ignored when ``imputation_type`` is set to 'simple'. 201 | 202 | 203 | date_features: list of str, default = None 204 | If the inferred data types are not correct or the silent param is set to True, 205 | date_features param can be used to overwrite or define the data types. It takes 206 | a list of strings with column names that are DateTime. 207 | 208 | 209 | ignore_features: list of str, default = None 210 | ignore_features param can be used to ignore features during model training. 211 | It takes a list of strings with column names that are to be ignored. 212 | 213 | 214 | normalize: bool, default = False 215 | When set to True, it transforms the numeric features by scaling them to a given 216 | range. Type of scaling is defined by the ``normalize_method`` parameter. 217 | 218 | 219 | normalize_method: str, default = 'zscore' 220 | Defines the method for scaling. By default, normalize method is set to 'zscore'. 221 | The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize`` 222 | is not True. The other options are: 223 | 224 | - minmax: scales and translates each feature individually such that it is in 225 | the range of 0 - 1. 226 | - maxabs: scales and translates each feature individually such that the 227 | maximal absolute value of each feature will be 1.0. It does not 228 | shift/center the data, and thus does not destroy any sparsity. 229 | - robust: scales and translates each feature according to the Interquartile 230 | range. When the dataset contains outliers, robust scaler often gives 231 | better results. 232 | 233 | 234 | transformation: bool, default = False 235 | When set to True, it applies the power transform to make data more Gaussian-like. 236 | Type of transformation is defined by the ``transformation_method`` parameter. 237 | 238 | 239 | transformation_method: str, default = 'yeo-johnson' 240 | Defines the method for transformation. By default, the transformation method is 241 | set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
242 | Ignored when ``transformation`` is not True. 243 | 244 | 245 | handle_unknown_categorical: bool, default = True 246 | When set to True, unknown categorical levels in unseen data are replaced by the 247 | most or least frequent level as learned in the training dataset. 248 | 249 | 250 | unknown_categorical_method: str, default = 'least_frequent' 251 | Method used to replace unknown categorical levels in unseen data. Method can be 252 | set to 'least_frequent' or 'most_frequent'. 253 | 254 | 255 | pca: bool, default = False 256 | When set to True, dimensionality reduction is applied to project the data into 257 | a lower dimensional space using the method defined in ``pca_method`` parameter. 258 | 259 | 260 | pca_method: str, default = 'linear' 261 | The 'linear' method performs linear dimensionality reduction using Singular Value Decomposition. Other options are: 262 | 263 | - kernel: dimensionality reduction through the use of the RBF kernel. 264 | - incremental: replacement for 'linear' pca when the dataset is too large. 265 | 266 | 267 | pca_components: int or float, default = None 268 | Number of components to keep. If pca_components is a float, it is treated as a 269 | target percentage for information retention. When pca_components is an integer 270 | it is treated as the number of features to be kept. pca_components must be less 271 | than the original number of features. Ignored when ``pca`` is not True. 272 | 273 | 274 | ignore_low_variance: bool, default = False 275 | When set to True, all categorical features with insignificant variances are 276 | removed from the data. The variance is calculated using the ratio of unique 277 | values to the number of samples, and the ratio of the most common value to the 278 | frequency of the second most common value. 279 | 280 | 281 | combine_rare_levels: bool, default = False 282 | When set to True, frequency percentile for levels in categorical features below 283 | a certain threshold is combined into a single level. 284 | 285 | 286 | rare_level_threshold: float, default = 0.1 287 | Percentile distribution below which rare categories are combined. Ignored when 288 | ``combine_rare_levels`` is not True. 289 | 290 | 291 | bin_numeric_features: list of str, default = None 292 | To convert numeric features into categorical, bin_numeric_features parameter can 293 | be used. It takes a list of strings with column names to be discretized. It does 294 | so by using 'sturges' rule to determine the number of clusters and then applies 295 | the K-Means algorithm. Original values of the feature are then replaced by the 296 | cluster label. 297 | 298 | 299 | remove_outliers: bool, default = False 300 | When set to True, outliers from the training data are removed using the Singular 301 | Value Decomposition. 302 | 303 | 304 | outliers_threshold: float, default = 0.05 305 | The percentage of outliers to be removed from the training dataset. Ignored when 306 | ``remove_outliers`` is not True. 307 | 308 | 309 | remove_multicollinearity: bool, default = False 310 | When set to True, features with the inter-correlations higher than the defined 311 | threshold are removed. When two features are highly correlated with each other, 312 | the feature that is less correlated with the target variable is removed. Only 313 | considers numeric features. 314 | 315 | multicollinearity_threshold: float, default = 0.9 316 | Threshold for correlated features. Ignored when ``remove_multicollinearity`` 317 | is not True.
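For example, correlated features can be pruned at setup time (the threshold value here is illustrative):
>>> exp_name = initializer(data = juice, target = 'Purchase',
...                        remove_multicollinearity = True, multicollinearity_threshold = 0.8)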
318 | 319 | 320 | remove_perfect_collinearity: bool, default = True 321 | When set to True, perfect collinearity (features with correlation = 1) is removed 322 | from the dataset. When two features are 100% correlated, one of them is randomly 323 | removed from the dataset. 324 | 325 | 326 | create_clusters: bool, default = False 327 | When set to True, an additional feature is created in the training dataset where each 328 | instance is assigned to a cluster. The number of clusters is determined by 329 | optimizing Calinski-Harabasz and Silhouette criterion. 330 | 331 | 332 | cluster_iter: int, default = 20 333 | Number of iterations for creating clusters. Each iteration represents cluster 334 | size. Ignored when ``create_clusters`` is not True. 335 | 336 | 337 | polynomial_features: bool, default = False 338 | When set to True, new features are derived using existing numeric features. 339 | 340 | 341 | polynomial_degree: int, default = 2 342 | Degree of polynomial features. For example, if an input sample is two dimensional 343 | and of the form [a, b], the polynomial features with degree = 2 are: 344 | [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True. 345 | 346 | 347 | trigonometry_features: bool, default = False 348 | When set to True, new features are derived using existing numeric features. 349 | 350 | 351 | polynomial_threshold: float, default = 0.1 352 | When ``polynomial_features`` or ``trigonometry_features`` is True, new features 353 | are derived from the existing numeric features. This may sometimes result in a 354 | feature space that is too large. The polynomial_threshold parameter can be used to 355 | deal with this problem. It does so by using a combination of Random Forest, AdaBoost 356 | and Linear correlation. All derived features that fall within the percentile 357 | distribution are kept and the rest of the features are removed. 358 | 359 | 360 | group_features: list or list of list, default = None 361 | When the dataset contains features with related characteristics, the group_features 362 | parameter can be used for feature extraction. It takes a list of strings with 363 | column names that are related. 364 | 365 | 366 | group_names: list, default = None 367 | Group names to be used in naming new features. When the length of group_names 368 | does not match with the length of ``group_features``, new features are named 369 | sequentially group_1, group_2, etc. It is ignored when ``group_features`` is 370 | None. 371 | 372 | 373 | feature_selection: bool, default = False 374 | When set to True, a subset of features is selected using a combination of 375 | various permutation importance techniques including Random Forest, Adaboost 376 | and Linear correlation with target variable. The size of the subset is 377 | dependent on the ``feature_selection_threshold`` parameter. 378 | 379 | 380 | feature_selection_threshold: float, default = 0.8 381 | Threshold value used for feature selection. When ``polynomial_features`` or 382 | ``feature_interaction`` is True, it is recommended to keep the threshold low 383 | to avoid large feature spaces. Setting a very low value may be efficient but 384 | could result in under-fitting. 385 | 386 | 387 | feature_selection_method: str, default = 'classic' 388 | Algorithm for feature selection. The 'classic' method uses permutation feature 389 | importance techniques. The other possible value is 'boruta', which uses the boruta 390 | algorithm for feature selection.
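For example, a sketch of the boruta option described above:
>>> exp_name = initializer(data = juice, target = 'Purchase',
...                        feature_selection = True, feature_selection_method = 'boruta')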
391 | 392 | 393 | feature_interaction: bool, default = False 394 | When set to True, new features are created by interacting (a * b) all the 395 | numeric variables in the dataset. This feature is not scalable and may not 396 | work as expected on datasets with large feature space. 397 | 398 | 399 | feature_ratio: bool, default = False 400 | When set to True, new features are created by calculating the ratios (a / b) 401 | between all numeric variables in the dataset. This feature is not scalable and 402 | may not work as expected on datasets with large feature space. 403 | 404 | 405 | interaction_threshold: float, default = 0.01 406 | Similar to polynomial_threshold, it is used to compress a sparse matrix of newly 407 | created features through interaction. Features whose importance based on the 408 | combination of Random Forest, AdaBoost and Linear correlation falls within the 409 | percentile of the defined threshold are kept in the dataset. Remaining features 410 | are dropped before further processing. 411 | 412 | 413 | fix_imbalance: bool, default = False 414 | When the training dataset has an unequal distribution of the target class, it can be balanced 415 | using this parameter. When set to True, SMOTE (Synthetic Minority Over-sampling 416 | Technique) is applied by default to create synthetic datapoints for the minority class. 417 | 418 | 419 | fix_imbalance_method: obj, default = None 420 | When ``fix_imbalance`` is True, 'imblearn' compatible object with 'fit_resample' 421 | method can be passed. When set to None, 'imblearn.over_sampling.SMOTE' is used. 422 | 423 | 424 | data_split_shuffle: bool, default = True 425 | When set to False, prevents shuffling of rows during 'train_test_split'. 426 | 427 | 428 | data_split_stratify: bool or list, default = False 429 | Controls stratification during 'train_test_split'. When set to True, will 430 | stratify by target column. To stratify on any other columns, pass a list of 431 | column names. Ignored when ``data_split_shuffle`` is False. 432 | 433 | 434 | fold_strategy: str or sklearn CV generator object, default = 'stratifiedkfold' 435 | Choice of cross validation strategy. Possible values are: 436 | 437 | * 'kfold' 438 | * 'stratifiedkfold' 439 | * 'groupkfold' 440 | * 'timeseries' 441 | * a custom CV generator object compatible with scikit-learn. 442 | 443 | 444 | fold: int, default = 10 445 | Number of folds to be used in cross validation. Must be at least 2. This is 446 | a global setting that can be over-written at function level by using ``fold`` 447 | parameter. Ignored when ``fold_strategy`` is a custom object. 448 | 449 | 450 | fold_shuffle: bool, default = False 451 | Controls the shuffle parameter of CV. Only applicable when ``fold_strategy`` 452 | is 'kfold' or 'stratifiedkfold'. Ignored when ``fold_strategy`` is a custom 453 | object. 454 | 455 | 456 | fold_groups: str or array-like, with shape (n_samples,), default = None 457 | Optional group labels when 'GroupKFold' is used for the cross validation. 458 | It takes an array with shape (n_samples, ) where n_samples is the number 459 | of rows in the training dataset. When string is passed, it is interpreted 460 | as the column name in the dataset containing group labels. 461 | 462 | 463 | n_jobs: int, default = -1 464 | The number of jobs to run in parallel (for functions that support parallel 465 | processing). -1 means using all processors. To run all functions on a single 466 | processor, set n_jobs to None.
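As a sketch of the custom CV generator option for ``fold_strategy`` described above (the split settings are illustrative):
>>> from sklearn.model_selection import StratifiedKFold
>>> cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
>>> exp_name = initializer(data = juice, target = 'Purchase', fold_strategy = cv)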
467 | 468 | 469 | use_gpu: bool or str, default = False 470 | When set to True, it will use GPU for training with algorithms that support it, 471 | and fall back to CPU if they are unavailable. When set to 'force', it will only 472 | use GPU-enabled algorithms and raise exceptions when they are unavailable. When 473 | False, all algorithms are trained using CPU only. 474 | 475 | GPU enabled algorithms: 476 | 477 | - Extreme Gradient Boosting, requires no further installation 478 | 479 | - CatBoost Classifier, requires no further installation 480 | (GPU is only enabled when data > 50,000 rows) 481 | 482 | - Light Gradient Boosting Machine, requires GPU installation 483 | https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html 484 | 485 | - Logistic Regression, Ridge Classifier, Random Forest, K Neighbors Classifier, 486 | Support Vector Machine, requires cuML >= 0.15 487 | https://github.com/rapidsai/cuml 488 | 489 | 490 | custom_pipeline: (str, transformer) or list of (str, transformer), default = None 491 | When passed, the custom transformers are appended to the preprocessing pipeline 492 | and applied on each CV fold separately and on the final fit. All the custom 493 | transformations are applied after 'train_test_split' and before PyRapidML's internal 494 | transformations. 495 | 496 | 497 | html: bool, default = True 498 | When set to False, prevents runtime display of monitor. This must be set to False 499 | when the environment does not support IPython. For example, command line terminal, 500 | Databricks Notebook, Spyder and other similar IDEs. 501 | 502 | 503 | session_id: int, default = None 504 | Controls the randomness of experiment. It is equivalent to 'random_state' in 505 | scikit-learn. When None, a pseudo random number is generated. This can be used 506 | for later reproducibility of the entire experiment. 507 | 508 | 509 | log_experiment: bool, default = False 510 | When set to True, all metrics and parameters are logged on the ``MLFlow`` server. 511 | 512 | 513 | experiment_name: str, default = None 514 | Name of the experiment for logging. Ignored when ``log_experiment`` is not True. 515 | 516 | 517 | log_plots: bool or list, default = False 518 | When set to True, certain plots are logged automatically in the ``MLFlow`` server. 519 | To change the type of plots to be logged, pass a list containing plot IDs. Refer 520 | to documentation of ``plot_model``. Ignored when ``log_experiment`` is not True. 521 | 522 | 523 | log_profile: bool, default = False 524 | When set to True, data profile is logged on the ``MLflow`` server as a html file. 525 | Ignored when ``log_experiment`` is not True. 526 | 527 | 528 | log_data: bool, default = False 529 | When set to True, dataset is logged on the ``MLflow`` server as a csv file. 530 | Ignored when ``log_experiment`` is not True. 531 | 532 | 533 | silent: bool, default = False 534 | Controls the confirmation input of data types when ``setup`` is executed. When 535 | executing in completely automated mode or on a remote kernel, this must be True. 536 | 537 | 538 | verbose: bool, default = True 539 | When set to False, Information grid is not printed. 540 | 541 | 542 | profile: bool, default = False 543 | When set to True, an interactive EDA report is displayed. 544 | 545 | 546 | profile_kwargs: dict, default = {} (empty dict) 547 | Dictionary of arguments passed to the ProfileReport method used 548 | to create the EDA report. Ignored if ``profile`` is False.
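Putting a few of these options together, a minimal reproducible and logged setup might look like this (the experiment name is illustrative):
>>> exp_name = initializer(data = juice, target = 'Purchase',
...                        session_id = 123, log_experiment = True, experiment_name = 'juice_clf')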
549 | 550 | 551 | Returns: 552 | Global variables that can be changed using the ``set_config`` function. 553 | 554 | """ 555 | 556 | available_plots = { 557 | "parameter": "Hyperparameters", 558 | "auc": "AUC", 559 | "confusion_matrix": "Confusion Matrix", 560 | "threshold": "Threshold", 561 | "pr": "Precision Recall", 562 | "error": "Prediction Error", 563 | "class_report": "Class Report", 564 | "rfe": "Feature Selection", 565 | "learning": "Learning Curve", 566 | "manifold": "Manifold Learning", 567 | "calibration": "Calibration Curve", 568 | "vc": "Validation Curve", 569 | "dimension": "Dimensions", 570 | "feature": "Feature Importance", 571 | "feature_all": "Feature Importance (All)", 572 | "boundary": "Decision Boundary", 573 | "lift": "Lift Chart", 574 | "gain": "Gain Chart", 575 | "tree": "Decision Tree", 576 | } 577 | 578 | if log_plots == True: 579 | log_plots = ["auc", "confusion_matrix", "feature"] 580 | 581 | return pycaret.internal.tabular.setup( 582 | ml_usecase="classification", 583 | available_plots=available_plots, 584 | data=data, 585 | target=target, 586 | train_size=train_size, 587 | test_data=test_data, 588 | preprocess=preprocess, 589 | imputation_type=imputation_type, 590 | iterative_imputation_iters=iterative_imputation_iters, 591 | categorical_features=categorical_features, 592 | categorical_imputation=categorical_imputation, 593 | categorical_iterative_imputer=categorical_iterative_imputer, 594 | ordinal_features=ordinal_features, 595 | high_cardinality_features=high_cardinality_features, 596 | high_cardinality_method=high_cardinality_method, 597 | numeric_features=numeric_features, 598 | numeric_imputation=numeric_imputation, 599 | numeric_iterative_imputer=numeric_iterative_imputer, 600 | date_features=date_features, 601 | ignore_features=ignore_features, 602 | normalize=normalize, 603 | normalize_method=normalize_method, 604 | transformation=transformation, 605 | transformation_method=transformation_method, 606 | handle_unknown_categorical=handle_unknown_categorical, 607 | unknown_categorical_method=unknown_categorical_method, 608 | pca=pca, 609 | pca_method=pca_method, 610 | pca_components=pca_components, 611 | ignore_low_variance=ignore_low_variance, 612 | combine_rare_levels=combine_rare_levels, 613 | rare_level_threshold=rare_level_threshold, 614 | bin_numeric_features=bin_numeric_features, 615 | remove_outliers=remove_outliers, 616 | outliers_threshold=outliers_threshold, 617 | remove_multicollinearity=remove_multicollinearity, 618 | multicollinearity_threshold=multicollinearity_threshold, 619 | remove_perfect_collinearity=remove_perfect_collinearity, 620 | create_clusters=create_clusters, 621 | cluster_iter=cluster_iter, 622 | polynomial_features=polynomial_features, 623 | polynomial_degree=polynomial_degree, 624 | trigonometry_features=trigonometry_features, 625 | polynomial_threshold=polynomial_threshold, 626 | group_features=group_features, 627 | group_names=group_names, 628 | feature_selection=feature_selection, 629 | feature_selection_threshold=feature_selection_threshold, 630 | feature_selection_method=feature_selection_method, 631 | feature_interaction=feature_interaction, 632 | feature_ratio=feature_ratio, 633 | interaction_threshold=interaction_threshold, 634 | fix_imbalance=fix_imbalance, 635 | fix_imbalance_method=fix_imbalance_method, 636 | data_split_shuffle=data_split_shuffle, 637 | data_split_stratify=data_split_stratify, 638 | fold_strategy=fold_strategy, 639 | fold=fold, 640 | fold_shuffle=fold_shuffle, 641 | fold_groups=fold_groups, 642 
| n_jobs=n_jobs, 643 | use_gpu=use_gpu, 644 | custom_pipeline=custom_pipeline, 645 | html=html, 646 | session_id=session_id, 647 | log_experiment=log_experiment, 648 | experiment_name=experiment_name, 649 | log_plots=log_plots, 650 | log_profile=log_profile, 651 | log_data=log_data, 652 | silent=silent, 653 | verbose=verbose, 654 | profile=profile, 655 | profile_kwargs=profile_kwargs, 656 | ) 657 | 658 | 659 | def comparing_models( 660 | include: Optional[List[Union[str, Any]]] = None, 661 | exclude: Optional[List[str]] = None, 662 | fold: Optional[Union[int, Any]] = None, 663 | round: int = 4, 664 | cross_validation: bool = True, 665 | sort: str = "Accuracy", 666 | n_select: int = 1, 667 | budget_time: Optional[float] = None, 668 | turbo: bool = True, 669 | errors: str = "ignore", 670 | fit_kwargs: Optional[dict] = None, 671 | groups: Optional[Union[str, Any]] = None, 672 | verbose: bool = True, 673 | ) -> Union[Any, List[Any]]: 674 | 675 | """ 676 | This function trains and evaluates the performance of all estimators available in the 677 | model library using cross validation. The output of this function is a score grid 678 | with average cross validated scores. Metrics evaluated during CV can be accessed 679 | using the ``get_metrics`` function. Custom metrics can be added or removed using 680 | ``add_metric`` and ``remove_metric`` function. 681 | 682 | Example 683 | ------- 684 | >>> from PyRapidML.datasets import extract_data 685 | >>> juice = extract_data('juice') 686 | >>> from PyRapidML.classification import * 687 | >>> exp_name = initializer(data = juice, target = 'Purchase') 688 | >>> best_model = comparing_models() 689 | 690 | 691 | include: list of str or scikit-learn compatible object, default = None 692 | To train and evaluate select models, list containing model ID or scikit-learn 693 | compatible object can be passed in include param. To see a list of all models 694 | available in the model library use the ``models`` function. 695 | 696 | 697 | exclude: list of str, default = None 698 | To omit certain models from training and evaluation, pass a list containing 699 | model id in the exclude parameter. To see a list of all models available 700 | in the model library use the ``models`` function. 701 | 702 | 703 | fold: int or scikit-learn compatible CV generator, default = None 704 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 705 | parameter of the ``setup`` function is used. When an integer is passed, 706 | it is interpreted as the 'n_splits' parameter of the CV generator in the 707 | ``setup`` function. 708 | 709 | 710 | round: int, default = 4 711 | Number of decimal places the metrics in the score grid will be rounded to. 712 | 713 | 714 | cross_validation: bool, default = True 715 | When set to False, metrics are evaluated on holdout set. ``fold`` param 716 | is ignored when cross_validation is set to False. 717 | 718 | 719 | sort: str, default = 'Accuracy' 720 | The sort order of the score grid. It also accepts custom metrics that are 721 | added through the ``add_metric`` function. 722 | 723 | 724 | n_select: int, default = 1 725 | Number of top_n models to return. For example, to select top 3 models use 726 | n_select = 3. 727 | 728 | 729 | budget_time: int or float, default = None 730 | If not None, will terminate execution of the function after budget_time 731 | minutes have passed and return results up to that point. 732 | 733 | 734 | turbo: bool, default = True 735 | When set to True, it excludes estimators with longer training times.
To 736 | see which algorithms are excluded use the ``models`` function. 737 | 738 | 739 | errors: str, default = 'ignore' 740 | When set to 'ignore', will skip models that raise exceptions and continue. 741 | If 'raise', will break the function when exceptions are raised. 742 | 743 | 744 | fit_kwargs: dict, default = {} (empty dict) 745 | Dictionary of arguments passed to the fit method of the model. 746 | 747 | 748 | groups: str or array-like, with shape (n_samples,), default = None 749 | Optional group labels when 'GroupKFold' is used for the cross validation. 750 | It takes an array with shape (n_samples, ) where n_samples is the number 751 | of rows in the training dataset. When string is passed, it is interpreted 752 | as the column name in the dataset containing group labels. 753 | 754 | 755 | verbose: bool, default = True 756 | Score grid is not printed when verbose is set to False. 757 | 758 | 759 | Returns: 760 | Trained model or list of trained models, depending on the ``n_select`` param. 761 | 762 | Warnings 763 | -------- 764 | - Changing turbo parameter to False may result in very high training times with 765 | datasets exceeding 10,000 rows. 766 | 767 | - AUC for estimators that do not support 'predict_proba' is shown as 0.0000. 768 | 769 | - No models are logged in ``MLFlow`` when ``cross_validation`` parameter is False. 770 | """ 771 | 772 | return pycaret.internal.tabular.compare_models( 773 | include=include, 774 | exclude=exclude, 775 | fold=fold, 776 | round=round, 777 | cross_validation=cross_validation, 778 | sort=sort, 779 | n_select=n_select, 780 | budget_time=budget_time, 781 | turbo=turbo, 782 | errors=errors, 783 | fit_kwargs=fit_kwargs, 784 | groups=groups, 785 | verbose=verbose, 786 | ) 787 | 788 | 789 | def creating_model( 790 | estimator: Union[str, Any], 791 | fold: Optional[Union[int, Any]] = None, 792 | round: int = 4, 793 | cross_validation: bool = True, 794 | fit_kwargs: Optional[dict] = None, 795 | groups: Optional[Union[str, Any]] = None, 796 | verbose: bool = True, 797 | **kwargs, 798 | ) -> Any: 799 | 800 | """ 801 | This function trains and evaluates the performance of a given estimator 802 | using cross validation. The output of this function is a score grid with 803 | CV scores by fold. Metrics evaluated during CV can be accessed using the 804 | ``get_metrics`` function. Custom metrics can be added or removed using 805 | ``add_metric`` and ``remove_metric`` function. All the available models 806 | can be accessed using the ``models`` function. 807 | 808 | Example 809 | ------- 810 | >>> from PyRapidML.datasets import extract_data 811 | >>> juice = extract_data('juice') 812 | >>> from PyRapidML.classification import * 813 | >>> exp_name = initializer(data = juice, target = 'Purchase') 814 | >>> lr = creating_model('lr') 815 | 816 | 817 | estimator: str or scikit-learn compatible object 818 | ID of an estimator available in the model library or pass an untrained 819 | model object consistent with scikit-learn API.
Estimators available 820 | in the model library (ID - Name): 821 | 822 | * 'lr' - Logistic Regression 823 | * 'knn' - K Neighbors Classifier 824 | * 'nb' - Naive Bayes 825 | * 'dt' - Decision Tree Classifier 826 | * 'svm' - SVM - Linear Kernel 827 | * 'rbfsvm' - SVM - Radial Kernel 828 | * 'gpc' - Gaussian Process Classifier 829 | * 'mlp' - MLP Classifier 830 | * 'ridge' - Ridge Classifier 831 | * 'rf' - Random Forest Classifier 832 | * 'qda' - Quadratic Discriminant Analysis 833 | * 'ada' - Ada Boost Classifier 834 | * 'gbc' - Gradient Boosting Classifier 835 | * 'lda' - Linear Discriminant Analysis 836 | * 'et' - Extra Trees Classifier 837 | * 'xgboost' - Extreme Gradient Boosting 838 | * 'lightgbm' - Light Gradient Boosting Machine 839 | * 'catboost' - CatBoost Classifier 840 | 841 | 842 | fold: int or scikit-learn compatible CV generator, default = None 843 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 844 | parameter of the ``setup`` function is used. When an integer is passed, 845 | it is interpreted as the 'n_splits' parameter of the CV generator in the 846 | ``setup`` function. 847 | 848 | 849 | round: int, default = 4 850 | Number of decimal places the metrics in the score grid will be rounded to. 851 | 852 | 853 | cross_validation: bool, default = True 854 | When set to False, metrics are evaluated on holdout set. ``fold`` param 855 | is ignored when cross_validation is set to False. 856 | 857 | 858 | fit_kwargs: dict, default = {} (empty dict) 859 | Dictionary of arguments passed to the fit method of the model. 860 | 861 | 862 | groups: str or array-like, with shape (n_samples,), default = None 863 | Optional group labels when GroupKFold is used for the cross validation. 864 | It takes an array with shape (n_samples, ) where n_samples is the number 865 | of rows in the training dataset. When string is passed, it is interpreted as 866 | the column name in the dataset containing group labels. 867 | 868 | 869 | verbose: bool, default = True 870 | Score grid is not printed when verbose is set to False. 871 | 872 | 873 | **kwargs: 874 | Additional keyword arguments to pass to the estimator. 875 | 876 | 877 | Returns: 878 | Trained Model 879 | 880 | 881 | Warnings 882 | -------- 883 | - AUC for estimators that do not support 'predict_proba' is shown as 0.0000. 884 | 885 | - Models are not logged on the ``MLFlow`` server when ``cross_validation`` param 886 | is set to False. 887 | 888 | """ 889 | 890 | return pycaret.internal.tabular.create_model_supervised( 891 | estimator=estimator, 892 | fold=fold, 893 | round=round, 894 | cross_validation=cross_validation, 895 | fit_kwargs=fit_kwargs, 896 | groups=groups, 897 | verbose=verbose, 898 | **kwargs, 899 | ) 900 | 901 | 902 | def tuning_model( 903 | estimator, 904 | fold: Optional[Union[int, Any]] = None, 905 | round: int = 4, 906 | n_iter: int = 10, 907 | custom_grid: Optional[Union[Dict[str, list], Any]] = None, 908 | optimize: str = "Accuracy", 909 | custom_scorer=None, 910 | search_library: str = "scikit-learn", 911 | search_algorithm: Optional[str] = None, 912 | early_stopping: Any = False, 913 | early_stopping_max_iters: int = 10, 914 | choose_better: bool = False, 915 | fit_kwargs: Optional[dict] = None, 916 | groups: Optional[Union[str, Any]] = None, 917 | return_tuner: bool = False, 918 | verbose: bool = True, 919 | tuner_verbose: Union[int, bool] = True, 920 | **kwargs, 921 | ) -> Any: 922 | 923 | """ 924 | This function tunes the hyperparameters of a given estimator.
The output of 925 | this function is a score grid with CV scores by fold of the best selected 926 | model based on ``optimize`` parameter. Metrics evaluated during CV can be 927 | accessed using the ``get_metrics`` function. Custom metrics can be added 928 | or removed using ``add_metric`` and ``remove_metric`` function. 929 | 930 | Example 931 | ------- 932 | >>> from PyRapidML.datasets import extract_data 933 | >>> juice = extract_data('juice') 934 | >>> from PyRapidML.classification import * 935 | >>> exp_name = initializer(data = juice, target = 'Purchase') 936 | >>> lr = creating_model('lr') 937 | >>> tuned_lr = tuning_model(lr) 938 | 939 | 940 | estimator: scikit-learn compatible object 941 | Trained model object 942 | 943 | 944 | fold: int or scikit-learn compatible CV generator, default = None 945 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 946 | parameter of the ``setup`` function is used. When an integer is passed, 947 | it is interpreted as the 'n_splits' parameter of the CV generator in the 948 | ``setup`` function. 949 | 950 | 951 | round: int, default = 4 952 | Number of decimal places the metrics in the score grid will be rounded to. 953 | 954 | 955 | n_iter: int, default = 10 956 | Number of iterations in the grid search. Increasing 'n_iter' may improve 957 | model performance but also increases the training time. 958 | 959 | 960 | custom_grid: dictionary, default = None 961 | To define custom search space for hyperparameters, pass a dictionary with 962 | parameter name and values to be iterated. Custom grids must be in a format 963 | supported by the defined ``search_library``. 964 | 965 | 966 | optimize: str, default = 'Accuracy' 967 | Metric name to be evaluated for hyperparameter tuning. It also accepts custom 968 | metrics that are added through the ``add_metric`` function. 969 | 970 | 971 | custom_scorer: object, default = None 972 | Custom scoring strategy can be passed to tune hyperparameters of the model. 973 | It must be created using ``sklearn.make_scorer``. It is equivalent to adding a 974 | custom metric using the ``add_metric`` function and passing the name of the 975 | custom metric in the ``optimize`` parameter. 976 | Will be deprecated in a future release. 977 | 978 | 979 | search_library: str, default = 'scikit-learn' 980 | The search library used for tuning hyperparameters. Possible values: 981 | 982 | - 'scikit-learn' - default, requires no further installation 983 | https://github.com/scikit-learn/scikit-learn 984 | 985 | - 'scikit-optimize' - ``pip install scikit-optimize`` 986 | https://scikit-optimize.github.io/stable/ 987 | 988 | - 'tune-sklearn' - ``pip install tune-sklearn ray[tune]`` 989 | https://github.com/ray-project/tune-sklearn 990 | 991 | - 'optuna' - ``pip install optuna`` 992 | https://optuna.org/ 993 | 994 | 995 | search_algorithm: str, default = None 996 | The search algorithm depends on the ``search_library`` parameter. 997 | Some search algorithms require additional libraries to be installed. 998 | If None, will use search library-specific default algorithm.
999 | 1000 | - 'scikit-learn' possible values: 1001 | - 'random' : random grid search (default) 1002 | - 'grid' : grid search 1003 | 1004 | - 'scikit-optimize' possible values: 1005 | - 'bayesian' : Bayesian search (default) 1006 | 1007 | - 'tune-sklearn' possible values: 1008 | - 'random' : random grid search (default) 1009 | - 'grid' : grid search 1010 | - 'bayesian' : ``pip install scikit-optimize`` 1011 | - 'hyperopt' : ``pip install hyperopt`` 1012 | - 'optuna' : ``pip install optuna`` 1013 | - 'bohb' : ``pip install hpbandster ConfigSpace`` 1014 | 1015 | - 'optuna' possible values: 1016 | - 'random' : randomized search 1017 | - 'tpe' : Tree-structured Parzen Estimator search (default) 1018 | 1019 | 1020 | early_stopping: bool or str or object, default = False 1021 | Use early stopping to stop fitting to a hyperparameter configuration 1022 | if it performs poorly. Ignored when ``search_library`` is scikit-learn, 1023 | or if the estimator does not have 'partial_fit' attribute. If False or 1024 | None, early stopping will not be used. Can be either an object accepted 1025 | by the search library or one of the following: 1026 | 1027 | - 'asha' for Asynchronous Successive Halving Algorithm 1028 | - 'hyperband' for Hyperband 1029 | - 'median' for Median Stopping Rule 1030 | 1031 | 1032 | early_stopping_max_iters: int, default = 10 1033 | Maximum number of epochs to run for each sampled configuration. 1034 | Ignored if ``early_stopping`` is False or None. 1035 | 1036 | 1037 | choose_better: bool, default = False 1038 | When set to True, the returned object is always better performing. The 1039 | metric used for comparison is defined by the ``optimize`` parameter. 1040 | 1041 | 1042 | fit_kwargs: dict, default = {} (empty dict) 1043 | Dictionary of arguments passed to the fit method of the tuner. 1044 | 1045 | 1046 | 1047 | groups: str or array-like, with shape (n_samples,), default = None 1048 | Optional group labels when GroupKFold is used for the cross validation. 1049 | It takes an array with shape (n_samples, ) where n_samples is the number 1050 | of rows in the training dataset. When string is passed, it is interpreted as 1051 | the column name in the dataset containing group labels. 1052 | 1053 | 1054 | return_tuner: bool, default = False 1055 | When set to True, will return a tuple of (model, tuner_object). 1056 | 1057 | 1058 | verbose: bool, default = True 1059 | Score grid is not printed when verbose is set to False. 1060 | 1061 | 1062 | tuner_verbose: bool or int, default = True 1063 | If True or above 0, will print messages from the tuner. Higher values 1064 | print more messages. Ignored when ``verbose`` param is False. 1065 | 1066 | 1067 | **kwargs: 1068 | Additional keyword arguments to pass to the optimizer. 1069 | 1070 | 1071 | Returns: 1072 | Trained Model and Optional Tuner Object when ``return_tuner`` is True. 1073 | 1074 | 1075 | Warnings 1076 | -------- 1077 | - Using 'grid' as ``search_algorithm`` may result in very long computation. 1078 | Only recommended with smaller search spaces that can be defined in the 1079 | ``custom_grid`` parameter. 1080 | 1081 | - ``search_library`` 'tune-sklearn' does not support GPU models.
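As a usage sketch combining the options above (assumes ``optuna`` is installed; the n_iter and optimize values are illustrative):
>>> tuned_lr = tuning_model(lr, search_library = 'optuna', search_algorithm = 'tpe',
...                         n_iter = 25, optimize = 'AUC')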
1082 | 1083 | """ 1084 | 1085 | return pycaret.internal.tabular.tune_model_supervised( 1086 | estimator=estimator, 1087 | fold=fold, 1088 | round=round, 1089 | n_iter=n_iter, 1090 | custom_grid=custom_grid, 1091 | optimize=optimize, 1092 | custom_scorer=custom_scorer, 1093 | search_library=search_library, 1094 | search_algorithm=search_algorithm, 1095 | early_stopping=early_stopping, 1096 | early_stopping_max_iters=early_stopping_max_iters, 1097 | choose_better=choose_better, 1098 | fit_kwargs=fit_kwargs, 1099 | groups=groups, 1100 | return_tuner=return_tuner, 1101 | verbose=verbose, 1102 | tuner_verbose=tuner_verbose, 1103 | **kwargs, 1104 | ) 1105 | 1106 | 1107 | def ensemble_model( 1108 | estimator, 1109 | method: str = "Bagging", 1110 | fold: Optional[Union[int, Any]] = None, 1111 | n_estimators: int = 10, 1112 | round: int = 4, 1113 | choose_better: bool = False, 1114 | optimize: str = "Accuracy", 1115 | fit_kwargs: Optional[dict] = None, 1116 | groups: Optional[Union[str, Any]] = None, 1117 | verbose: bool = True, 1118 | ) -> Any: 1119 | 1120 | """ 1121 | This function ensembles a given estimator. The output of this function is 1122 | a score grid with CV scores by fold. Metrics evaluated during CV can be 1123 | accessed using the ``get_metrics`` function. Custom metrics can be added 1124 | or removed using ``add_metric`` and ``remove_metric`` function. 1125 | 1126 | 1127 | Example 1128 | ------- 1129 | >>> from PyRapidML.datasets import extract_data 1130 | >>> juice = extract_data('juice') 1131 | >>> from PyRapidML.classification import * 1132 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1133 | >>> dt = creating_model('dt') 1134 | >>> bagged_dt = ensemble_model(dt, method = 'Bagging') 1135 | 1136 | 1137 | estimator: scikit-learn compatible object 1138 | Trained model object 1139 | 1140 | 1141 | method: str, default = 'Bagging' 1142 | Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'. 1143 | 1144 | 1145 | fold: int or scikit-learn compatible CV generator, default = None 1146 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1147 | parameter of the ``setup`` function is used. When an integer is passed, 1148 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1149 | ``setup`` function. 1150 | 1151 | 1152 | n_estimators: int, default = 10 1153 | The number of base estimators in the ensemble. In case of perfect fit, the 1154 | learning procedure is stopped early. 1155 | 1156 | 1157 | round: int, default = 4 1158 | Number of decimal places the metrics in the score grid will be rounded to. 1159 | 1160 | 1161 | choose_better: bool, default = False 1162 | When set to True, the returned object is always better performing. The 1163 | metric used for comparison is defined by the ``optimize`` parameter. 1164 | 1165 | 1166 | optimize: str, default = 'Accuracy' 1167 | Metric to compare for model selection when ``choose_better`` is True. 1168 | 1169 | 1170 | fit_kwargs: dict, default = {} (empty dict) 1171 | Dictionary of arguments passed to the fit method of the model. 1172 | 1173 | 1174 | groups: str or array-like, with shape (n_samples,), default = None 1175 | Optional group labels when GroupKFold is used for the cross validation. 1176 | It takes an array with shape (n_samples, ) where n_samples is the number 1177 | of rows in the training dataset. When string is passed, it is interpreted as 1178 | the column name in the dataset containing group labels.
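For instance, a boosted ensemble could be requested as follows (the n_estimators value is illustrative):
>>> boosted_dt = ensemble_model(dt, method = 'Boosting', n_estimators = 25)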
1179 | 1180 | 1181 | verbose: bool, default = True 1182 | Score grid is not printed when verbose is set to False. 1183 | 1184 | 1185 | Returns: 1186 | Trained Model 1187 | 1188 | 1189 | Warnings 1190 | -------- 1191 | - Method 'Boosting' is not supported for estimators that do not have 'class_weights' 1192 | or 'predict_proba' attributes. 1193 | 1194 | """ 1195 | 1196 | return pycaret.internal.tabular.ensemble_model( 1197 | estimator=estimator, 1198 | method=method, 1199 | fold=fold, 1200 | n_estimators=n_estimators, 1201 | round=round, 1202 | choose_better=choose_better, 1203 | optimize=optimize, 1204 | fit_kwargs=fit_kwargs, 1205 | groups=groups, 1206 | verbose=verbose, 1207 | ) 1208 | 1209 | 1210 | def blend_models( 1211 | estimator_list: list, 1212 | fold: Optional[Union[int, Any]] = None, 1213 | round: int = 4, 1214 | choose_better: bool = False, 1215 | optimize: str = "Accuracy", 1216 | method: str = "auto", 1217 | weights: Optional[List[float]] = None, 1218 | fit_kwargs: Optional[dict] = None, 1219 | groups: Optional[Union[str, Any]] = None, 1220 | verbose: bool = True, 1221 | ) -> Any: 1222 | 1223 | """ 1224 | This function trains a Soft Voting / Majority Rule classifier for select 1225 | models passed in the ``estimator_list`` param. The output of this function 1226 | is a score grid with CV scores by fold. Metrics evaluated during CV can be 1227 | accessed using the ``get_metrics`` function. Custom metrics can be added 1228 | or removed using ``add_metric`` and ``remove_metric`` function. 1229 | 1230 | 1231 | Example 1232 | ------- 1233 | >>> from PyRapidML.datasets import extract_data 1234 | >>> juice = extract_data('juice') 1235 | >>> from PyRapidML.classification import * 1236 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1237 | >>> top3 = comparing_models(n_select = 3) 1238 | >>> blender = blend_models(top3) 1239 | 1240 | 1241 | estimator_list: list of scikit-learn compatible objects 1242 | List of trained model objects 1243 | 1244 | 1245 | fold: int or scikit-learn compatible CV generator, default = None 1246 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1247 | parameter of the ``setup`` function is used. When an integer is passed, 1248 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1249 | ``setup`` function. 1250 | 1251 | 1252 | round: int, default = 4 1253 | Number of decimal places the metrics in the score grid will be rounded to. 1254 | 1255 | 1256 | choose_better: bool, default = False 1257 | When set to True, the returned object is always better performing. The 1258 | metric used for comparison is defined by the ``optimize`` parameter. 1259 | 1260 | 1261 | optimize: str, default = 'Accuracy' 1262 | Metric to compare for model selection when ``choose_better`` is True. 1263 | 1264 | 1265 | method: str, default = 'auto' 1266 | 'hard' uses predicted class labels for majority rule voting. 'soft' predicts 1267 | the class label based on the argmax of the sums of the predicted probabilities, 1268 | which is recommended for an ensemble of well-calibrated classifiers. Default 1269 | value, 'auto', will try to use 'soft' and fall back to 'hard' if the former is 1270 | not supported. 1271 | 1272 | 1273 | weights: list, default = None 1274 | Sequence of weights (float or int) to weight the occurrences of predicted class 1275 | labels (hard voting) or class probabilities before averaging (soft voting). Uses 1276 | uniform weights when None.
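For example, a soft-voting blender with explicit (illustrative) weights:
>>> blender = blend_models(top3, method = 'soft', weights = [0.5, 0.3, 0.2])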
1277 | 1278 | 1279 | fit_kwargs: dict, default = {} (empty dict) 1280 | Dictionary of arguments passed to the fit method of the model. 1281 | 1282 | 1283 | groups: str or array-like, with shape (n_samples,), default = None 1284 | Optional group labels when GroupKFold is used for the cross validation. 1285 | It takes an array with shape (n_samples, ) where n_samples is the number 1286 | of rows in the training dataset. When string is passed, it is interpreted as 1287 | the column name in the dataset containing group labels. 1288 | 1289 | 1290 | verbose: bool, default = True 1291 | Score grid is not printed when verbose is set to False. 1292 | 1293 | 1294 | Returns: 1295 | Trained Model 1296 | 1297 | """ 1298 | 1299 | return pycaret.internal.tabular.blend_models( 1300 | estimator_list=estimator_list, 1301 | fold=fold, 1302 | round=round, 1303 | choose_better=choose_better, 1304 | optimize=optimize, 1305 | method=method, 1306 | weights=weights, 1307 | fit_kwargs=fit_kwargs, 1308 | groups=groups, 1309 | verbose=verbose, 1310 | ) 1311 | 1312 | 1313 | def stack_models( 1314 | estimator_list: list, 1315 | meta_model=None, 1316 | fold: Optional[Union[int, Any]] = None, 1317 | round: int = 4, 1318 | method: str = "auto", 1319 | restack: bool = True, 1320 | choose_better: bool = False, 1321 | optimize: str = "Accuracy", 1322 | fit_kwargs: Optional[dict] = None, 1323 | groups: Optional[Union[str, Any]] = None, 1324 | verbose: bool = True, 1325 | ) -> Any: 1326 | 1327 | """ 1328 | This function trains a meta model over select estimators passed in 1329 | the ``estimator_list`` parameter. The output of this function is a 1330 | score grid with CV scores by fold. Metrics evaluated during CV can 1331 | be accessed using the ``get_metrics`` function. Custom metrics 1332 | can be added or removed using ``add_metric`` and ``remove_metric`` 1333 | function. 1334 | 1335 | 1336 | Example 1337 | ------- 1338 | >>> from PyRapidML.datasets import extract_data 1339 | >>> juice = extract_data('juice') 1340 | >>> from PyRapidML.classification import * 1341 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1342 | >>> top3 = comparing_models(n_select = 3) 1343 | >>> stacker = stack_models(top3) 1344 | 1345 | 1346 | estimator_list: list of scikit-learn compatible objects 1347 | List of trained model objects 1348 | 1349 | 1350 | meta_model: scikit-learn compatible object, default = None 1351 | When None, Logistic Regression is trained as a meta model. 1352 | 1353 | 1354 | fold: int or scikit-learn compatible CV generator, default = None 1355 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1356 | parameter of the ``setup`` function is used. When an integer is passed, 1357 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1358 | ``setup`` function. 1359 | 1360 | 1361 | round: int, default = 4 1362 | Number of decimal places the metrics in the score grid will be rounded to. 1363 | 1364 | 1365 | method: str, default = 'auto' 1366 | When set to 'auto', it will invoke, for each estimator, 'predict_proba', 1367 | 'decision_function' or 'predict' in that order. Otherwise, manually pass one 1368 | of the values from 'predict_proba', 'decision_function' or 'predict'. 1369 | 1370 | 1371 | restack: bool, default = True 1372 | When set to False, only the predictions of estimators will be used as 1373 | training data for the ``meta_model``. 1374 | 1375 | 1376 | choose_better: bool, default = False 1377 | When set to True, the returned object is always better performing.
The 1378 | metric used for comparison is defined by the ``optimize`` parameter. 1379 | 1380 | 1381 | optimize: str, default = 'Accuracy' 1382 | Metric to compare for model selection when ``choose_better`` is True. 1383 | 1384 | 1385 | fit_kwargs: dict, default = {} (empty dict) 1386 | Dictionary of arguments passed to the fit method of the model. 1387 | 1388 | 1389 | groups: str or array-like, with shape (n_samples,), default = None 1390 | Optional group labels when GroupKFold is used for the cross validation. 1391 | It takes an array with shape (n_samples, ) where n_samples is the number 1392 | of rows in the training dataset. When a string is passed, it is interpreted as 1393 | the column name in the dataset containing group labels. 1394 | 1395 | 1396 | verbose: bool, default = True 1397 | Score grid is not printed when verbose is set to False. 1398 | 1399 | 1400 | Returns: 1401 | Trained Model 1402 | 1403 | 1404 | Warnings 1405 | -------- 1406 | - When ``method`` is not set to 'auto', it will check if the defined method 1407 | is available for all estimators passed in ``estimator_list``. If the method is 1408 | not implemented by any estimator, it will raise an error. 1409 | 1410 | """ 1411 | 1412 | return pycaret.internal.tabular.stack_models( 1413 | estimator_list=estimator_list, 1414 | meta_model=meta_model, 1415 | fold=fold, 1416 | round=round, 1417 | method=method, 1418 | restack=restack, 1419 | choose_better=choose_better, 1420 | optimize=optimize, 1421 | fit_kwargs=fit_kwargs, 1422 | groups=groups, 1423 | verbose=verbose, 1424 | ) 1425 | 1426 | 1427 | def plot_model( 1428 | estimator, 1429 | plot: str = "auc", 1430 | scale: float = 1, 1431 | save: bool = False, 1432 | fold: Optional[Union[int, Any]] = None, 1433 | fit_kwargs: Optional[dict] = None, 1434 | groups: Optional[Union[str, Any]] = None, 1435 | use_train_data: bool = False, 1436 | verbose: bool = True, 1437 | display_format: Optional[str] = None, 1438 | ) -> str: 1439 | 1440 | """ 1441 | This function analyzes the performance of a trained model on the holdout set. 1442 | It may require re-training the model in certain cases. 1443 | 1444 | Example 1445 | ------- 1446 | >>> from PyRapidML.datasets import extract_data 1447 | >>> juice = extract_data('juice') 1448 | >>> from PyRapidML.classification import * 1449 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1450 | >>> lr = creating_model('lr') 1451 | >>> plot_model(lr, plot = 'auc') 1452 | 1453 | 1454 | estimator: scikit-learn compatible object 1455 | Trained model object 1456 | 1457 | 1458 | plot: str, default = 'auc' 1459 | List of available plots (ID - Name): 1460 | 1461 | * 'auc' - Area Under the Curve 1462 | * 'threshold' - Discrimination Threshold 1463 | * 'pr' - Precision Recall Curve 1464 | * 'confusion_matrix' - Confusion Matrix 1465 | * 'error' - Class Prediction Error 1466 | * 'class_report' - Classification Report 1467 | * 'boundary' - Decision Boundary 1468 | * 'rfe' - Recursive Feature Selection 1469 | * 'learning' - Learning Curve 1470 | * 'manifold' - Manifold Learning 1471 | * 'calibration' - Calibration Curve 1472 | * 'vc' - Validation Curve 1473 | * 'dimension' - Dimension Learning 1474 | * 'feature' - Feature Importance 1475 | * 'feature_all' - Feature Importance (All) 1476 | * 'parameter' - Model Hyperparameter 1477 | * 'lift' - Lift Curve 1478 | * 'gain' - Gain Chart 1479 | * 'tree' - Decision Tree 1480 | 1481 | 1482 | scale: float, default = 1 1483 | The resolution scale of the figure.
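For example, a minimal sketch continuing the example above (the plot ID and scale factor are illustrative): >>> plot_model(lr, plot = 'confusion_matrix', scale = 2)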
1484 | 1485 | 1486 | save: bool, default = False 1487 | When set to True, plot is saved in the current working directory. 1488 | 1489 | 1490 | fold: int or scikit-learn compatible CV generator, default = None 1491 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1492 | parameter of the ``setup`` function is used. When an integer is passed, 1493 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1494 | ``setup`` function. 1495 | 1496 | 1497 | fit_kwargs: dict, default = {} (empty dict) 1498 | Dictionary of arguments passed to the fit method of the model. 1499 | 1500 | 1501 | groups: str or array-like, with shape (n_samples,), default = None 1502 | Optional group labels when GroupKFold is used for the cross validation. 1503 | It takes an array with shape (n_samples, ) where n_samples is the number 1504 | of rows in the training dataset. When a string is passed, it is interpreted as 1505 | the column name in the dataset containing group labels. 1506 | 1507 | 1508 | use_train_data: bool, default = False 1509 | When set to True, the train data will be used for plots, instead 1510 | of the test data. 1511 | 1512 | 1513 | verbose: bool, default = True 1514 | When set to False, progress bar is not displayed. 1515 | 1516 | 1517 | display_format: str, default = None 1518 | To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'. 1519 | Currently, not all plots are supported. 1520 | 1521 | 1522 | Returns: 1523 | None 1524 | 1525 | 1526 | Warnings 1527 | -------- 1528 | - Estimators that do not support the 'predict_proba' attribute cannot be used for 1529 | the 'AUC' and 'calibration' plots. 1530 | 1531 | - When the target is multiclass, 'calibration', 'threshold', 'manifold' and 'rfe' 1532 | plots are not available. 1533 | 1534 | - When the 'max_features' parameter of a trained model object is not equal to 1535 | the number of samples in the training set, the 'rfe' plot is not available. 1536 | 1537 | """ 1538 | 1539 | return pycaret.internal.tabular.plot_model( 1540 | estimator=estimator, 1541 | plot=plot, 1542 | scale=scale, 1543 | save=save, 1544 | fold=fold, 1545 | fit_kwargs=fit_kwargs, 1546 | groups=groups, 1547 | verbose=verbose, 1548 | use_train_data=use_train_data, 1549 | system=True, 1550 | display_format=display_format, 1551 | ) 1552 | 1553 | 1554 | def evaluate_model( 1555 | estimator, 1556 | fold: Optional[Union[int, Any]] = None, 1557 | fit_kwargs: Optional[dict] = None, 1558 | groups: Optional[Union[str, Any]] = None, 1559 | use_train_data: bool = False, 1560 | ): 1561 | 1562 | """ 1563 | This function displays a user interface for analyzing the performance of a trained 1564 | model. It calls the ``plot_model`` function internally. 1565 | 1566 | 1567 | Example 1568 | ------- 1569 | >>> from PyRapidML.datasets import extract_data 1570 | >>> juice = extract_data('juice') 1571 | >>> from PyRapidML.classification import * 1572 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1573 | >>> lr = creating_model('lr') 1574 | >>> evaluate_model(lr) 1575 | 1576 | 1577 | estimator: scikit-learn compatible object 1578 | Trained model object 1579 | 1580 | 1581 | fold: int or scikit-learn compatible CV generator, default = None 1582 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1583 | parameter of the ``setup`` function is used. When an integer is passed, 1584 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1585 | ``setup`` function.
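For example, an explicit CV generator can be passed instead of an integer (a sketch, assuming scikit-learn's StratifiedKFold): >>> from sklearn.model_selection import StratifiedKFold >>> evaluate_model(lr, fold = StratifiedKFold(n_splits = 10))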
1586 | 1587 | 1588 | fit_kwargs: dict, default = {} (empty dict) 1589 | Dictionary of arguments passed to the fit method of the model. 1590 | 1591 | 1592 | groups: str or array-like, with shape (n_samples,), default = None 1593 | Optional group labels when GroupKFold is used for the cross validation. 1594 | It takes an array with shape (n_samples, ) where n_samples is the number 1595 | of rows in the training dataset. When a string is passed, it is interpreted as 1596 | the column name in the dataset containing group labels. 1597 | 1598 | 1599 | use_train_data: bool, default = False 1600 | When set to True, the train data will be used for plots, instead 1601 | of the test data. 1602 | 1603 | 1604 | Returns: 1605 | None 1606 | 1607 | 1608 | Warnings 1609 | -------- 1610 | - This function only works in an IPython enabled notebook. 1611 | 1612 | """ 1613 | 1614 | return pycaret.internal.tabular.evaluate_model( 1615 | estimator=estimator, 1616 | fold=fold, 1617 | fit_kwargs=fit_kwargs, 1618 | groups=groups, 1619 | use_train_data=use_train_data, 1620 | ) 1621 | 1622 | 1623 | def interpret_model( 1624 | estimator, 1625 | plot: str = "summary", 1626 | feature: Optional[str] = None, 1627 | observation: Optional[int] = None, 1628 | use_train_data: bool = False, 1629 | X_new_sample: Optional[pd.DataFrame] = None, 1630 | save: bool = False, 1631 | **kwargs, 1632 | ): 1633 | 1634 | """ 1635 | This function analyzes the predictions generated from a tree-based model. It is 1636 | implemented using SHAP (SHapley Additive exPlanations). For more info on 1637 | this, please see https://shap.readthedocs.io/en/latest/ 1638 | 1639 | Example 1640 | ------- 1641 | >>> from PyRapidML.datasets import extract_data 1642 | >>> juice = extract_data('juice') 1643 | >>> from PyRapidML.classification import * 1644 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1645 | >>> xgboost = creating_model('xgboost') 1646 | >>> interpret_model(xgboost) 1647 | 1648 | 1649 | estimator: scikit-learn compatible object 1650 | Trained model object 1651 | 1652 | 1653 | plot: str, default = 'summary' 1654 | Type of plot. Available options are: 'summary', 'correlation', and 'reason'. 1655 | 1656 | 1657 | feature: str, default = None 1658 | Feature to check correlation with. This parameter is only required when ``plot`` 1659 | type is 'correlation'. When set to None, it uses the first column in the train 1660 | dataset. 1661 | 1662 | 1663 | observation: int, default = None 1664 | Observation index number in the holdout set to explain. When ``plot`` is not 1665 | 'reason', this parameter is ignored. 1666 | 1667 | 1668 | use_train_data: bool, default = False 1669 | When set to True, the train data will be used for plots, instead 1670 | of the test data. 1671 | 1672 | 1673 | X_new_sample: pd.DataFrame, default = None 1674 | Row from an out-of-sample dataframe (neither train nor test data) to be plotted. 1675 | The sample must have the same columns as the raw input data, and it is transformed 1676 | by the preprocessing pipeline automatically before plotting. 1677 | 1678 | 1679 | save: bool, default = False 1680 | When set to True, the plot is saved as a 'png' file in the current working directory. 1681 | 1682 | 1683 | **kwargs**: 1684 | Additional keyword arguments to pass to the plot.
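For instance, a minimal sketch continuing the example above ('reason' explains a single holdout prediction; the observation index is arbitrary): >>> interpret_model(xgboost, plot = 'reason', observation = 0)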
1685 | 1686 | 1687 | Returns: 1688 | None 1689 | 1690 | """ 1691 | 1692 | return pycaret.internal.tabular.interpret_model( 1693 | estimator=estimator, 1694 | plot=plot, 1695 | feature=feature, 1696 | observation=observation, 1697 | use_train_data=use_train_data, 1698 | X_new_sample=X_new_sample, 1699 | save=save, 1700 | **kwargs, 1701 | ) 1702 | 1703 | 1704 | def calibrate_model( 1705 | estimator, 1706 | method: str = "sigmoid", 1707 | fold: Optional[Union[int, Any]] = None, 1708 | round: int = 4, 1709 | fit_kwargs: Optional[dict] = None, 1710 | groups: Optional[Union[str, Any]] = None, 1711 | verbose: bool = True, 1712 | ) -> Any: 1713 | 1714 | """ 1715 | This function calibrates the probability estimates of a given estimator using isotonic 1716 | or logistic regression. The output of this function is a score grid with CV 1717 | scores by fold. Metrics evaluated during CV can be accessed using the 1718 | ``get_metrics`` function. Custom metrics can be added or removed using the 1719 | ``add_metric`` and ``remove_metric`` functions. 1720 | 1721 | 1722 | Example 1723 | ------- 1724 | >>> from PyRapidML.datasets import extract_data 1725 | >>> juice = extract_data('juice') 1726 | >>> from PyRapidML.classification import * 1727 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1728 | >>> dt = creating_model('dt') 1729 | >>> calibrated_dt = calibrate_model(dt) 1730 | 1731 | 1732 | estimator: scikit-learn compatible object 1733 | Trained model object 1734 | 1735 | 1736 | method: str, default = 'sigmoid' 1737 | The method to use for calibration. Can be 'sigmoid' which corresponds to 1738 | Platt's method or 'isotonic' which is a non-parametric approach. 1739 | 1740 | 1741 | fold: int or scikit-learn compatible CV generator, default = None 1742 | Controls cross-validation. If None, the CV generator in the ``fold_strategy`` 1743 | parameter of the ``setup`` function is used. When an integer is passed, 1744 | it is interpreted as the 'n_splits' parameter of the CV generator in the 1745 | ``setup`` function. 1746 | 1747 | 1748 | round: int, default = 4 1749 | Number of decimal places the metrics in the score grid will be rounded to. 1750 | 1751 | 1752 | fit_kwargs: dict, default = {} (empty dict) 1753 | Dictionary of arguments passed to the fit method of the model. 1754 | 1755 | 1756 | groups: str or array-like, with shape (n_samples,), default = None 1757 | Optional group labels when GroupKFold is used for the cross validation. 1758 | It takes an array with shape (n_samples, ) where n_samples is the number 1759 | of rows in the training dataset. When a string is passed, it is interpreted as 1760 | the column name in the dataset containing group labels. 1761 | 1762 | 1763 | verbose: bool, default = True 1764 | Score grid is not printed when verbose is set to False. 1765 | 1766 | 1767 | Returns: 1768 | Trained Model 1769 | 1770 | 1771 | Warnings 1772 | -------- 1773 | - Avoid isotonic calibration with too few calibration samples (< 1000) since it 1774 | tends to overfit.
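For example, isotonic calibration can be requested explicitly (a sketch continuing the example above; mind the sample-size warning): >>> calibrated_dt_iso = calibrate_model(dt, method = 'isotonic')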
1775 | 1776 | """ 1777 | 1778 | return pycaret.internal.tabular.calibrate_model( 1779 | estimator=estimator, 1780 | method=method, 1781 | fold=fold, 1782 | round=round, 1783 | fit_kwargs=fit_kwargs, 1784 | groups=groups, 1785 | verbose=verbose, 1786 | ) 1787 | 1788 | 1789 | def optimize_threshold( 1790 | estimator, 1791 | true_positive: int = 0, 1792 | true_negative: int = 0, 1793 | false_positive: int = 0, 1794 | false_negative: int = 0, 1795 | ): 1796 | 1797 | """ 1798 | This function optimizes the probability threshold for a given estimator using a 1799 | custom cost function. The function displays a plot of the optimized cost as a 1800 | function of the probability threshold between 0.0 and 1.0 and returns the 1801 | optimized threshold value as a numpy float. 1802 | 1803 | 1804 | Example 1805 | ------- 1806 | >>> from PyRapidML.datasets import extract_data 1807 | >>> juice = extract_data('juice') 1808 | >>> from PyRapidML.classification import * 1809 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1810 | >>> lr = creating_model('lr') 1811 | >>> optimize_threshold(lr, true_negative = 10, false_negative = -100) 1812 | 1813 | 1814 | estimator: scikit-learn compatible object 1815 | Trained model object 1816 | 1817 | 1818 | true_positive: int, default = 0 1819 | Cost function or returns for true positive. 1820 | 1821 | 1822 | true_negative: int, default = 0 1823 | Cost function or returns for true negative. 1824 | 1825 | 1826 | false_positive: int, default = 0 1827 | Cost function or returns for false positive. 1828 | 1829 | 1830 | false_negative: int, default = 0 1831 | Cost function or returns for false negative. 1832 | 1833 | 1834 | Returns: 1835 | numpy.float64 1836 | 1837 | 1838 | Warnings 1839 | -------- 1840 | - This function is not supported when the target is multiclass. 1841 | 1842 | """ 1843 | 1844 | return pycaret.internal.tabular.optimize_threshold( 1845 | estimator=estimator, 1846 | true_positive=true_positive, 1847 | true_negative=true_negative, 1848 | false_positive=false_positive, 1849 | false_negative=false_negative, 1850 | ) 1851 | 1852 | 1853 | def predict_model( 1854 | estimator, 1855 | data: Optional[pd.DataFrame] = None, 1856 | probability_threshold: Optional[float] = None, 1857 | encoded_labels: bool = False, 1858 | raw_score: bool = False, 1859 | round: int = 4, 1860 | verbose: bool = True, 1861 | ) -> pd.DataFrame: 1862 | 1863 | """ 1864 | This function predicts ``Label`` and ``Score`` (probability of the predicted 1865 | class) using a trained model. When ``data`` is None, it predicts label and 1866 | score on the holdout set. 1867 | 1868 | 1869 | Example 1870 | ------- 1871 | >>> from PyRapidML.datasets import extract_data 1872 | >>> juice = extract_data('juice') 1873 | >>> from PyRapidML.classification import * 1874 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1875 | >>> lr = creating_model('lr') 1876 | >>> pred_holdout = predict_model(lr) 1877 | >>> pred_unseen = predict_model(lr, data = unseen_dataframe) 1878 | 1879 | 1880 | estimator: scikit-learn compatible object 1881 | Trained model object 1882 | 1883 | 1884 | data: pandas.DataFrame, default = None 1885 | Shape (n_samples, n_features). All features used during training 1886 | must be available in the unseen dataset. 1887 | 1888 | 1889 | probability_threshold: float, default = None 1890 | Threshold for converting predicted probability to class label. 1891 | It defaults to 0.5 for all classifiers unless explicitly defined 1892 | in this parameter.
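For example, a minimal sketch continuing the example above (the threshold value is illustrative): >>> pred_strict = predict_model(lr, probability_threshold = 0.75)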
1893 | 1894 | 1895 | encoded_labels: bool, default = False 1896 | When set to True, labels will be returned encoded as integers. 1897 | 1898 | 1899 | raw_score: bool, default = False 1900 | When set to True, scores for all labels will be returned. 1901 | 1902 | 1903 | round: int, default = 4 1904 | Number of decimal places the metrics in the score grid will be rounded to. 1905 | 1906 | 1907 | verbose: bool, default = True 1908 | When set to False, holdout score grid is not printed. 1909 | 1910 | 1911 | Returns: 1912 | pandas.DataFrame 1913 | 1914 | 1915 | Warnings 1916 | -------- 1917 | - The behavior of ``predict_model`` changed in version 2.1 without backward 1918 | compatibility. As such, pipelines trained using version <= 2.0 may not 1919 | work for inference with version >= 2.1. You can either retrain your models with a 1920 | newer version or downgrade the version for inference. 1921 | 1922 | """ 1923 | 1924 | return pycaret.internal.tabular.predict_model( 1925 | estimator=estimator, 1926 | data=data, 1927 | probability_threshold=probability_threshold, 1928 | encoded_labels=encoded_labels, 1929 | raw_score=raw_score, 1930 | round=round, 1931 | verbose=verbose, 1932 | ml_usecase=MLUsecase.CLASSIFICATION, 1933 | ) 1934 | 1935 | 1936 | def finalize_model( 1937 | estimator, 1938 | fit_kwargs: Optional[dict] = None, 1939 | groups: Optional[Union[str, Any]] = None, 1940 | model_only: bool = True, 1941 | ) -> Any: 1942 | 1943 | """ 1944 | This function trains a given estimator on the entire dataset including the 1945 | holdout set. 1946 | 1947 | 1948 | Example 1949 | ------- 1950 | >>> from PyRapidML.datasets import extract_data 1951 | >>> juice = extract_data('juice') 1952 | >>> from PyRapidML.classification import * 1953 | >>> exp_name = initializer(data = juice, target = 'Purchase') 1954 | >>> lr = creating_model('lr') 1955 | >>> final_lr = finalize_model(lr) 1956 | 1957 | 1958 | estimator: scikit-learn compatible object 1959 | Trained model object 1960 | 1961 | 1962 | fit_kwargs: dict, default = {} (empty dict) 1963 | Dictionary of arguments passed to the fit method of the model. 1964 | 1965 | 1966 | groups: str or array-like, with shape (n_samples,), default = None 1967 | Optional group labels when GroupKFold is used for the cross validation. 1968 | It takes an array with shape (n_samples, ) where n_samples is the number 1969 | of rows in the training dataset. When a string is passed, it is interpreted as 1970 | the column name in the dataset containing group labels. 1971 | 1972 | 1973 | model_only: bool, default = True 1974 | When set to True, only the model object is re-trained and all the 1975 | transformations in the Pipeline are ignored. 1976 | 1977 | 1978 | Returns: 1979 | Trained Model 1980 | 1981 | """ 1982 | 1983 | return pycaret.internal.tabular.finalize_model( 1984 | estimator=estimator, 1985 | fit_kwargs=fit_kwargs, 1986 | groups=groups, 1987 | model_only=model_only, 1988 | ) 1989 | 1990 | 1991 | def deploy_model( 1992 | model, model_name: str, authentication: dict, platform: str = "aws", 1993 | ): 1994 | 1995 | """ 1996 | This function deploys the transformation pipeline and trained model on the cloud.
1997 | 1998 | 1999 | Example 2000 | ------- 2001 | >>> from PyRapidML.datasets import extract_data 2002 | >>> juice = extract_data('juice') 2003 | >>> from PyRapidML.classification import * 2004 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2005 | >>> lr = creating_model('lr') 2006 | >>> deploy_model(model = lr, model_name = 'lr-for-deployment', platform = 'aws', authentication = {'bucket' : 'S3-bucket-name'}) 2007 | 2008 | 2009 | Amazon Web Services (AWS) users: 2010 | To deploy a model on AWS S3 ('aws'), environment variables must be set in your 2011 | local environment. To configure AWS environment variables, type ``aws configure`` 2012 | in the command line. The following information from the IAM portal of your Amazon 2013 | console account is required: 2014 | 2015 | - AWS Access Key ID 2016 | - AWS Secret Access Key 2017 | - Default Region Name (can be seen under Global settings on your AWS console) 2018 | 2019 | More info: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html 2020 | 2021 | 2022 | Google Cloud Platform (GCP) users: 2023 | To deploy a model on Google Cloud Platform ('gcp'), a project must be created 2024 | using the command line or the GCP console. Once the project is created, you must 2025 | create a service account and download the service account key as a JSON file to set 2026 | environment variables in your local environment. 2027 | 2028 | More info: https://cloud.google.com/docs/authentication/production 2029 | 2030 | 2031 | Microsoft Azure (Azure) users: 2032 | To deploy a model on Microsoft Azure ('azure'), the environment variable for the 2033 | connection string must be set in your local environment. Go to the settings of the 2034 | storage account on the Azure portal to access the required connection string. 2035 | 2036 | More info: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?toc=%2Fpython%2Fazure%2FTOC.json 2037 | 2038 | 2039 | model: scikit-learn compatible object 2040 | Trained model object 2041 | 2042 | 2043 | model_name: str 2044 | Name of the model. 2045 | 2046 | 2047 | authentication: dict 2048 | Dictionary of applicable authentication tokens. 2049 | 2050 | When platform = 'aws': 2051 | {'bucket' : 'S3-bucket-name'} 2052 | 2053 | When platform = 'gcp': 2054 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'} 2055 | 2056 | When platform = 'azure': 2057 | {'container': 'azure-container-name'} 2058 | 2059 | 2060 | platform: str, default = 'aws' 2061 | Name of the cloud platform. Currently supported platforms: 'aws', 'gcp' and 'azure'. 2062 | 2063 | 2064 | Returns: 2065 | None 2066 | 2067 | """ 2068 | 2069 | return pycaret.internal.tabular.deploy_model( 2070 | model=model, 2071 | model_name=model_name, 2072 | authentication=authentication, 2073 | platform=platform, 2074 | ) 2075 | 2076 | 2077 | def save_model( 2078 | model, model_name: str, model_only: bool = False, verbose: bool = True, **kwargs 2079 | ): 2080 | 2081 | """ 2082 | This function saves the transformation pipeline and trained model object 2083 | into the current working directory as a pickle file for later use.
2084 | 2085 | Example 2086 | ------- 2087 | >>> from PyRapidML.datasets import extract_data 2088 | >>> juice = extract_data('juice') 2089 | >>> from PyRapidML.classification import * 2090 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2091 | >>> lr = creating_model('lr') 2092 | >>> save_model(lr, 'saved_lr_model') 2093 | 2094 | 2095 | model: scikit-learn compatible object 2096 | Trained model object 2097 | 2098 | 2099 | model_name: str 2100 | Name of the model. 2101 | 2102 | 2103 | model_only: bool, default = False 2104 | When set to True, only the trained model object is saved instead of the 2105 | entire pipeline. 2106 | 2107 | 2108 | verbose: bool, default = True 2109 | Success message is not printed when verbose is set to False. 2110 | 2111 | 2112 | **kwargs**: 2113 | Additional keyword arguments to pass to joblib.dump(). 2114 | 2115 | 2116 | Returns: 2117 | Tuple of the model object and the filename. 2118 | 2119 | """ 2120 | 2121 | return pycaret.internal.tabular.save_model( 2122 | model=model, 2123 | model_name=model_name, 2124 | model_only=model_only, 2125 | verbose=verbose, 2126 | **kwargs, 2127 | ) 2128 | 2129 | 2130 | def load_model( 2131 | model_name, 2132 | platform: Optional[str] = None, 2133 | authentication: Optional[Dict[str, str]] = None, 2134 | verbose: bool = True, 2135 | ): 2136 | 2137 | """ 2138 | This function loads a previously saved pipeline. 2139 | 2140 | 2141 | Example 2142 | ------- 2143 | >>> from PyRapidML.classification import load_model 2144 | >>> saved_lr = load_model('saved_lr_model') 2145 | 2146 | 2147 | model_name: str 2148 | Name of the model. 2149 | 2150 | 2151 | platform: str, default = None 2152 | Name of the cloud platform. Currently supported platforms: 2153 | 'aws', 'gcp' and 'azure'. 2154 | 2155 | 2156 | authentication: dict, default = None 2157 | Dictionary of applicable authentication tokens. 2158 | 2159 | When platform = 'aws': 2160 | {'bucket' : 'S3-bucket-name'} 2161 | 2162 | When platform = 'gcp': 2163 | {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'} 2164 | 2165 | When platform = 'azure': 2166 | {'container': 'azure-container-name'} 2167 | 2168 | 2169 | verbose: bool, default = True 2170 | Success message is not printed when verbose is set to False. 2171 | 2172 | 2173 | Returns: 2174 | Trained Model 2175 | 2176 | """ 2177 | 2178 | return pycaret.internal.tabular.load_model( 2179 | model_name=model_name, 2180 | platform=platform, 2181 | authentication=authentication, 2182 | verbose=verbose, 2183 | ) 2184 | 2185 | 2186 | def automl(optimize: str = "Accuracy", use_holdout: bool = False) -> Any: 2187 | 2188 | """ 2189 | This function returns the best model out of all trained models in the 2190 | current session based on the ``optimize`` parameter. Metrics 2191 | evaluated can be accessed using the ``get_metrics`` function. 2192 | 2193 | 2194 | Example 2195 | ------- 2196 | >>> from PyRapidML.datasets import extract_data 2197 | >>> juice = extract_data('juice') 2198 | >>> from PyRapidML.classification import * 2199 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2200 | >>> top3 = comparing_models(n_select = 3) 2201 | >>> tuned_top3 = [tuning_model(i) for i in top3] 2202 | >>> blender = blend_models(tuned_top3) 2203 | >>> stacker = stack_models(tuned_top3) 2204 | >>> best_auc_model = automl(optimize = 'AUC') 2205 | 2206 | 2207 | optimize: str, default = 'Accuracy' 2208 | Metric to use for model selection. It also accepts custom metrics 2209 | added using the ``add_metric`` function.
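For example, a minimal sketch continuing the example above ('F1' is one of the built-in classification metrics): >>> best_f1_model = automl(optimize = 'F1')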
2210 | 2211 | 2212 | use_holdout: bool, default = False 2213 | When set to True, metrics are evaluated on the holdout set instead of CV. 2214 | 2215 | 2216 | Returns: 2217 | Trained Model 2218 | 2219 | """ 2220 | return pycaret.internal.tabular.automl(optimize=optimize, use_holdout=use_holdout) 2221 | 2222 | 2223 | def pull(pop: bool = False) -> pd.DataFrame: 2224 | 2225 | """ 2226 | Returns the last printed score grid. Use the ``pull`` function after 2227 | any training function to store the score grid as a pandas.DataFrame. 2228 | 2229 | 2230 | pop: bool, default = False 2231 | If True, will pop (remove) the returned dataframe from the 2232 | display container. 2233 | 2234 | 2235 | Returns: 2236 | pandas.DataFrame 2237 | 2238 | """ 2239 | return pycaret.internal.tabular.pull(pop=pop) 2240 | 2241 | 2242 | def models( 2243 | type: Optional[str] = None, internal: bool = False, raise_errors: bool = True, 2244 | ) -> pd.DataFrame: 2245 | 2246 | """ 2247 | Returns a table of models available in the model library. 2248 | 2249 | Example 2250 | ------- 2251 | >>> from PyRapidML.datasets import extract_data 2252 | >>> juice = extract_data('juice') 2253 | >>> from PyRapidML.classification import * 2254 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2255 | >>> all_models = models() 2256 | 2257 | 2258 | type: str, default = None 2259 | - linear : filters and only returns linear models 2260 | - tree : filters and only returns tree based models 2261 | - ensemble : filters and only returns ensemble models 2262 | 2263 | 2264 | internal: bool, default = False 2265 | When True, will return extra columns and rows used internally. 2266 | 2267 | 2268 | raise_errors: bool, default = True 2269 | When False, will suppress all exceptions, ignoring models 2270 | that couldn't be created. 2271 | 2272 | 2273 | Returns: 2274 | pandas.DataFrame 2275 | 2276 | """ 2277 | return pycaret.internal.tabular.models( 2278 | type=type, internal=internal, raise_errors=raise_errors 2279 | ) 2280 | 2281 | 2282 | def get_metrics( 2283 | reset: bool = False, include_custom: bool = True, raise_errors: bool = True, 2284 | ) -> pd.DataFrame: 2285 | 2286 | """ 2287 | Returns a table of available metrics used for CV. 2288 | 2289 | 2290 | Example 2291 | ------- 2292 | >>> from PyRapidML.datasets import extract_data 2293 | >>> juice = extract_data('juice') 2294 | >>> from PyRapidML.classification import * 2295 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2296 | >>> all_metrics = get_metrics() 2297 | 2298 | 2299 | reset: bool, default = False 2300 | When True, will reset all changes made using the ``add_metric`` 2301 | and ``remove_metric`` functions. 2302 | 2303 | 2304 | include_custom: bool, default = True 2305 | Whether to include user added (custom) metrics or not. 2306 | 2307 | 2308 | raise_errors: bool, default = True 2309 | If False, will suppress all exceptions, ignoring metrics that 2310 | couldn't be created. 2311 | 2312 | 2313 | Returns: 2314 | pandas.DataFrame 2315 | 2316 | """ 2317 | 2318 | return pycaret.internal.tabular.get_metrics( 2319 | reset=reset, include_custom=include_custom, raise_errors=raise_errors, 2320 | ) 2321 | 2322 | 2323 | def add_metric( 2324 | id: str, 2325 | name: str, 2326 | score_func: type, 2327 | target: str = "pred", 2328 | greater_is_better: bool = True, 2329 | multiclass: bool = True, 2330 | **kwargs, 2331 | ) -> pd.Series: 2332 | 2333 | """ 2334 | Adds a custom metric to be used for CV.
2335 | 2336 | 2337 | Example 2338 | ------- 2339 | >>> from PyRapidML.datasets import extract_data 2340 | >>> juice = extract_data('juice') 2341 | >>> from PyRapidML.classification import * 2342 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2343 | >>> from sklearn.metrics import log_loss 2344 | >>> add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False) 2345 | 2346 | 2347 | id: str 2348 | Unique id for the metric. 2349 | 2350 | 2351 | name: str 2352 | Display name of the metric. 2353 | 2354 | 2355 | score_func: type 2356 | Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``. 2357 | 2358 | 2359 | target: str, default = 'pred' 2360 | The target of the score function. 2361 | 2362 | - 'pred' for the prediction table 2363 | - 'pred_proba' for pred_proba 2364 | - 'threshold' for decision_function or predict_proba 2365 | 2366 | 2367 | greater_is_better: bool, default = True 2368 | Whether a higher ``score_func`` value is better or not. 2369 | 2370 | 2371 | multiclass: bool, default = True 2372 | Whether the metric supports multiclass targets. 2373 | 2374 | 2375 | **kwargs**: 2376 | Arguments to be passed to the score function. 2377 | 2378 | 2379 | Returns: 2380 | pandas.Series 2381 | 2382 | """ 2383 | 2384 | return pycaret.internal.tabular.add_metric( 2385 | id=id, 2386 | name=name, 2387 | score_func=score_func, 2388 | target=target, 2389 | greater_is_better=greater_is_better, 2390 | multiclass=multiclass, 2391 | **kwargs, 2392 | ) 2393 | 2394 | 2395 | def remove_metric(name_or_id: str): 2396 | 2397 | """ 2398 | Removes a metric from CV. 2399 | 2400 | 2401 | Example 2402 | ------- 2403 | >>> from PyRapidML.datasets import extract_data 2404 | >>> juice = extract_data('juice') 2405 | >>> from PyRapidML.classification import * 2406 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2407 | >>> remove_metric('MCC') 2408 | 2409 | 2410 | name_or_id: str 2411 | Display name or ID of the metric. 2412 | 2413 | 2414 | Returns: 2415 | None 2416 | 2417 | """ 2418 | return pycaret.internal.tabular.remove_metric(name_or_id=name_or_id) 2419 | 2420 | 2421 | def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame: 2422 | 2423 | """ 2424 | Returns a table of experiment logs. Only works when ``log_experiment`` 2425 | was set to True when initializing the ``setup`` function. 2426 | 2427 | 2428 | Example 2429 | ------- 2430 | >>> from PyRapidML.datasets import extract_data 2431 | >>> juice = extract_data('juice') 2432 | >>> from PyRapidML.classification import * 2433 | >>> exp_name = initializer(data = juice, target = 'Purchase', log_experiment = True) 2434 | >>> best = comparing_models() 2435 | >>> exp_logs = get_logs() 2436 | 2437 | 2438 | experiment_name: str, default = None 2439 | When None, the current active run is used. 2440 | 2441 | 2442 | save: bool, default = False 2443 | When set to True, a csv file is saved in the current working directory. 2444 | 2445 | 2446 | Returns: 2447 | pandas.DataFrame 2448 | 2449 | """ 2450 | 2451 | return pycaret.internal.tabular.get_logs(experiment_name=experiment_name, save=save) 2452 | 2453 | 2454 | def get_config(variable: str): 2455 | 2456 | """ 2457 | This function retrieves the global variables created when initializing the 2458 | ``setup`` function.
The following variables are accessible: 2459 | 2460 | - X: Transformed dataset (X) 2461 | - y: Transformed dataset (y) 2462 | - X_train: Transformed train dataset (X) 2463 | - X_test: Transformed test/holdout dataset (X) 2464 | - y_train: Transformed train dataset (y) 2465 | - y_test: Transformed test/holdout dataset (y) 2466 | - seed: random state set through session_id 2467 | - prep_pipe: Transformation pipeline 2468 | - fold_shuffle_param: shuffle parameter used in Kfolds 2469 | - n_jobs_param: n_jobs parameter used in model training 2470 | - html_param: html_param configured through setup 2471 | - create_model_container: results grid storage container 2472 | - master_model_container: model storage container 2473 | - display_container: results display container 2474 | - exp_name_log: Name of experiment 2475 | - logging_param: log_experiment param 2476 | - log_plots_param: log_plots param 2477 | - USI: Unique session ID parameter 2478 | - fix_imbalance_param: fix_imbalance param 2479 | - fix_imbalance_method_param: fix_imbalance_method param 2480 | - data_before_preprocess: data before preprocessing 2481 | - target_param: name of target variable 2482 | - gpu_param: use_gpu param configured through setup 2483 | - fold_generator: CV splitter configured in fold_strategy 2484 | - fold_param: fold params defined in the setup 2485 | - fold_groups_param: fold groups defined in the setup 2486 | - stratify_param: stratify parameter defined in the setup 2487 | 2488 | 2489 | Example 2490 | ------- 2491 | >>> from PyRapidML.datasets import extract_data 2492 | >>> juice = extract_data('juice') 2493 | >>> from PyRapidML.classification import * 2494 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2495 | >>> X_train = get_config('X_train') 2496 | 2497 | 2498 | Returns: 2499 | Global variable 2500 | 2501 | """ 2502 | 2503 | return pycaret.internal.tabular.get_config(variable=variable) 2504 | 2505 | 2506 | def set_config(variable: str, value): 2507 | 2508 | """ 2509 | This function resets the global variables.
The following variables are 2510 | accessible: 2511 | 2512 | - X: Transformed dataset (X) 2513 | - y: Transformed dataset (y) 2514 | - X_train: Transformed train dataset (X) 2515 | - X_test: Transformed test/holdout dataset (X) 2516 | - y_train: Transformed train dataset (y) 2517 | - y_test: Transformed test/holdout dataset (y) 2518 | - seed: random state set through session_id 2519 | - prep_pipe: Transformation pipeline 2520 | - fold_shuffle_param: shuffle parameter used in Kfolds 2521 | - n_jobs_param: n_jobs parameter used in model training 2522 | - html_param: html_param configured through setup 2523 | - create_model_container: results grid storage container 2524 | - master_model_container: model storage container 2525 | - display_container: results display container 2526 | - exp_name_log: Name of experiment 2527 | - logging_param: log_experiment param 2528 | - log_plots_param: log_plots param 2529 | - USI: Unique session ID parameter 2530 | - fix_imbalance_param: fix_imbalance param 2531 | - fix_imbalance_method_param: fix_imbalance_method param 2532 | - data_before_preprocess: data before preprocessing 2533 | - target_param: name of target variable 2534 | - gpu_param: use_gpu param configured through setup 2535 | - fold_generator: CV splitter configured in fold_strategy 2536 | - fold_param: fold params defined in the setup 2537 | - fold_groups_param: fold groups defined in the setup 2538 | - stratify_param: stratify parameter defined in the setup 2539 | 2540 | Example 2541 | ------- 2542 | >>> from PyRapidML.datasets import extract_data 2543 | >>> juice = extract_data('juice') 2544 | >>> from PyRapidML.classification import * 2545 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2546 | >>> set_config('seed', 123) 2547 | 2548 | 2549 | Returns: 2550 | None 2551 | 2552 | """ 2553 | 2554 | return pycaret.internal.tabular.set_config(variable=variable, value=value) 2555 | 2556 | 2557 | def save_config(file_name: str): 2558 | 2559 | """ 2560 | This function saves all global variables to a pickle file, allowing you to 2561 | later resume without rerunning ``setup``. 2562 | 2563 | 2564 | Example 2565 | ------- 2566 | >>> from PyRapidML.datasets import extract_data 2567 | >>> juice = extract_data('juice') 2568 | >>> from PyRapidML.classification import * 2569 | >>> exp_name = initializer(data = juice, target = 'Purchase') 2570 | >>> save_config('myvars.pkl') 2571 | 2572 | 2573 | Returns: 2574 | None 2575 | 2576 | """ 2577 | 2578 | return pycaret.internal.tabular.save_config(file_name=file_name) 2579 | 2580 | 2581 | def load_config(file_name: str): 2582 | 2583 | """ 2584 | This function loads global variables from a pickle file into the Python 2585 | environment. 2586 | 2587 | 2588 | Example 2589 | ------- 2590 | >>> from PyRapidML.classification import load_config 2591 | >>> load_config('myvars.pkl') 2592 | 2593 | 2594 | Returns: 2595 | Global variables 2596 | 2597 | """ 2598 | 2599 | return pycaret.internal.tabular.load_config(file_name=file_name) 2600 | --------------------------------------------------------------------------------