├── img
│   ├── .gitkeep
│   ├── flexml_banner.jpeg
│   ├── flexml_logo.jpeg
│   ├── start_guide_reg_output.jpg
│   └── start_guide_reg_tuning_output.jpg
├── tests
│   ├── __init__.py
│   ├── test_performance.py
│   ├── test_ml_models.py
│   ├── test_cross_validation.py
│   ├── test_feature_engineering.py
│   ├── test_supervised.py
│   └── test_helpers.py
├── flexml
│   ├── logs
│   │   ├── .gitkeep
│   │   └── __init__.py
│   ├── structures
│   │   └── __init__.py
│   ├── logger
│   │   ├── __init__.py
│   │   └── logger.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── supervised_config.py
│   │   └── ml_models.py
│   ├── __init__.py
│   ├── helpers
│   │   ├── __init__.py
│   │   ├── tools.py
│   │   ├── cross_validation.py
│   │   ├── supervised_helpers.py
│   │   ├── plot_model_graphs.py
│   │   └── validators.py
│   ├── classification.py
│   ├── regression.py
│   ├── _feature_engineer.py
│   └── _model_tuner.py
├── codecov.yml
├── MANIFEST.in
├── .github
│   └── workflows
│       └── tests.yml
├── pyproject.toml
├── README.md
├── .gitignore
└── LICENSE

--------------------------------------------------------------------------------
/img/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/flexml/logs/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/flexml/logs/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
comment: false

--------------------------------------------------------------------------------
/flexml/structures/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/flexml/logger/__init__.py:
--------------------------------------------------------------------------------
from flexml.logger.logger import get_logger

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE README.md
recursive-include tests *.py

--------------------------------------------------------------------------------
/img/flexml_banner.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/flexml_banner.jpeg

--------------------------------------------------------------------------------
/img/flexml_logo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/flexml_logo.jpeg

--------------------------------------------------------------------------------
/img/start_guide_reg_output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/start_guide_reg_output.jpg

--------------------------------------------------------------------------------
/img/start_guide_reg_tuning_output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/start_guide_reg_tuning_output.jpg
--------------------------------------------------------------------------------
/flexml/config/__init__.py:
--------------------------------------------------------------------------------
from flexml.config.ml_models import (
    get_ml_models
)

from flexml.config.supervised_config import (
    EVALUATION_METRICS,
    TUNING_METRIC_TRANSFORMATIONS,
    CROSS_VALIDATION_METHODS,
    FEATURE_ENGINEERING_METHODS,
    PLOT_TYPES
)

--------------------------------------------------------------------------------
/flexml/__init__.py:
--------------------------------------------------------------------------------
"""
FlexML: Easy-to-use and flexible AutoML library for Python
"""

from flexml.helpers.tools import check_numpy_dtype_error
check_numpy_dtype_error()  # Check for the chronic Colab version issue

from .regression import Regression
from .classification import Classification

__version__ = "1.1.0"

__all__ = ["Regression", "Classification"]

--------------------------------------------------------------------------------
/flexml/helpers/__init__.py:
--------------------------------------------------------------------------------
from flexml.helpers.tools import check_numpy_dtype_error
check_numpy_dtype_error()  # Check for the chronic Colab version issue

from flexml.helpers.validators import (
    eval_metric_checker,
    random_state_checker,
    cross_validation_checker,
    validate_inputs
)
from flexml.helpers.cross_validation import get_cv_splits
from flexml.helpers.supervised_helpers import evaluate_model_perf
from flexml.helpers.plot_model_graphs import (
    plot_feature_importance,
    plot_confusion_matrix,
    plot_roc_curve,
    plot_shap,
    plot_residuals,
    plot_prediction_error,
    plot_calibration_curve
)
from flexml.helpers.tools import is_interactive_notebook

--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
name: Run Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

permissions:
  id-token: write
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install uv
          python -m uv pip install .[test]

      - name: Run tests
        run: pytest --cov --cov-branch --cov-report=xml

      - name: Upload results to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}

--------------------------------------------------------------------------------
/flexml/helpers/tools.py:
--------------------------------------------------------------------------------
from IPython import get_ipython
from flexml.logger import get_logger
import warnings
warnings.filterwarnings("ignore")


def is_interactive_notebook():
    """Detects interactive environments, including Jupyter and Colab"""
    try:
        # Get the shell class name
        shell = get_ipython().__class__.__name__
        # Both Jupyter and Colab have specific shell names
        if shell in ['ZMQInteractiveShell', 'Shell']:  # ZMQ is for Jupyter, Shell is for Colab
            return True
        return False
    except Exception:
        # get_ipython() will not be defined in non-interactive environments
        return False


def check_numpy_dtype_error():
    """
    Checks if the numpy version is compatible with the pandas version in Colab
    """
    logger = get_logger(__name__, "PROD")
    try:
        shell = get_ipython().__class__.__name__
        if shell != "Shell":  # If the environment is not Colab, there is no need for this check since the issue only happens in Colab
            return

        import pandas
    except ValueError as e:  # Catch ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
        if 'numpy.dtype size changed' in str(e):
            logger.warning("Colab has a chronic version issue, restarting the kernel... (details: https://shorturl.at/ZMJBh)")
            try:
                import os
                os.kill(os.getpid(), 9)
            except Exception:  # If it fails, try to exit the program
                exit()
        else:
            raise e

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "flexml"
version = "1.1.0"
authors = [
    { name="Ozgur Aslan", email="ozguraslank@gmail.com"},
]
description = "Easy-to-use and flexible AutoML library for Python"
readme = "README.md"
requires-python = ">=3.9,<3.13"
license = { file = "LICENSE" }
keywords = ["AutoML", "Machine Learning", "Data Science", "Regression", "Classification"]
classifiers = [
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Operating System :: OS Independent",
    "License :: OSI Approved :: Apache Software License",
]

dependencies = [
    "numpy>=1.21,<=1.26.4",
    "pandas>=2.0.1,<2.2",
    "scikit-learn>=1.5.0,<=1.5.2",
    "xgboost>=2.0.0,<3.0.0",
    "lightgbm>=4.0.0",
    "catboost>=1.2.5",
    "tqdm>=4.60.0",
    "optuna>=3.0.0",
    "ipython>=7.11.0",
    "jinja2>=3.1.0",
    "nbformat>=5.10.0",
    "plotly>=6.0.0",
    "yellowbrick>=1.5",
    "shap>=0.46.0",
    "rich>=13.9.0",
    'setuptools; python_version>="3.12"',
]

[project.optional-dependencies]
test = [
    "pytest>=8.0.1",
    "parameterized>=0.8.1",
    "pytest-cov>=6.0.0",
    "seaborn>=0.13.0",
]

[project.urls]
Repository = "https://github.com/ozguraslank/flexml"
Issues = "https://github.com/ozguraslank/flexml/issues"

[tool.setuptools.packages.find]
where = ["."]
include = ["flexml*"]
exclude = ["tests*"]
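The `[test]` extra defined above is what the CI workflow installs; a minimal sketch of the same setup locally (assuming a checkout of the repository root):

```bash
pip install --upgrade pip
pip install ".[test]"   # pulls pytest, parameterized, pytest-cov and seaborn
pytest --cov            # run the suite the same way the workflow does
```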
--------------------------------------------------------------------------------
/tests/test_performance.py:
--------------------------------------------------------------------------------
from seaborn import load_dataset
import unittest
from flexml import Classification, Regression
import numpy as np

import warnings
warnings.filterwarnings("ignore")


class PerformanceTest(unittest.TestCase):
    """
    Test cases for the performance of the Classification and Regression classes
    """

    df_class = load_dataset('diamonds')
    # Set seed for reproducibility
    np.random.seed(42)
    # Randomly select 20% of the data (excluding the 'cut' target) and set it to NaN
    mask = np.random.rand(*df_class.shape) < 0.2
    mask[:, df_class.columns.get_loc('cut')] = False
    df_class.where(~mask, np.nan, inplace=True)
    fml_class = Classification(df_class, target_col='cut')

    df_regression = load_dataset('diamonds')
    # Randomly select 20% of the data (excluding the 'price' target) and set it to NaN
    mask = np.random.rand(*df_regression.shape) < 0.2
    mask[:, df_regression.columns.get_loc('price')] = False
    df_regression.where(~mask, np.nan, inplace=True)
    fml_regression = Regression(df_regression, target_col='price')

    def test_performance_classification(self):
        """
        Performance test for the Classification class
        """
        self.fml_class.start_experiment(experiment_size="wide", cv_method="holdout")

        # Calculate the average Accuracy score
        avg_accuracy = self.fml_class._model_stats_df["Accuracy"].mean()
        self.assertGreater(
            avg_accuracy,
            0.55,
            f"Average Accuracy score {avg_accuracy:.4f} is not greater than 0.55"
        )

    def test_performance_regression(self):
        """
        Performance test for the Regression class
        """
        self.fml_regression.start_experiment(experiment_size="wide", cv_method="holdout")

        # Calculate the average R2 score
        avg_r2 = self.fml_regression._model_stats_df["R2"].mean()
        self.assertGreater(
            avg_r2,
            0.75,
            f"Average R² score {avg_r2:.4f} is not greater than 0.75"
        )
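The NaN-injection idiom used in the test above is useful on its own; a self-contained sketch of injecting roughly 20% missingness into every column except the target (`target_col` is a placeholder name):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame({"a": range(10), "b": range(10), "target_col": range(10)})

mask = rng.random(df.shape) < 0.2                  # True in ~20% of the cells
mask[:, df.columns.get_loc("target_col")] = False  # never mask the target
df = df.where(~mask, np.nan)                       # replace masked cells with NaN
```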
--------------------------------------------------------------------------------
/flexml/logger/logger.py:
--------------------------------------------------------------------------------
import os
import logging

__LOG_DIR_PATH = "logs"
__LOG_FILE_PATH = os.path.join(__LOG_DIR_PATH, "flexml_logs.log")

def _logger_configuration(log_level: str, logging_to_file: bool = False):
    """
    Configures the logger to save logs to a file or not.

    Parameters
    ----------
    log_level: str,
        The log level to set for the logger. It can be either "TEST" or "PROD"

    logging_to_file : bool, (default=False)
        If True, logs are saved to /logs/flexml_logs.log. Otherwise, logs are not saved to a file.
    """
    handlers = [logging.StreamHandler()]
    log_format = None

    if log_level == "TEST":
        log_format = '%(levelname)s | %(asctime)-3s | %(name)s.%(funcName)s | %(message)-3s'
    elif log_level == "PROD":
        log_format = '%(levelname)s | %(asctime)-3s | %(message)-3s'
    else:
        raise ValueError("Invalid log level. It should be either 'TEST' or 'PROD'.")

    if logging_to_file:
        os.makedirs(__LOG_DIR_PATH, exist_ok=True)
        handlers.append(logging.FileHandler(__LOG_FILE_PATH))

    logging.basicConfig(
        level="INFO",
        format=log_format,
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=handlers,
        force=True
    )

    # Set some of the libraries' logging to ERROR level to reduce verbosity
    logging.getLogger('shap').setLevel(logging.ERROR)
    logging.getLogger('sklearn').setLevel(logging.ERROR)
    logging.getLogger("numexpr").setLevel(logging.ERROR)

def get_logger(
    name: str,
    log_level: str,
    logging_to_file: bool = False
) -> logging.Logger:
    """
    Returns a logger object with the given name

    Parameters
    ----------
    name : str
        The name of the logger (It's always the name of the class or the module)

    log_level: str
        The log level to set for the logger. It can be either "TEST" or "PROD"

        Example output for TEST
        >>> logger = get_logger("test_logger", "TEST")
        >>> logger.info("This is a test message")
        >>> INFO | 2021-07-07 12:00:00 | test_logger.<module> | This is a test message

        Example output for PROD
        >>> logger = get_logger("test_logger", "PROD")
        >>> logger.info("This is a test message")
        >>> INFO | 2021-07-07 12:00:00 | This is a test message

    logging_to_file : bool, (default=False)
        If True, logs are saved to /logs/flexml_logs.log. Otherwise, logs are not saved to a file

    Returns
    -------
    logger : logging.Logger
        The logger object with the given name
    """
    _logger_configuration(log_level, logging_to_file)
    return logging.getLogger(name)
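A short sketch of how the logger above behaves with file logging enabled (the message text and timestamp are illustrative):

```python
from flexml.logger import get_logger

# Writes to the stream handler and, because logging_to_file=True,
# also to logs/flexml_logs.log in the current path
logger = get_logger(__name__, "PROD", logging_to_file=True)
logger.info("Experiment started")  # -> INFO | 2025-01-01 12:00:00 | Experiment started
```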
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![](https://img.shields.io/pypi/v/flexml.svg) ![Python versions](https://img.shields.io/pypi/pyversions/flexml) [![Code Coverage](https://codecov.io/gh/ozguraslank/flexml/branch/main/graph/badge.svg)](https://codecov.io/gh/ozguraslank/flexml)
# FlexML

<div align="center">
<img src="/img/flexml_banner.jpeg" alt="drawing"/>
</div>

## Introduction

FlexML is an easy-to-use and flexible AutoML library for Python that simplifies the process of building machine learning models. It automates model selection and hyperparameter tuning, and it lets you customize the size of your experiments: you can train all available models in the library, or only a subset of them for faster results. FlexML adapts to your needs!

At the moment, FlexML supports only regression and classification tasks and offers two experiment modes, 'quick' and 'wide', allowing users to choose between fitting the most commonly used machine learning models in the field or the full range of models available in the library.

## How to Install

```bash
pip install flexml
```

## Start Guide with Regression Experiment

```python
# Experiment for a Regression problem on the California House Value Prediction dataset in Quick mode

from flexml import Regression
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True)['frame']

reg_exp = Regression(df, target_col="MedHouseVal")
reg_exp.start_experiment()
```
--> Once the **start_experiment()** process finishes, you will see the model leaderboard as below:

<div align="center">
<img src="/img/start_guide_reg_output.jpg" alt="drawing"/>
</div>

```python
# Get the best model
best_model = reg_exp.get_best_models()

# Get the best model by name (alternative)
best_model = reg_exp.get_model_by_name("CatBoostRegressor")

# Tune a model (by default, the best model is tuned with randomized search)
reg_exp.tune_model()
```

--> Once the **tune_model()** process finishes, you will see the updated model leaderboard as below:

<div align="center">
<img src="/img/start_guide_reg_tuning_output.jpg" alt="drawing"/>
</div>

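Predictions can then be generated straight from the experiment object; a small sketch based on the library's test suite (`new_df` is a placeholder for your own feature frame, without the target column):

```python
# Predict with the experiment's best pipeline
preds = reg_exp.predict(new_df)
```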
You can also take a look at the Jupyter notebook files in the 'notebooks' folder of the repository for more detailed explanations of the usage

## How to Contribute:

1. **Fork the repository:** Click on the 'Fork' button at the top right corner of the GitHub repository page
2. **Create a new branch:** Name your branch descriptively based on the feature or fix you're working on
3. **Make your changes:** Write code and tests to add your feature or fix the issue
   - You can take a look at the **tests** folder in the repository to see the current unit tests
4. **Run tests:** Ensure all existing and new tests pass
5. **Submit a pull request:** Open a pull request with a clear description of your changes

--------------------------------------------------------------------------------
/flexml/config/supervised_config.py:
--------------------------------------------------------------------------------
# Regression & Classification Evaluation Metrics
EVALUATION_METRICS = {
    "Regression": {"DEFAULT": "R2",
                   "ALL": ["R2", "MAE", "MSE", "RMSE", "MAPE"]},

    "Classification": {"DEFAULT": "Accuracy",
                       "ALL": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]}
}

# Model Tuning Metric Transformations
TUNING_METRIC_TRANSFORMATIONS = {
    "Regression": {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'MSE': 'neg_mean_squared_error',
        'RMSE': 'neg_root_mean_squared_error',
        'MAPE': 'neg_mean_absolute_percentage_error'
    },

    "Classification": {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1 Score': 'f1_weighted',
        'ROC-AUC': 'roc_auc'
    },

    "reverse_signed_eval_metrics": ['MAE', 'MSE', 'RMSE', 'MAPE']
    # These metrics are used in negative form during optimization, so we need to reverse the sign later, e.g. from -0.42 to 0.42
}

# Supported Cross-Validation Methods
CROSS_VALIDATION_METHODS = {
    # 'all' is used by the get_cv_splits() function at helpers/cross_validation.py, while the others are used for SupervisedBase's validations
    'all': {
        'kfold': 'kfold',
        'stratified_kfold': 'stratifiedkfold',
        'holdout': 'holdout',
        'stratified_shuffle_split': 'stratifiedshufflesplit',
        'shuffle_split': 'shufflesplit',
        'group_kfold': 'groupkfold',
        'group_shuffle_split': 'groupshufflesplit'
    },

    'Regression': {
        'kfold': 'kfold',
        'holdout': 'holdout',
        'shuffle_split': 'shufflesplit',
        'group_kfold': 'groupkfold',
        'group_shuffle_split': 'groupshufflesplit'
    },

    'Classification': {
        'kfold': 'kfold',
        'stratified_kfold': 'stratifiedkfold',
        'holdout': 'holdout',
        'shuffle_split': 'shufflesplit',
        'stratified_shuffle_split': 'stratifiedshufflesplit',
        'group_kfold': 'groupkfold',
        'group_shuffle_split': 'groupshufflesplit'
    }
}

# Feature Engineering Methods That Can Be Used
FEATURE_ENGINEERING_METHODS = {
    "accepted_numeric_imputations_methods": ['median', 'mean', 'mode', 'constant', 'drop'],
    "accepted_categorical_imputations_methods": ['mode', 'constant', 'drop'],
    "accepted_encoding_methods": ['label_encoder', 'onehot_encoder', 'ordinal_encoder'],
    "accepted_standardization_methods": ['standard_scaler', 'normalize_scaler', 'robust_scaler', 'quantile_transformer', 'minmax_scaler', 'maxabs_scaler']
}

# Supported Plot Types
PLOT_TYPES = {
    "Regression": ["feature_importance", "residuals", "prediction_error", "shap_summary", "shap_violin"],
    "Classification": ["feature_importance", "confusion_matrix", "roc_curve", "shap_summary", "shap_violin", "calibration_curve"]
}
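A small sketch of how these config dictionaries can be consumed, e.g. turning FlexML's metric names into the scikit-learn scorer strings used for tuning (variable names are illustrative):

```python
from flexml.config import EVALUATION_METRICS, TUNING_METRIC_TRANSFORMATIONS

task = "Regression"
default_metric = EVALUATION_METRICS[task]["DEFAULT"]  # 'R2'
scorer = TUNING_METRIC_TRANSFORMATIONS[task]["RMSE"]  # 'neg_root_mean_squared_error'

# Metrics optimized in negative form get their sign flipped before reporting
needs_sign_flip = "RMSE" in TUNING_METRIC_TRANSFORMATIONS["reverse_signed_eval_metrics"]  # True
```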
--------------------------------------------------------------------------------
/tests/test_ml_models.py:
--------------------------------------------------------------------------------
import unittest
import numpy as np
from parameterized import parameterized
from sklearn.datasets import load_diabetes, load_breast_cancer
from flexml.regression import Regression
from flexml.classification import Classification
from flexml.helpers import get_cv_splits
from flexml.logger import get_logger
from flexml.config import get_ml_models

import warnings
warnings.filterwarnings("ignore")

class TestMLModels(unittest.TestCase):
    logger = get_logger(__name__, "TEST", logging_to_file=False)
    logger.setLevel("DEBUG")

    test_config = {
        'Regression': {
            'data': load_diabetes(as_frame=True)['frame'],
            'target_col': 'target',
            'exp_class': Regression,
            'models': get_ml_models(ml_task_type="Regression")['WIDE']
        },
        'Classification': {
            'data': load_breast_cancer(as_frame=True)['frame'],
            'target_col': 'target',
            'exp_class': Classification,
            'models': get_ml_models(ml_task_type="Classification")['WIDE']
        }
    }

    experiments = {}
    cv_splitters = {}

    for objective, config in test_config.items():
        exp = config['exp_class'](
            data=config['data'],
            target_col=config['target_col'],
            logging_to_file=False
        )
        experiments[objective] = exp

        cv_splitters[objective] = get_cv_splits(
            df=config['data'],
            cv_method="holdout",
            test_size=0.5  # Keeping test_size high to make the training faster
        )

    @parameterized.expand([
        (objective, model_pack['name'], model_pack['model'], model_pack['tuning_param_grid'])
        for objective, config in test_config.items()
        for model_pack in config['models']
    ])
    def test_ml_models(self, objective, model_name, model, model_tuning_params):
        exp = self.experiments[objective]
        cv_splitter = self.cv_splitters[objective]

        X, y = exp.X, exp.y
        train_idx = cv_splitter[0][0]  # holdout validation returns splits in [(train_index, test_index)] format

        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]

        model.fit(X_train, y_train)

        # If it's a classification problem
        if objective == 'Classification':
            predictions = model.predict_proba(X_train)
        else:
            predictions = model.predict(X_train)

        self.assertIsInstance(predictions, np.ndarray)

        try:
            exp.tune_model(
                model=model,
                tuning_method="randomized_search",
                param_grid=model_tuning_params,
                n_iter=3,
                n_folds=3,
                n_jobs=-1
            )

        except Exception as e:
            if 'Model leaderboard is empty!' in str(e):
                # Since we don't use the start_experiment() function, there are no saved models and this error will be raised --
                # because we call _show_tuning_report when the tune_model operation is done, and that function calls get_best_models(), which calls __top_n_models_checker(), where the error is raised :)
                pass
            else:
                # Handle other exceptions
                error_msg = f"An error occurred while tuning the {model_name} model with the following param_grid {model_tuning_params}. Error: {e}"
                self.logger.error(error_msg)
                raise Exception(error_msg)
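A brief sketch of inspecting the model pool these tests iterate over (the key names follow the usage above):

```python
from flexml.config import get_ml_models

wide_models = get_ml_models(ml_task_type="Regression")["WIDE"]
for model_pack in wide_models:
    # Each pack bundles a display name, an estimator instance and its tuning grid
    print(model_pack["name"])  # estimator in model_pack["model"], grid in model_pack["tuning_param_grid"]
```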
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Addition to below
*.xml
*.iml
*.tsv
*.json
*.txt
*.tfevents
*.csv
*.idea

# generated docs items
docs/site/
docs/docs/_partials/termynal.md
docs/docs/_partials/*/*.html

# test cache
manual_test/

# other local dev info
.vscode/

# Mac OS-specific storage files
.DS_Store

# vim
*.swp
*.swo

## https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/flexml/classification.py:
--------------------------------------------------------------------------------
from flexml.structures.supervised_base import SupervisedBase

class Classification(SupervisedBase):
    """
    A class to train and evaluate different classification models.

    Parameters
    ----------
    data : pd.DataFrame
        The input data for the model training process

    target_col : str
        The target column name in the data

    random_state : int, (default=42)
        The random state for data processing processes

    drop_columns : list, default=None
        Columns that will be dropped from the data

    categorical_imputation_method : str, default='mode'
        Imputation method for categorical columns. Options:
        * 'mode': Replace missing values with the most frequent value
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    numerical_imputation_method : str, default='mean'
        Imputation method for numerical columns. Options:
        * 'mean': Replace missing values with the column mean
        * 'median': Replace missing values with the column median
        * 'mode': Replace missing values with the column mode
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    column_imputation_map : dict, default=None
        Custom mapping of columns to specific imputation methods
        Example usage: {'column_name': 'mean', 'column_name2': 'mode'}

    categorical_imputation_constant : str, default='Unknown'
        The constant value for imputing categorical columns when 'constant' is selected

    numerical_imputation_constant : float, default=0.0
        The constant value for imputing numerical columns when 'constant' is selected

    encoding_method : str, default='onehot_encoder'
        Encoding method for categorical columns. Options:
        * 'label_encoder': Use label encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
        * 'onehot_encoder': Use one-hot encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
        * 'ordinal_encoder': Use ordinal encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

    onehot_limit : int, default=25
        Maximum number of categories to use for one-hot encoding

    encoding_method_map : dict, default=None
        Custom mapping of columns to encoding methods
        Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'}

    ordinal_encode_map : dict, default=None
        Custom mapping of columns to category order for ordinal encoding
        Example usage: {'column_name': ['low', 'medium', 'high']}

    normalize : str, default=None
        Scaling method for the numerical columns. Options:
        * 'standard_scaler': Standardize the data using StandardScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        * 'minmax_scaler': Scale the data using MinMaxScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        * 'robust_scaler': Scale the data using RobustScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
        * 'quantile_transformer': Transform the data using QuantileTransformer
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
        * 'maxabs_scaler': Scale the data using MaxAbsScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
        * 'normalize_scaler': Normalize the data to unit length
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html

    shuffle: bool, (default=True)
        If True, the data will be shuffled before the model training process

    logging_to_file: bool, (default=False)
        If True, the logs will be saved to a file in the current path, located at /logs/flexml_logs.log. Otherwise, they will not be saved

    Example
    -------
    >>> from flexml import Classification
    >>> df = pd.read_csv("MY_DATA.csv")
    >>> classification_exp = Classification(data=df, target_col='target_col')
    >>> classification_exp.start_experiment(experiment_size = 'quick')
    >>> classification_exp.show_model_stats(eval_metric='accuracy')

    ------------------------------------------------------------
    | model_name            |accuracy|precision|recall|f1_score|
    ------------------------|--------|---------|------|--------|
    | LogisticRegression    | 0.7863 | 0.6721  |0.5921| 0.2469 |
    | DecisionTreeClassifier| 0.7725 | 0.6441  |0.4642| 0.4347 |
    | LGBMClassifier        | 0.7521 | 0.4751  |0.3531| 0.1445 |
    | RidgeClassifier       | 0.7011 | 0.7590  |0.6155| 0.3411 |
    | XGBClassifier         | 0.6213 | 0.4701  |0.2923| 0.4039 |
    ------------------------------------------------------------
    >>> best_model = classification_exp.get_best_models(eval_metric = 'accuracy')
    """
    pass
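A hedged sketch that exercises the preprocessing parameters documented above (the dataframe and column names are placeholders):

```python
from flexml import Classification

exp = Classification(
    data=df,
    target_col="churn",
    numerical_imputation_method="median",
    encoding_method_map={"plan": "ordinal_encoder"},
    ordinal_encode_map={"plan": ["basic", "standard", "premium"]},
    normalize="robust_scaler",
)
exp.start_experiment(experiment_size="quick")
```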
--------------------------------------------------------------------------------
/flexml/regression.py:
--------------------------------------------------------------------------------
from flexml.structures.supervised_base import SupervisedBase

class Regression(SupervisedBase):
    """
    A class to train and evaluate different regression models

    Parameters
    ----------
    data : pd.DataFrame
        The input data for the model training process

    target_col : str
        The target column name in the data

    random_state : int, (default=42)
        The random state for data processing processes

    drop_columns : list, default=None
        Columns that will be dropped from the data

    categorical_imputation_method : str, default='mode'
        Imputation method for categorical columns. Options:
        * 'mode': Replace missing values with the most frequent value
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    numerical_imputation_method : str, default='mean'
        Imputation method for numerical columns. Options:
        * 'mean': Replace missing values with the column mean
        * 'median': Replace missing values with the column median
        * 'mode': Replace missing values with the column mode
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    column_imputation_map : dict, default=None
        Custom mapping of columns to specific imputation methods
        Example usage: {'column_name': 'mean', 'column_name2': 'mode'}

    categorical_imputation_constant : str, default='Unknown'
        The constant value for imputing categorical columns when 'constant' is selected

    numerical_imputation_constant : float, default=0.0
        The constant value for imputing numerical columns when 'constant' is selected

    encoding_method : str, default='onehot_encoder'
        Encoding method for categorical columns. Options:
        * 'label_encoder': Use label encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
        * 'onehot_encoder': Use one-hot encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
        * 'ordinal_encoder': Use ordinal encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

    onehot_limit : int, default=25
        Maximum number of categories to use for one-hot encoding

    encoding_method_map : dict, default=None
        Custom mapping of columns to encoding methods
        Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'}

    ordinal_encode_map : dict, default=None
        Custom mapping of columns to category order for ordinal encoding
        Example usage: {'column_name': ['low', 'medium', 'high']}

    normalize : str, default=None
        Scaling method for the numerical columns. Options:
        * 'standard_scaler': Standardize the data using StandardScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        * 'minmax_scaler': Scale the data using MinMaxScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        * 'robust_scaler': Scale the data using RobustScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
        * 'quantile_transformer': Transform the data using QuantileTransformer
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
        * 'maxabs_scaler': Scale the data using MaxAbsScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
        * 'normalize_scaler': Normalize the data to unit length
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html

    shuffle: bool, (default=True)
        If True, the data will be shuffled before the model training process

    logging_to_file: bool, (default=False)
        If True, the logs will be saved to a file in the current path, located at /logs/flexml_logs.log. Otherwise, they will not be saved

    Example
    -------
    >>> from flexml import Regression
    >>> df = pd.read_csv("MY_DATA.csv")
    >>> reg_exp = Regression(data=df, target_col='target_col')
    >>> reg_exp.start_experiment(experiment_size = 'quick')
    >>> reg_exp.show_model_stats(eval_metric='r2')

    ---------------------------------------------------------------------
    | model_name            |   r2   |   mae   | mse  |  rmse  |  mape  |
    ------------------------|--------|---------|------|--------|--------|
    | LinearRegression      | 0.7863 | 0.6721  |0.5921| 0.2469 | 0.2011 |
    | DecisionTreeRegressor | 0.7725 | 0.6441  |0.4642| 0.4347 | 0.3011 |
    | LGBMRegressor         | 0.7521 | 0.4751  |0.3531| 0.1445 | 0.1011 |
    | Ridge                 | 0.7011 | 0.7590  |0.6155| 0.3411 | 0.2011 |
    | XGBRegressor          | 0.6213 | 0.4701  |0.2923| 0.4039 | 0.3011 |
    | DecisionTreeRegressor | 0.6096 | 0.4541  |0.2821| 0.4011 | 0.3011 |
    | ElasticNet            | 0.5812 | 0.4201  |0.2111| 0.3011 | 0.2011 |
    | Lasso                 | 0.5209 | 0.4101  |0.2011| 0.2911 | 0.2011 |
    ---------------------------------------------------------------------
    >>> best_model = reg_exp.get_best_models(eval_metric = 'r2')
    """
    pass
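A hedged sketch of a group-aware regression run; the parameter names follow the library's test suite, and `store_id` is a placeholder column:

```python
from flexml import Regression

reg = Regression(df, target_col="sales")
# Keep all rows of the same store in the same fold
reg.start_experiment(
    experiment_size="quick",
    cv_method="group_kfold",
    n_folds=5,
    groups_col="store_id",
)
```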
--------------------------------------------------------------------------------
/flexml/helpers/cross_validation.py:
--------------------------------------------------------------------------------
import pandas as pd
from typing import Optional, Any, Iterator
from sklearn.model_selection import (KFold, StratifiedKFold, ShuffleSplit,
                                     StratifiedShuffleSplit, train_test_split,
                                     GroupKFold, GroupShuffleSplit)
from flexml.config import CROSS_VALIDATION_METHODS
from flexml.helpers import cross_validation_checker
from flexml.logger import get_logger


def get_cv_splits(
    df: pd.DataFrame,
    cv_method: str = "kfold",
    n_folds: Optional[int] = None,
    test_size: Optional[float] = None,
    y_array: Optional[pd.Series] = None,
    groups_col: Optional[str] = None,
    random_state: Optional[int] = None,
    shuffle: bool = True,
    ml_task_type: Optional[str] = None,
    logging_to_file: bool = False
) -> Iterator[Any]:
    """
    Returns indices for cross-validation splits according to the specified method and parameters.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset (features and target combined)

    cv_method : str, (default='kfold' for Regression and 'stratified_kfold' for Classification if `ml_task_type` is provided, else 'kfold')
        Cross-validation method to use. Options:
        - For Regression:
            - "kfold" (default) (Provide `n_folds`)
            - "holdout" (Provide `test_size`)
            - "shuffle_split" (Provide `n_folds` and `test_size`)
            - "group_kfold" (Provide `n_folds` and `groups_col`)
            - "group_shuffle_split" (Provide `n_folds`, `test_size`, and `groups_col`)

        - For Classification:
            - "kfold" (Provide `n_folds`)
            - "stratified_kfold" (default) (Provide `n_folds`)
            - "holdout" (Provide `test_size`)
            - "stratified_shuffle_split" (Provide `n_folds` and `test_size`)
            - "group_kfold" (Provide `n_folds` and `groups_col`)
            - "group_shuffle_split" (Provide `n_folds`, `test_size`, and `groups_col`)

    n_folds : int, optional (default=None for hold-out validation, 5 for the other cv methods)
        Number of splits/folds for methods that use folds. Default is 5

    test_size : float, optional
        The test size to use for holdout, shuffle-based methods, or group shuffle split

    y_array : pd.Series or array-like, optional
        The target variable. Required for stratified splits to ensure class balance in each fold

    groups_col : str, optional
        The name of the column in `df` that contains group labels. Required for group-based methods

    random_state : int, optional (default=None)
        The random state value for the data processing process (Ignored if 'shuffle' is set to False)

    shuffle: bool, (default=True)
        If True, the data will be shuffled before the model training process

    ml_task_type : str, optional
        The type of ML task. Options: "Regression" or "Classification"

        If you don't pass a value, the function won't accept a None value for cv_method, since it won't know the default cv method for your task

        If you pass a value, the default `cv_method` will be set based on the task type:
        - "Regression" => "kfold"
        - "Classification" => "stratified_kfold"

    logging_to_file : bool, optional
        Whether to log to file or not. Default is False

    Returns
    -------
    generator
        A generator that yields (train_index, test_index) for each split (a single-element list for holdout)
    """
    logger = get_logger(__name__, "PROD", logging_to_file)
    valid_methods = CROSS_VALIDATION_METHODS.get('all')

    cv_method = cross_validation_checker(
        df=df,
        cv_method=cv_method,
        n_folds=n_folds,
        test_size=test_size,
        groups_col=groups_col,
        available_cv_methods=valid_methods,
        ml_task_type=ml_task_type
    )

    if cv_method == 'holdout' and not test_size:
        test_size = 0.25

    if cv_method == 'holdout' and test_size and n_folds:
        logger.warning(f"Both 'n_folds' and 'test_size' provided for the {cv_method} validation method. Ignoring 'n_folds'")
        n_folds = None

    if cv_method == 'kfold' and test_size:
        logger.warning(f"Both 'n_folds' and 'test_size' provided for the {cv_method} method. Ignoring 'test_size'")
        test_size = None

    if cv_method != 'holdout' and not n_folds:
        n_folds = 5

    if cv_method in ["stratified_kfold", "stratified_shuffle_split"] and y_array is None:
        error_msg = "`y_array` must be provided for stratified methods"
        logger.error(error_msg)
        raise ValueError(error_msg)

    groups = df[groups_col].values if groups_col else None
    if groups is not None and cv_method not in ["group_kfold", "group_shuffle_split"]:
        logger.warning(f"'groups_col' provided even though 'cv_method' is {cv_method}. Ignoring 'groups_col'")
        groups = None

    if cv_method == "kfold":
        splitter = KFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle)
        return splitter.split(df)

    elif cv_method == "shuffle_split":
        splitter = ShuffleSplit(n_splits=n_folds, test_size=test_size, random_state=random_state)
        return splitter.split(df)

    elif cv_method == "stratified_shuffle_split":
        splitter = StratifiedShuffleSplit(n_splits=n_folds, test_size=test_size, random_state=random_state)
        return splitter.split(df, y_array)

    elif cv_method == "group_shuffle_split":
        splitter = GroupShuffleSplit(n_splits=n_folds, test_size=test_size, random_state=random_state)
        return splitter.split(df, groups=groups)

    elif cv_method == "stratified_kfold":
        splitter = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle)
        return splitter.split(df, y_array)

    elif cv_method == "group_kfold":
        splitter = GroupKFold(n_splits=n_folds)
        return splitter.split(df, groups=groups)

    elif cv_method == "holdout":
        train_index, test_index = train_test_split(
            df.index,
            test_size=test_size,
            shuffle=shuffle,
            random_state=random_state,
            stratify=y_array if cv_method == "stratified_kfold" else None
        )
        return [(train_index, test_index)]
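A hedged sketch of consuming `get_cv_splits()` directly (`df` is a placeholder frame):

```python
from flexml.helpers import get_cv_splits

# K-fold: iterate over (train_index, test_index) pairs
for train_idx, test_idx in get_cv_splits(df, cv_method="kfold", n_folds=5, random_state=42):
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

# Holdout: a single (train_index, test_index) pair wrapped in a list
(train_idx, test_idx), = get_cv_splits(df, cv_method="holdout", test_size=0.25)
```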
"group_kfold", {"n_splits": 3, "groups_col": "group"}), 31 | ("Classification", "group_shuffle_split", {"n_splits": 3, "test_size": 0.25, "groups_col": "group"}), 32 | ("Classification", "holdout", {"test_size": 0.25}), 33 | ("Regression", "kfold", {"n_splits": 3}), 34 | ("Regression", "shuffle_split", {"n_splits": 3, "test_size": 0.25}), 35 | ("Regression", "group_kfold", {"n_splits": 3, "groups_col": "group"}), 36 | ("Regression", "group_shuffle_split", {"n_splits": 3, "test_size": 0.2, "groups_col": "group"}), 37 | ("Regression", "holdout", {"test_size": 0.25}), 38 | 39 | # Edge cases 40 | ("Regression", "holdout", {"test_size": None, "n_splits": 3}), # holdout but no test_size given 41 | ("Regression", "holdout", {"test_size": 0.25, "n_splits": 3}), # holdout but n_splits given 42 | ("Regression", "kfold", {"test_size": 0.25, "n_splits": 3}), # kfold but test_size given 43 | ("Regression", "kfold", {"n_splits": None}), # kfold but no n_splits given 44 | ("Regression", "holdout", {"groups_col": "group"}) # not a group cross-validation but groups_col given 45 | ]) 46 | def test_cross_validation( 47 | self, 48 | ml_task_type: str, 49 | cv_method: Optional[str], 50 | params: dict 51 | ): 52 | target_col = "target" 53 | 54 | if ml_task_type == "Classification": 55 | df = self.classification_data.copy() 56 | 57 | # Skip Stratified methods if classes are not sufficiently populated 58 | has_sufficient_class_instances = not ("Stratified" in cv_method and (df["target"].value_counts() < 2).any()) 59 | self.assertTrue( 60 | has_sufficient_class_instances, 61 | f"{cv_method} couldn't be executed due to insufficient class instances, please take a look to data used for the test" 62 | ) 63 | 64 | experiment_object = Classification(df, target_col) 65 | 66 | else: # Classification 67 | self.assertNotIn( 68 | "Stratified", 69 | cv_method, 70 | f"Stratified methods are for Classification only. 
You've passed {cv_method} for Regression" 71 | ) 72 | 73 | df = self.regression_data.copy() 74 | experiment_object = Regression(df, target_col) 75 | 76 | experiment_object.start_experiment( 77 | experiment_size="wide", 78 | cv_method=cv_method, 79 | n_folds=params.get("n_splits"), 80 | test_size=params.get("test_size"), 81 | groups_col=params.get("groups_col") 82 | ) 83 | 84 | predictions = experiment_object.predict(df.drop(columns=[target_col]), full_train=False) 85 | self.assertIsInstance(predictions, np.ndarray) 86 | 87 | @parameterized.expand([ 88 | ("test_invalid_cv_method", "X", {}, ValueError), 89 | ("test_invalid_n_folds", "kfold", {"n_folds": 1}, ValueError), 90 | ("test_invalid_test_size", "holdout", {"test_size": 1.1}, ValueError), 91 | ("test_invalid_groups_col", "group_kfold", {"n_folds": 3, "groups_col": "X"}, ValueError), 92 | ("test_missing_groups_col_for_group_shuffle_split", "group_shuffle_split", {"n_folds": 3}, ValueError), 93 | ("test_missing_groups_col_for_group_kfold", "group_kfold", {"n_folds": 3, "test_size": 0.25}, ValueError), 94 | ("test_default_cv_for_classification", None, {"ml_task_type": "Classification"}, "stratified_kfold"), 95 | ("test_invalid_ml_task_type", "kfold", {"ml_task_type": "X"}, ValueError), 96 | ("test_normalize_stratified_kfold_name", "stratifiedkfold", {"ml_task_type": "Classification"}, "stratified_kfold") 97 | ]) 98 | def test_expected_results(self, test_name: str, cv_method: str, params: dict, expected_result: Union[str, Exception]): 99 | if isinstance(expected_result, type) and issubclass(expected_result, BaseException): # Your IDE might say 'code is not reachable' here, but Its 100 | with self.assertRaises(expected_result): 101 | cross_validation_checker( 102 | df=self.regression_data, 103 | cv_method=cv_method, 104 | **params 105 | ) 106 | else: 107 | result = cross_validation_checker( 108 | df=self.regression_data, 109 | cv_method=cv_method, 110 | **params 111 | ) 112 | self.assertEqual(result, expected_result) 113 | 114 | @parameterized.expand([ 115 | ("test_cv_with_none_nfolds", "kfold", {"n_folds": None}, GeneratorType), 116 | ("test_stratified_without_y_array", "stratified_kfold", {}, ValueError), 117 | ("test_holdout_returns_generator", "holdout", {"test_size": 0.25}, list) 118 | ]) 119 | def test_get_cv_splits(self, test_name: str, cv_method: str, params: dict, expected_result: Union[str, Exception]): 120 | if issubclass(expected_result, BaseException): # Your IDE might say 'code is not reachable' here, but Its 121 | with self.assertRaises(expected_result): 122 | get_cv_splits( 123 | df=self.regression_data, 124 | cv_method=cv_method, 125 | **params 126 | ) 127 | else: 128 | splits = get_cv_splits( 129 | df=self.regression_data, 130 | cv_method=cv_method, 131 | **params 132 | ) 133 | self.assertIsInstance(splits, expected_result) -------------------------------------------------------------------------------- /flexml/helpers/supervised_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from typing import Union 4 | 5 | from sklearn.metrics import ( 6 | r2_score, 7 | mean_absolute_error, 8 | mean_squared_error, 9 | accuracy_score, 10 | precision_score, 11 | recall_score, 12 | f1_score, 13 | roc_auc_score) 14 | 15 | 16 | def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float: 17 | """ 18 | Computes the Mean Absolute Percentage Error (MAPE) while ignoring zero values in y_true since MAPE is 
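A small sketch of the validator these tests exercise; it normalizes and validates a cv method name for a task (usage mirrors the tests above, `df` is a placeholder frame):

```python
from flexml.helpers import cross_validation_checker

# Returns the canonical method name, or raises ValueError for invalid input
method = cross_validation_checker(
    df=df,
    cv_method="stratifiedkfold",   # normalized to 'stratified_kfold'
    ml_task_type="Classification",
)
```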
--------------------------------------------------------------------------------
/flexml/helpers/supervised_helpers.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from typing import Union

from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score)


def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float:
    """
    Computes the Mean Absolute Percentage Error (MAPE) while ignoring zero values in y_true, since MAPE is undefined for zero values.

    Parameters
    ----------
    y_true : pd.Series or np.ndarray
        The actual values of the target column

    y_pred : pd.Series or np.ndarray
        The predicted values of the target column

    Returns
    -------
    float
        The MAPE score
    """
    mask = y_true != 0  # Ignore zero values in y_true
    return round(np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])), 6)

def _evaluate_preds(
    y_true: Union[pd.Series, np.ndarray],
    y_pred: Union[pd.Series, np.ndarray],
    eval_metric: str,
    average: str = 'macro'
) -> float:
    """
    Evaluates the model with the given evaluation metric by using the test set

    Parameters
    ----------
    y_true : pd.Series or np.ndarray
        The actual values of the target column

    y_pred : pd.Series or np.ndarray
        The predicted values/probabilities of the target column

    eval_metric : str
        The evaluation metric that will be used to evaluate the model

        - Available evaluation metrics for Regression:
            - R2, MAE, MSE, RMSE, MAPE

        - Available evaluation metrics for Classification:
            - Accuracy, Precision, Recall, F1 Score, ROC-AUC

    average : str, default='macro'
        The averaging method to use for multiclass classification metrics.
        Options are ['binary', 'micro', 'macro', 'weighted'].
        For binary classification, 'binary' is recommended.
        For multiclass, 'macro' treats all classes equally.

    Returns
    -------
    float
        The evaluation metric score for the desired eval metric
    """
    if eval_metric == 'R2':
        return round(r2_score(y_true, y_pred), 6)
    elif eval_metric == 'MAE':
        return round(mean_absolute_error(y_true, y_pred), 6)
    elif eval_metric == 'MSE':
        return round(mean_squared_error(y_true, y_pred), 6)
    elif eval_metric == 'RMSE':
        return round(np.sqrt(mean_squared_error(y_true, y_pred)), 6)
    elif eval_metric == 'MAPE':
        return _safe_mape(y_true, y_pred)
    elif eval_metric == 'Accuracy':
        return round(accuracy_score(y_true, y_pred), 6)
    elif eval_metric == 'Precision':
        return round(precision_score(y_true, y_pred, average=average), 6)
    elif eval_metric == 'Recall':
        return round(recall_score(y_true, y_pred, average=average), 6)
    elif eval_metric == 'F1 Score':
        return round(f1_score(y_true, y_pred, average=average), 6)
    elif eval_metric == 'ROC-AUC':
        if len(y_pred.shape) > 1:  # If probabilities are returned
            if y_pred.shape[1] >= 3:  # If there are 3 or more classes
                return round(roc_auc_score(y_true, y_pred, average=average, multi_class='ovr'), 6)
            elif y_pred.shape[1] == 2:  # If there are 2 classes
                return round(roc_auc_score(y_true, y_pred[:, 1]), 6)
        else:  # If class labels are returned, ROC-AUC is not applicable (some models don't have a predict_proba method)
            return -1.0
    else:
        raise ValueError(f"Error while evaluating the current model. The eval_metric should be one of the following: 'R2', 'MAE', 'MSE', 'RMSE', 'MAPE', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'. Got {eval_metric}")

def evaluate_model_perf(
    ml_task_type,
    y_test,
    y_pred
) -> dict:
    """
    Evaluates how good the predictions are by comparing them with the actual values, and returns the evaluation scores for the current task

    Parameters
    ----------
    ml_task_type : str
        The type of the machine learning task. It can be either 'Regression' or 'Classification'

    y_test : np.ndarray
        The actual values of the target column.

    y_pred : np.ndarray
        For regression tasks: The predicted values of the target column.
        For classification tasks: The predicted probabilities for each class.
        Note: Some models like Perceptron, PassiveAggressiveClassifier, etc. don't have a predict_proba method, so they return class labels directly.

    Returns
    -------
    dict
        A dictionary containing the evaluation metrics of the current task

        * R2, MAE, MSE, RMSE, MAPE for Regression tasks

        * Accuracy, Precision, Recall, F1 Score, ROC-AUC for Classification tasks
    """

    if ml_task_type == "Regression":
        r2 = _evaluate_preds(y_test, y_pred, 'R2')
        mae = _evaluate_preds(y_test, y_pred, 'MAE')
        mse = _evaluate_preds(y_test, y_pred, 'MSE')
        rmse = _evaluate_preds(y_test, y_pred, 'RMSE')
        mape = _evaluate_preds(y_test, y_pred, 'MAPE')
        return {
            "R2": r2,
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "MAPE": mape
        }

    else:  # Classification
        # Convert probabilities to class labels for all metrics except ROC-AUC if y_pred contains probabilities
        if len(y_pred.shape) > 1:
            y_pred_labels = np.argmax(y_pred, axis=1)
        else:
            y_pred_labels = (y_pred > 0.5).astype(int)

        # Determine the appropriate averaging method based on the number of classes
        n_classes = len(np.unique(y_test))
        avg_method = 'binary' if n_classes == 2 else 'macro'

        # Use labels for the standard classification metrics
        accuracy = _evaluate_preds(y_test, y_pred_labels, 'Accuracy')
        precision = _evaluate_preds(y_test, y_pred_labels, 'Precision', average=avg_method)
        recall = _evaluate_preds(y_test, y_pred_labels, 'Recall', average=avg_method)
        f1 = _evaluate_preds(y_test, y_pred_labels, 'F1 Score', average=avg_method)

        # Use probabilities for ROC-AUC
        roc_auc = _evaluate_preds(y_test, y_pred, 'ROC-AUC', average=avg_method)

        return {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC-AUC": roc_auc
        }
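A tiny sketch of calling the scorer above directly (toy arrays, illustrative only):

```python
import numpy as np
from flexml.helpers import evaluate_model_perf

y_test = np.array([3.1, 2.4, 5.0])
y_pred = np.array([3.0, 2.5, 4.8])

scores = evaluate_model_perf("Regression", y_test, y_pred)
print(scores["RMSE"], scores["MAPE"])  # the dict also holds R2, MAE and MSE
```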
np.random.choice(['High', 'Medium', 'Low'], n_rows),
24 | 'score': np.random.randint(0, 100, n_rows),
25 | 'amount': np.random.uniform(10, 1000, n_rows),
26 | 'target': np.random.choice([0, 1], n_rows)
27 | })
28 | 
29 | # Create artificial null values within the dataframe
30 | for column in df.columns:
31 | if column not in ['id', 'target']:
32 | mask = np.random.random(n_rows) < 0.2
33 | df.loc[mask, column] = np.nan
34 | 
35 | encoding_methods = ['label_encoder', 'onehot_encoder', 'ordinal_encoder']
36 | imputation_methods = ['mean', 'median', 'mode', 'constant', 'drop']
37 | normalization_methods = ['standard_scaler', 'minmax_scaler', 'robust_scaler', 'quantile_transformer', 'maxabs_scaler', 'normalize_scaler']
38 | 
39 | def test_feature_engineering_with_inputs(self):
40 | """
41 | End-to-end test for the FeatureEngineering pipeline with explicit user inputs
42 | """
43 | feature_exp = FeatureEngineering(
44 | self.df,
45 | target_col='target',
46 | drop_columns=['id'],
47 | column_imputation_map={'status': 'constant','amount': 'constant'},
48 | categorical_imputation_constant='test_constant',
49 | numerical_imputation_constant=-1,
50 | encoding_method_map={'category_default': 'ordinal_encoder', 'priority': 'onehot_encoder'},
51 | ordinal_encode_map={'category_default': ['A', 'C', 'B']},
52 | onehot_limit=3,
53 | normalize='normalize_scaler'
54 | )
55 | 
56 | feature_exp.setup()
57 | 
58 | X_train, y_train = feature_exp.fit_transform()
59 | lr = LogisticRegression(max_iter=500).fit(X_train, y_train)
60 | 
61 | # Check if all columns are numerical, including target
62 | self.assertFalse(
63 | X_train.select_dtypes(exclude=[np.number]).columns.tolist(),
64 | "Not all columns are numerical"
65 | )
66 | 
67 | # Check if there are any null values
68 | self.assertFalse(
69 | X_train.isnull().any().any(),
70 | "There are null values in the processed data"
71 | )
72 | 
73 | def test_feature_engineering_without_inputs(self):
74 | """
75 | End-to-end test for the FeatureEngineering pipeline with default settings
76 | """
77 | feature_exp = FeatureEngineering(self.df, target_col='target')
78 | feature_exp.setup()
79 | 
80 | X_train, y_train = feature_exp.fit_transform()
81 | lr = LogisticRegression(max_iter=500).fit(X_train, y_train)
82 | 
83 | # Check if all columns are numerical, including target
84 | self.assertFalse(
85 | X_train.select_dtypes(exclude=[np.number]).columns.tolist(),
86 | "Not all columns are numerical"
87 | )
88 | 
89 | # Check if there are any null values
90 | self.assertFalse(
91 | X_train.isnull().any().any(),
92 | "There are null values in the processed data"
93 | )
94 | 
95 | def test_feature_engineering_with_dynamic_inputs(self):
96 | """
97 | Dynamic end-to-end test for the FeatureEngineering pipeline across all encoding, imputation, and normalization methods
98 | """
99 | # Nested loops for encoding, imputation, and normalization methods
100 | for encoding_method in self.encoding_methods:
101 | for imputation_method in self.imputation_methods:
102 | for normalization_method in self.normalization_methods:
103 | encoding_method_map = {'category_default': encoding_method, 'priority': encoding_method}
104 | ordinal_encode_map = None
105 | 
106 | # Handle specific cases for encoding methods
107 | if encoding_method == 'ordinal_encoder':
108 | ordinal_encode_map = {'priority': ['Low', 'Medium', 'High'], 'category_default':['A','C','B']}
109 | 
110 | # Distinguish between categorical and numerical imputation methods
111 | if imputation_method in ['mode', 'constant', 'drop']:
112 | column_imputation_map = 
{'status': imputation_method, 'amount': 'mean'} 113 | elif imputation_method in ['mean', 'median']: 114 | column_imputation_map = {'status': 'mode', 'amount': imputation_method} 115 | 116 | with self.subTest(encoding_method=encoding_method, imputation_method=imputation_method, normalization_method=normalization_method): 117 | feature_test = FeatureEngineering( 118 | data=self.df, 119 | target_col='target', 120 | drop_columns=['id'], 121 | column_imputation_map=column_imputation_map, 122 | categorical_imputation_constant='test_constant', 123 | numerical_imputation_constant=-1, 124 | encoding_method_map=encoding_method_map, 125 | ordinal_encode_map=ordinal_encode_map, 126 | onehot_limit=3, 127 | normalize=normalization_method 128 | ) 129 | feature_test.setup() 130 | 131 | X_train, y_train = feature_test.fit_transform() 132 | lr = LogisticRegression(max_iter=500).fit(X_train, y_train) 133 | 134 | # Check if all columns are numerical, including target 135 | self.assertFalse( 136 | X_train.select_dtypes(exclude=[np.number]).columns.tolist(), 137 | f"Not all columns are numerical. Failed parameters are: " 138 | f"Encoding method: {encoding_method}, " 139 | f"Imputation method: {imputation_method}, " 140 | f"Normalization method: {normalization_method}" 141 | ) 142 | 143 | # Check if there are any null values 144 | self.assertFalse( 145 | X_train.isnull().any().any(), 146 | f"There are null values in the processed data. Failed parameters are: " 147 | f"Encoding method: {encoding_method}, " 148 | f"Imputation method: {imputation_method}, " 149 | f"Normalization method: {normalization_method}" 150 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. -------------------------------------------------------------------------------- /tests/test_supervised.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import unittest 4 | import numpy as np 5 | from parameterized import parameterized 6 | from sklearn.datasets import load_diabetes, load_breast_cancer, load_iris 7 | from flexml import Regression, Classification 8 | from flexml.logger import get_logger 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | 12 | 13 | class TestRegression(unittest.TestCase): 14 | logger = get_logger(__name__, "TEST") 15 | logger.setLevel("DEBUG") 16 | 17 | test_config = { 18 | 'Regression': { 19 | 'data': load_diabetes(as_frame=True)['frame'], 20 | 'target_col': 'target', 21 | 'exp_obj': None 22 | }, 23 | 'BinaryClassification': { 24 | 'data': load_breast_cancer(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'No', 1: 'Yes'})), 25 | 'target_col': 'target', 26 | 'exp_obj': None 27 | }, 28 | 'MulticlassClassification': { 29 | 'data': load_iris(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'Iris-Setosa', 1: 'Iris-Versicolor', 2: 'Iris-Virginica'})), 30 | 'target_col': 'target', 31 | 'exp_obj': None 32 | } 33 | } 34 | 35 | n_folds = 3 36 | 37 | @parameterized.expand(list(test_config.keys())) 38 | def test_01_supervised(self, objective: str): 39 | df = self.test_config[objective].get('data') 40 | target_col = self.test_config[objective].get('target_col') 41 | exp_size = "wide" 42 | 43 | if objective == 'Regression': 44 | exp_obj = Regression( 45 | data = df, 46 | target_col = target_col 47 | ) 48 | else: # BinaryClassification or MulticlassClassification 49 | exp_obj = Classification( 50 | data = df, 51 | target_col = target_col 52 | ) 53 | 54 | exp_obj.start_experiment( 55 | 
experiment_size = exp_size,
56 | n_folds = self.n_folds,
57 | eval_metric = "RMSE" if objective == "Regression" else "Accuracy"
58 | )
59 | 
60 | top_x_models = exp_obj.get_best_models(top_n_models = 3)
61 | self.assertEqual(
62 | len(top_x_models), 3,
63 | f"An error occurred while retrieving the best models in {exp_size} {objective}, expected 3, got {len(top_x_models)}"
64 | )
65 | 
66 | exp_obj.show_model_stats()
67 | 
68 | tuning_methods = ["grid_search", "randomized_search", "optuna"]
69 | for method in tuning_methods:
70 | if method == "grid_search":
71 | model = "LGBMRegressor" if objective == "Regression" else "LGBMClassifier"
72 | param_grid = {
73 | "n_estimators": [100, 200],
74 | "max_depth": [3, 5],
75 | "learning_rate": [0.5, 0.1]
76 | }
77 | exp_obj.tune_model(model=model, tuning_method=method, param_grid=param_grid, n_folds=self.n_folds, n_iter=3)
78 | else:
79 | exp_obj.tune_model(tuning_method=method, n_folds=self.n_folds, n_iter=3)
80 | self.assertIsNotNone(exp_obj.tuned_model, f"An error occurred while tuning the model with {method} in {exp_size} {objective}, tuned model is None")
81 | self.assertIsNotNone(exp_obj.tuned_model_score, f"An error occurred while calculating the tuned model's score with {method} in {exp_size} {objective}, tuned model score is None")
82 | 
83 | # Save experiment objects to config
84 | self.test_config[objective]['exp_obj'] = exp_obj
85 | 
86 | def test_02_save_regression_model(self):
87 | exp_obj = self.test_config['Regression']['exp_obj']
88 | 
89 | # Test saving model with full_train=True and model_only=True (only the model object, not a pipeline)
90 | save_path = "test_regression_model_full_train_model_only.pkl"
91 | exp_obj.save_model(save_path=save_path, full_train=True, model_only=True)
92 | self.assertTrue(os.path.exists(save_path))
93 | 
94 | # Load the saved model and check if it's the model object (not a pipeline)
95 | with open(save_path, 'rb') as f:
96 | saved_model = pickle.load(f)
97 | self.assertFalse(hasattr(saved_model, 'named_steps'))
98 | os.remove(save_path) # Clean up saved model
99 | 
100 | # Test saving model with full_train=False and model_only=False (should return a pipeline)
101 | save_path = "test_regression_model_no_full_train_model_only_false.pkl"
102 | exp_obj.save_model(save_path=save_path, full_train=False, model_only=False)
103 | self.assertTrue(os.path.exists(save_path))
104 | 
105 | # Load the saved model and check if it's a pipeline
106 | with open(save_path, 'rb') as f:
107 | saved_model = pickle.load(f)
108 | self.assertTrue(hasattr(saved_model, 'named_steps'))
109 | os.remove(save_path) # Clean up saved model
110 | 
111 | def test_03_save_binary_classification_model(self):
112 | exp_obj = self.test_config['BinaryClassification']['exp_obj']
113 | 
114 | # Test saving model with full_train=True and model_only=True (only the model object, not a pipeline)
115 | save_path = "test_binary_classification_model_full_train_model_only.pkl"
116 | exp_obj.save_model(save_path=save_path, full_train=True, model_only=True)
117 | self.assertTrue(os.path.exists(save_path))
118 | 
119 | # Load the saved model and check if it's the model object (not a pipeline)
120 | with open(save_path, 'rb') as f:
121 | saved_model = pickle.load(f)
122 | self.assertFalse(hasattr(saved_model, 'named_steps'))
123 | os.remove(save_path) # Clean up saved model
124 | 
125 | # Test saving model with full_train=False and model_only=False (should return a pipeline)
126 | save_path = "test_binary_classification_model_no_full_train_model_only_false.pkl"
127 | 
exp_obj.save_model(save_path=save_path, full_train=False, model_only=False) 128 | self.assertTrue(os.path.exists(save_path)) 129 | 130 | # Load the saved model and check if it's a pipeline 131 | with open(save_path, 'rb') as f: 132 | saved_model = pickle.load(f) 133 | self.assertTrue(hasattr(saved_model, 'named_steps')) 134 | os.remove(save_path) # Clean up saved model 135 | 136 | def test_04_save_multiclass_classification_model(self): 137 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 138 | 139 | # Test saving model with full_train=True and model_only=True (only the model object, not a pipeline) 140 | save_path = "test_multiclass_classification_model_full_train_model_only.pkl" 141 | exp_obj.save_model(save_path=save_path, full_train=True, model_only=True) 142 | self.assertTrue(os.path.exists(save_path)) 143 | 144 | # Load the saved model and check if it's the model object (not a pipeline) 145 | with open(save_path, 'rb') as f: 146 | saved_model = pickle.load(f) 147 | self.assertFalse(hasattr(saved_model, 'named_steps')) 148 | os.remove(save_path) # Clean up saved model 149 | 150 | # Test saving model with full_train=False and model_only=False (should return a pipeline) 151 | save_path = "test_multiclass_classification_model_no_full_train_model_only_false.pkl" 152 | exp_obj.save_model(save_path=save_path, full_train=False, model_only=False) 153 | self.assertTrue(os.path.exists(save_path)) 154 | 155 | # Load the saved model and check if it's a pipeline 156 | with open(save_path, 'rb') as f: 157 | saved_model = pickle.load(f) 158 | self.assertTrue(hasattr(saved_model, 'named_steps')) 159 | os.remove(save_path) # Clean up saved model 160 | 161 | def test_05_predict_model_regression(self): 162 | # Test regression predictions 163 | exp_obj = self.test_config['Regression']['exp_obj'] 164 | test_data = self.test_config['Regression'].get('data').drop(columns=['target']) 165 | 166 | predictions = exp_obj.predict( 167 | test_data=test_data, 168 | model=exp_obj.get_model_by_name("LGBMRegressor"), 169 | full_train=True, 170 | ) 171 | self.assertIsInstance(predictions, np.ndarray) 172 | 173 | def test_06_predict_model_binary_classification(self): 174 | # Test binary classification predictions 175 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 176 | test_data = self.test_config['BinaryClassification'].get('data').drop(columns=['target']) 177 | 178 | predictions = exp_obj.predict(test_data, full_train=False) 179 | predictions_probabilities = exp_obj.predict_proba(test_data, full_train=False) 180 | self.assertIsInstance(predictions, np.ndarray) 181 | self.assertIsInstance(predictions_probabilities, np.ndarray) 182 | self.assertEqual(predictions_probabilities.shape[1], 2) # Binary classification should have 2 probability columns 183 | 184 | def test_07_predict_model_multiclass(self): 185 | # Test multiclass classification predictions 186 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 187 | test_data = self.test_config['MulticlassClassification'].get('data').drop(columns=['target']) 188 | 189 | predictions = exp_obj.predict(test_data, full_train=False) 190 | predictions_probabilities = exp_obj.predict_proba(test_data, full_train=False) 191 | self.assertIsInstance(predictions, np.ndarray) 192 | self.assertIsInstance(predictions_probabilities, np.ndarray) 193 | self.assertEqual(predictions_probabilities.shape[1], 3) # Iris has 3 classes 194 | 195 | def test_08_plot_regression_feature_importance(self): 196 | exp_obj = 
self.test_config['Regression']['exp_obj'] 197 | exp_obj.plot("CatBoostRegressor", kind="feature_importance") 198 | 199 | def test_09_plot_binary_classification_feature_importance(self): 200 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 201 | exp_obj.plot("XGBClassifier", kind="feature_importance") 202 | 203 | def test_10_plot_multiclass_classification_feature_importance(self): 204 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 205 | exp_obj.plot("LogisticRegression", kind="feature_importance") 206 | 207 | def test_11_plot_regression_residuals(self): 208 | exp_obj = self.test_config['Regression']['exp_obj'] 209 | exp_obj.plot("LinearRegression", kind="residuals") 210 | 211 | def test_12_plot_regression_prediction_error(self): 212 | exp_obj = self.test_config['Regression']['exp_obj'] 213 | exp_obj.plot("LGBMRegressor", kind="prediction_error") 214 | 215 | def test_13_plot_regression_shap_summary(self): 216 | exp_obj = self.test_config['Regression']['exp_obj'] 217 | exp_obj.plot("XGBRegressor", kind="shap_summary") 218 | 219 | def test_14_plot_regression_shap_violin(self): 220 | exp_obj = self.test_config['Regression']['exp_obj'] 221 | exp_obj.plot("RandomForestRegressor", kind="shap_violin") 222 | 223 | def test_15_plot_binary_classification_confusion_matrix(self): 224 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 225 | exp_obj.plot("LogisticRegression", kind="confusion_matrix") 226 | 227 | def test_16_plot_binary_classification_roc_curve(self): 228 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 229 | exp_obj.plot("RandomForestClassifier", kind="roc_curve") 230 | 231 | def test_17_plot_binary_classification_calibration_uniform(self): 232 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 233 | exp_obj.plot("XGBClassifier", kind="calibration_curve", strategy='uniform', n_bins=10) 234 | 235 | def test_18_plot_binary_classification_calibration_quantile(self): 236 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 237 | exp_obj.plot("LGBMClassifier", kind="calibration_curve", strategy='quantile', n_bins=8) 238 | 239 | def test_19_plot_binary_classification_shap_summary(self): 240 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 241 | exp_obj.plot("CatBoostClassifier", kind="shap_summary") 242 | 243 | def test_20_plot_binary_classification_shap_violin(self): 244 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 245 | exp_obj.plot("XGBClassifier", kind="shap_violin") 246 | 247 | def test_21_plot_multiclass_classification_confusion_matrix(self): 248 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 249 | exp_obj.plot("RandomForestClassifier", kind="confusion_matrix") 250 | 251 | def test_22_plot_multiclass_classification_roc_curve(self): 252 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 253 | exp_obj.plot("LogisticRegression", kind="roc_curve") 254 | 255 | def test_23_plot_multiclass_calibration_uniform(self): 256 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 257 | exp_obj.plot("XGBClassifier", kind="calibration_curve", strategy='uniform', n_bins=10) 258 | 259 | def test_24_plot_multiclass_calibration_quantile(self): 260 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 261 | exp_obj.plot("CatBoostClassifier", kind="calibration_curve", strategy='quantile', n_bins=12) 262 | 263 | def test_25_plot_multiclass_classification_shap_summary(self): 264 | exp_obj = 
self.test_config['MulticlassClassification']['exp_obj'] 265 | exp_obj.plot("LGBMClassifier", kind="shap_summary") 266 | 267 | def test_26_plot_multiclass_classification_shap_violin(self): 268 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 269 | exp_obj.plot("RandomForestClassifier", kind="shap_violin") -------------------------------------------------------------------------------- /flexml/helpers/plot_model_graphs.py: -------------------------------------------------------------------------------- 1 | import plotly.graph_objects as go 2 | import numpy as np 3 | import shap 4 | from typing import Union, Optional, Dict 5 | from sklearn.metrics import confusion_matrix 6 | from sklearn.metrics import roc_curve, auc 7 | from yellowbrick.regressor import ResidualsPlot, PredictionError 8 | 9 | 10 | def plot_feature_importance( 11 | model: object, 12 | feature_names: list[str], 13 | top_x_features: int = 20, 14 | width: int = 800, 15 | height: int = 600, 16 | ) -> Union[go.Figure, str]: 17 | """ 18 | Create a plotly figure showing feature importance for a given model 19 | 20 | Parameters 21 | ---------- 22 | model: object 23 | Machine learning model to display its feature importance 24 | 25 | feature_names: list[str] 26 | List of feature names to display in the plot 27 | 28 | top_x_features: int (default = 20), optional 29 | Number of top features to display in the plot 30 | 31 | width: int (default = 800), optional 32 | Width of the plot 33 | 34 | height: int (default = 600), optional 35 | Height of the plot 36 | 37 | Returns 38 | ------- 39 | plotly.graph_objects.Figure or str 40 | A plotly figure object containing the feature importance visualization, 41 | or an error message if an error occurs during the process. 42 | """ 43 | try: 44 | model_name = model.__class__.__name__ 45 | importance = None 46 | 47 | # Check if the model has 'feature_importances_' attribute (tree-based models) 48 | if hasattr(model, 'feature_importances_'): 49 | importance = model.feature_importances_ 50 | 51 | # Check if the model has coefficients (linear models) 52 | elif hasattr(model, 'coef_'): 53 | importance = np.abs(model.coef_) 54 | if importance.ndim > 1: # Handle multi-output models (e.g., LogisticRegression with multiple classes) 55 | importance = np.mean(importance, axis=0) 56 | 57 | if importance is not None and len(importance) == len(feature_names): 58 | indices = np.argsort(importance)[::-1] # Sort in descending order 59 | 60 | # Limit to top 20 features 61 | indices = indices[:top_x_features] 62 | sorted_importance = importance[indices] 63 | sorted_features = np.array(feature_names)[indices] 64 | 65 | fig = go.Figure() 66 | fig.add_trace(go.Bar( 67 | y=sorted_features, 68 | x=sorted_importance, 69 | orientation='h', 70 | marker=dict( 71 | color=sorted_importance, 72 | colorscale='Viridis' 73 | ) 74 | )) 75 | 76 | fig.update_layout( 77 | title=f"Feature Importance for {model_name} (Top {top_x_features} Features)", 78 | xaxis_title="Importance", 79 | yaxis_title="Features", 80 | height=height, 81 | width=width, 82 | yaxis=dict(autorange="reversed") 83 | ) 84 | 85 | return fig 86 | else: 87 | return f"Feature importance is not available or mismatched for {model_name}" 88 | 89 | except Exception as e: 90 | return f"Could not calculate feature importance for the model {model_name}. 
Error: {e}" 91 | 92 | 93 | def plot_confusion_matrix( 94 | y_true: np.array, 95 | y_pred: np.array, 96 | class_mapping: dict = None, 97 | width: int = 800, 98 | height: int = 600 99 | ) -> Union[go.Figure, str]: 100 | """ 101 | Create a plotly figure showing confusion matrix. 102 | 103 | Parameters 104 | ---------- 105 | y_true : np.array 106 | Array of true (correct) labels 107 | 108 | y_pred : np.array 109 | Array of predicted labels 110 | 111 | class_mapping : dict, optional 112 | Dictionary mapping encoded values to class labels (e.g., {0: 'male', 1: 'female'}) 113 | 114 | width: int (default = 800), optional 115 | Width of the plot 116 | 117 | height: int (default = 600), optional 118 | Height of the plot 119 | 120 | Returns 121 | ------- 122 | plotly.graph_objects.Figure or str 123 | A plotly figure object containing the confusion matrix visualization, 124 | or an error message if an error occurs during the process. 125 | """ 126 | try: 127 | cm = confusion_matrix(y_true, y_pred) 128 | 129 | # Convert class indices to labels using the provided mapping 130 | class_names = [class_mapping[i] for i in range(cm.shape[0])] if class_mapping else list(range(cm.shape[0])) 131 | 132 | fig = go.Figure(data=go.Heatmap( 133 | z=cm, 134 | x=class_names, 135 | y=class_names, 136 | colorscale='Viridis', 137 | text=cm, 138 | texttemplate="%{text}", 139 | textfont={"size": 16}, 140 | hoverongaps=False)) 141 | 142 | fig.update_layout( 143 | title='Confusion Matrix', 144 | xaxis_title='Predicted label', 145 | yaxis_title='True label', 146 | yaxis=dict(autorange="reversed"), 147 | width=width, 148 | height=height 149 | ) 150 | 151 | return fig 152 | except Exception as e: 153 | return f"Error creating confusion matrix plot: {str(e)}" 154 | 155 | 156 | def plot_roc_curve( 157 | y_true: np.array, 158 | y_prob: np.array, 159 | class_names: list = None, 160 | width: int = 800, 161 | height: int = 600 162 | ) -> Union[go.Figure, str]: 163 | """ 164 | Create a plotly figure showing ROC curve. 165 | 166 | Parameters 167 | ---------- 168 | y_true : np.array 169 | Array of true (correct) labels 170 | 171 | y_prob : np.array 172 | Array of predicted probabilities 173 | 174 | class_names : list, optional 175 | List of class names for multiple classes 176 | 177 | width: int (default = 800), optional 178 | Width of the plot 179 | 180 | height: int (default = 600), optional 181 | Height of the plot 182 | 183 | Returns 184 | ------- 185 | plotly.graph_objects.Figure or str 186 | A plotly figure object containing the ROC curve visualization, 187 | or an error message if an error occurs during the process. 
188 | """ 189 | try: 190 | fig = go.Figure() 191 | 192 | # Handle binary classification 193 | if y_prob.ndim == 1 or y_prob.shape[1] == 2: 194 | if y_prob.ndim == 2: 195 | y_prob = y_prob[:, 1] 196 | fpr, tpr, _ = roc_curve(y_true, y_prob) 197 | auc_score = auc(fpr, tpr) 198 | 199 | fig.add_trace(go.Scatter( 200 | x=fpr, y=tpr, 201 | name=f'ROC curve (AUC = {auc_score:.3f})', 202 | mode='lines' 203 | )) 204 | 205 | # Handle multi-class 206 | else: 207 | if class_names is None: 208 | class_names = [f'Class {i}' for i in range(y_prob.shape[1])] 209 | 210 | for i in range(y_prob.shape[1]): 211 | fpr, tpr, _ = roc_curve(y_true == i, y_prob[:, i]) 212 | auc_score = auc(fpr, tpr) 213 | 214 | fig.add_trace(go.Scatter( 215 | x=fpr, y=tpr, 216 | name=f'{class_names[i]} (AUC = {auc_score:.3f})', 217 | mode='lines' 218 | )) 219 | 220 | fig.add_trace(go.Scatter( 221 | x=[0, 1], y=[0, 1], 222 | name='Random', 223 | mode='lines', 224 | line=dict(dash='dash', color='gray') 225 | )) 226 | 227 | fig.update_layout( 228 | title='Receiver Operating Characteristic (ROC) Curve', 229 | xaxis_title='False Positive Rate', 230 | yaxis_title='True Positive Rate', 231 | width=width, 232 | height=height, 233 | showlegend=True 234 | ) 235 | 236 | return fig 237 | except Exception as e: 238 | return f"Error creating ROC curve plot: {str(e)}" 239 | 240 | 241 | def plot_calibration_curve( 242 | y_true: np.array, 243 | y_prob: np.array, 244 | class_mapping: Optional[Dict[int, str]] = None, 245 | n_bins: int = 10, 246 | strategy: str = 'uniform', 247 | width: int = 800, 248 | height: int = 600, 249 | ) -> Union[go.Figure, str]: 250 | """ 251 | Create a plotly figure showing probability calibration curve. 252 | 253 | Parameters 254 | ---------- 255 | y_true : np.array 256 | True labels (binary or multiclass) 257 | 258 | y_prob : np.array 259 | Predicted probabilities (shape [n_samples, n_classes] for multiclass) 260 | 261 | n_bins : int (default = 10), optional 262 | Number of bins to discretize the [0, 1] interval 263 | 264 | strategy : {'uniform', 'quantile'} (default = 'uniform'), optional 265 | Strategy used to define the widths of the bins 266 | 267 | width: int (default = 800), optional 268 | Width of the plot 269 | 270 | height: int (default = 600), optional 271 | Height of the plot 272 | 273 | class_mapping: Dict[int, str] (default = None), optional 274 | Dictionary mapping class indices to class names 275 | 276 | Returns 277 | ------- 278 | plotly.graph_objects.Figure or str 279 | A plotly figure object containing the calibration curve visualization, 280 | or an error message if an error occurs during the process. 
281 | """ 282 | try: 283 | from sklearn.calibration import calibration_curve 284 | from sklearn.preprocessing import LabelBinarizer 285 | 286 | fig = go.Figure() 287 | 288 | # Handle binary classification 289 | if y_prob.ndim == 1 or y_prob.shape[1] == 2: 290 | if y_prob.ndim == 2: 291 | y_prob = y_prob[:, 1] 292 | 293 | prob_true, prob_pred = calibration_curve(y_true, y_prob, 294 | n_bins=n_bins, 295 | strategy=strategy) 296 | 297 | class_name = class_mapping.get(1, 'Positive Class') if class_mapping else 'Calibration Curve' 298 | fig.add_trace(go.Scatter( 299 | x=prob_pred, 300 | y=prob_true, 301 | name=class_name, 302 | mode='lines+markers', 303 | marker=dict(size=8) 304 | )) 305 | 306 | # Handle multiclass using one-vs-rest approach 307 | else: 308 | lb = LabelBinarizer().fit(y_true) 309 | y_onehot = lb.transform(y_true) 310 | 311 | for class_idx in range(y_prob.shape[1]): 312 | prob_true, prob_pred = calibration_curve(y_onehot[:, class_idx], 313 | y_prob[:, class_idx], 314 | n_bins=n_bins, 315 | strategy=strategy) 316 | 317 | class_name = class_mapping.get(class_idx, f'Class {class_idx}') if class_mapping else f'Class {class_idx}' 318 | 319 | # Apply class mapping here 320 | if class_mapping and class_idx in class_mapping: 321 | class_name = class_mapping[class_idx] 322 | 323 | fig.add_trace(go.Scatter( 324 | x=prob_pred, 325 | y=prob_true, 326 | name=class_name, 327 | mode='lines+markers', 328 | marker=dict(size=8) 329 | )) 330 | 331 | # Add perfect calibration line 332 | fig.add_trace(go.Scatter( 333 | x=[0, 1], 334 | y=[0, 1], 335 | name='Perfect Calibration', 336 | line=dict(dash='dash', color='gray'), 337 | mode='lines' 338 | )) 339 | 340 | fig.update_layout( 341 | title='Calibration Curve (Reliability Diagram)', 342 | xaxis_title='Mean Predicted Probability', 343 | yaxis_title='Fraction of Positives', 344 | width=width, 345 | height=height, 346 | showlegend=True, 347 | legend=dict(x=0.7, y=0.1), 348 | xaxis=dict(range=[0, 1]), 349 | yaxis=dict(range=[0, 1]) 350 | ) 351 | 352 | return fig 353 | 354 | except Exception as e: 355 | return f"Error creating calibration curve plot: {str(e)}" 356 | 357 | 358 | def plot_shap( 359 | model: object, 360 | X_test: np.array, 361 | shap_type: str = 'shap_summary' 362 | ) -> Union[go.Figure, str]: 363 | """ 364 | Create a plotly figure showing SHAP values visualization. 365 | 366 | Parameters 367 | ---------- 368 | model : object 369 | Trained model 370 | 371 | X_test : np.array 372 | Feature data for explanation 373 | 374 | shap_type : str 375 | Type of SHAP plot to generate: 376 | - 'shap_summary': shap.summary_plot 377 | - 'shap_violin': shap.plots.violin 378 | 379 | Returns 380 | ------- 381 | plotly.graph_objects.Figure or str 382 | A plotly figure object containing the SHAP values visualization, 383 | or an error message if an error occurs during the process. 
384 | """ 385 | try: 386 | # Check if model is a tree-based model 387 | model_type = str(type(model)) 388 | 389 | tree_based_models = [ 390 | "RandomForest", "GradientBoosting", "AdaBoost", 391 | "HistGradientBoosting", "DecisionTree", "ExtraTrees", 392 | "XGB", "CatBoost", "LGBM" 393 | ] 394 | is_tree_based = any(model_name in model_type for model_name in tree_based_models) 395 | 396 | if is_tree_based: 397 | explainer = shap.TreeExplainer(model) 398 | shap_values = explainer.shap_values(X_test) 399 | else: 400 | explainer = shap.KernelExplainer(model.predict, X_test) 401 | shap_values = explainer.shap_values(X_test, silent=True) 402 | 403 | if len(shap_values.shape) == 3: # Models like DecisionTree, RandomForest return probabilities for each class, Let's downgrade to 2D array 404 | shap_values = shap_values[:, :, 1] 405 | # Convert SHAP values to appropriate format if needed 406 | if isinstance(shap_values, list) and shap_type != 'shap_dependence': 407 | shap_values = np.array(shap_values).mean(axis=0) 408 | 409 | # Generate the appropriate SHAP plot based on shap_type 410 | if shap_type == 'shap_summary': 411 | shap.summary_plot(shap_values, X_test) 412 | elif shap_type == 'shap_violin': 413 | shap.plots.violin(shap_values, X_test) 414 | else: 415 | return f"Invalid shap_type: {shap_type}" 416 | 417 | return True 418 | 419 | except Exception as e: 420 | return f"Error creating SHAP plot: {str(e)}" 421 | 422 | 423 | def plot_residuals( 424 | model: object, 425 | X_train: np.array, 426 | y_train: np.array, 427 | X_test: np.array, 428 | y_test: np.array 429 | ) -> object: 430 | """ 431 | Create a residuals plot using Yellowbrick. 432 | 433 | Parameters 434 | ---------- 435 | model : object 436 | Trained regressor 437 | 438 | X_train : np.array 439 | Training features 440 | 441 | y_train : np.array 442 | Training targets 443 | 444 | X_test : np.array 445 | Test features 446 | 447 | y_test : np.array 448 | Test targets 449 | 450 | Returns 451 | ------- 452 | object 453 | Visualizer object from Yellowbrick 454 | """ 455 | try: 456 | if model.__class__.__name__ == "CatBoostRegressor": # https://github.com/DistrictDataLabs/yellowbrick/issues/1099 457 | from yellowbrick.contrib.wrapper import regressor as wrap_regressor 458 | model = wrap_regressor(model) 459 | 460 | visualizer = ResidualsPlot(model) 461 | visualizer.fit(X_train, y_train) 462 | visualizer.score(X_test, y_test) 463 | return visualizer 464 | 465 | except Exception as e: 466 | return f"Error creating residuals plot: {str(e)}" 467 | 468 | 469 | def plot_prediction_error( 470 | model: object, 471 | X_train: np.array, 472 | y_train: np.array, 473 | X_test: np.array, 474 | y_test: np.array 475 | ) -> object: 476 | """ 477 | Create a prediction error plot using Yellowbrick. 
478 | 
479 | Parameters
480 | ----------
481 | model : object
482 | Trained regressor
483 | 
484 | X_train : np.array
485 | Training features
486 | 
487 | y_train : np.array
488 | Training targets
489 | 
490 | X_test : np.array
491 | Test features
492 | 
493 | y_test : np.array
494 | Test targets
495 | 
496 | Returns
497 | -------
498 | object or str
499 | A Yellowbrick visualizer object, or an error message if an error occurs during the process
500 | """
501 | try:
502 | if model.__class__.__name__ == "CatBoostRegressor": # https://github.com/DistrictDataLabs/yellowbrick/issues/1099
503 | from yellowbrick.contrib.wrapper import regressor as wrap_regressor
504 | model = wrap_regressor(model)
505 | 
506 | visualizer = PredictionError(model)
507 | visualizer.fit(X_train, y_train)
508 | visualizer.score(X_test, y_test)
509 | return visualizer
510 | 
511 | except Exception as e:
512 | return f"Error creating prediction error plot: {str(e)}"
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | import numpy as np
4 | from parameterized import parameterized
5 | from flexml.helpers import validate_inputs, eval_metric_checker, random_state_checker
6 | import warnings
7 | warnings.filterwarnings("ignore")
8 | 
9 | 
10 | class TestHelpers(unittest.TestCase):
11 | """
12 | Test cases for FlexML's helper functions (input validators, metric checkers, and model evaluation)
13 | """
14 | np.random.seed(42)
15 | n_rows = 100
16 | 
17 | df = pd.DataFrame({
18 | 'id': range(1, n_rows + 1),
19 | 'category_default': np.random.choice(['A', 'B', 'C'], n_rows),
20 | 'value_default': np.random.normal(100, 15, n_rows),
21 | 'status': np.random.choice(['Active', 'Pending', 'Closed'], n_rows),
22 | 'priority': np.random.choice(['High', 'Medium', 'Low'], n_rows),
23 | 'score': np.random.randint(0, 100, n_rows),
24 | 'amount': np.random.uniform(10, 1000, n_rows),
25 | 'target': np.random.choice([0, 1], n_rows)
26 | })
27 | 
28 | # Create artificial null values within the dataframe
29 | for column in df.columns:
30 | if column not in ['id', 'target']:
31 | mask = np.random.random(n_rows) < 0.2
32 | df.loc[mask, column] = np.nan
33 | 
34 | 
35 | @parameterized.expand([
36 | # Basic validation errors
37 | (
38 | "target_in_drop_columns",
39 | {"drop_columns": ["target"]},
40 | ValueError,
41 | "target column 'target' cannot be in the drop_columns list"
42 | ),
43 | 
44 | (
45 | "no_features_after_drop",
46 | {"drop_columns": ["category_default", "value_default", "status", "priority", "score", "amount", "id"]},
47 | ValueError,
48 | "After dropping columns, only {'target'} remain"
49 | ),
50 | 
51 | # Imputation method errors
52 | (
53 | "invalid_cat_imputation",
54 | {"categorical_imputation_method": "invalid"},
55 | ValueError,
56 | "categorical_imputation_method 'invalid' is not valid"
57 | ),
58 | 
59 | (
60 | "invalid_num_imputation",
61 | {"numerical_imputation_method": "invalid"},
62 | ValueError,
63 | "numerical_imputation_method 'invalid' is not valid"
64 | ),
65 | 
66 | # Column imputation map errors
67 | (
68 | "column_imputation_invalid_column",
69 | {"column_imputation_map": {"nonexistent_column": "mean"}},
70 | ValueError,
71 | "column 'nonexistent_column' in column_imputation_map is not in the data"
72 | ),
73 | 
74 | (
75 | "column_imputation_invalid_numeric_method",
76 | {"column_imputation_map": {"value_default": "invalid"}},
77 | ValueError,
78 | "numeric imputation method 'invalid' for column 'value_default' is not valid"
79 | ), 80 | 81 | ( 82 | "column_imputation_invalid_categorical_method", 83 | {"column_imputation_map": {"category_default": "invalid"}}, 84 | ValueError, 85 | "categorical imputation method 'invalid' for column 'category_default' is not valid" 86 | ), 87 | 88 | # Constant type errors 89 | ( 90 | "invalid_numerical_constant", 91 | {"numerical_imputation_constant": "invalid"}, 92 | ValueError, 93 | "numerical_imputation_constant should be a number" 94 | ), 95 | 96 | ( 97 | "invalid_categorical_constant", 98 | {"categorical_imputation_constant": 123}, 99 | ValueError, 100 | "categorical_imputation_constant should be a string" 101 | ), 102 | 103 | # Encoding method errors 104 | ( 105 | "invalid_encoding_method", 106 | {"encoding_method": "invalid_encoder"}, 107 | ValueError, 108 | "encoding_method 'invalid_encoder' is not valid" 109 | ), 110 | 111 | ( 112 | "invalid_onehot_limit", 113 | {"onehot_limit": -5}, 114 | ValueError, 115 | "onehot_limit should be a positive integer" 116 | ), 117 | 118 | # Encoding method map errors 119 | ( 120 | "encoding_map_invalid_column", 121 | {"encoding_method_map": {"nonexistent_column": "label_encoder"}}, 122 | ValueError, 123 | "column 'nonexistent_column' in encoding_method_map is not in the data" 124 | ), 125 | 126 | ( 127 | "encoding_map_dropped_column", 128 | {"drop_columns": ["category_default"], "encoding_method_map": {"category_default": "label_encoder"}}, 129 | ValueError, 130 | "column 'category_default' in encoding_method_map is in drop_columns" 131 | ), 132 | 133 | ( 134 | "encoding_map_invalid_method", 135 | {"encoding_method_map": {"category_default": "invalid"}}, 136 | ValueError, 137 | "encoding method 'invalid' for column 'category_default' is not valid" 138 | ), 139 | 140 | # Ordinal encoding errors 141 | ( 142 | "missing_ordinal_map", 143 | {"encoding_method": "ordinal_encoder"}, 144 | ValueError, 145 | "Ordinal encoding is selected but no ordinal_encode_map is provided" 146 | ), 147 | 148 | ( 149 | "missing_column_ordinal_map", 150 | {"encoding_method": "ordinal_encoder", "ordinal_encode_map": {}}, 151 | ValueError, 152 | "Ordinal encoding is selected for column 'category_default' but no ordinal_encode_map is provided" 153 | ), 154 | 155 | ( 156 | "mismatched_ordinal_values", 157 | {"encoding_method": "ordinal_encoder", 158 | "ordinal_encode_map": { 159 | "category_default": ["X", "Y", "Z"], 160 | "status": ["Active", "Pending", "Closed"], 161 | "priority": ["Low", "Medium", "High"]}}, 162 | ValueError, 163 | "Distinct values in column 'category_default' do not match" 164 | ), 165 | 166 | ( 167 | "extra_columns_ordinal_map", 168 | {"encoding_method": "ordinal_encoder", 169 | "ordinal_encode_map": { 170 | "category_default": ["A", "B", "C"], 171 | "status": ["Active", "Pending", "Closed"], 172 | "priority": ["Low", "Medium", "High"], 173 | "extra_column": ["X", "Y", "Z"]}}, 174 | ValueError, 175 | "Ordinal_encode_map includes extra columns not in the categorical columns" 176 | ), 177 | 178 | # Normalization errors 179 | ( 180 | "invalid_normalization", 181 | {"normalize": "invalid_scaler"}, 182 | ValueError, 183 | "normalize method 'invalid_scaler' is not valid" 184 | ), 185 | 186 | # Drop columns validation 187 | ( 188 | "drop_column_not_in_data", 189 | {"drop_columns": ["nonexistent_column"]}, 190 | ValueError, 191 | "column 'nonexistent_column' in drop_columns is not in the data" 192 | ), 193 | 194 | # Ordinal encoding in method map errors 195 | ( 196 | "missing_ordinal_map_in_method_map", 197 | {"encoding_method_map": 
{"category_default": "ordinal_encoder"}}, 198 | ValueError, 199 | "Ordinal encoding is selected for column 'category_default' but no ordinal_encode_map is provided" 200 | ), 201 | 202 | ( 203 | "missing_column_ordinal_map_in_method_map", 204 | {"encoding_method_map": {"category_default": "ordinal_encoder"}, 205 | "ordinal_encode_map": {}}, 206 | ValueError, 207 | "Ordinal encoding is selected for column 'category_default' but no ordinal_encode_map is provided" 208 | ), 209 | 210 | ( 211 | "mismatched_ordinal_values_in_method_map", 212 | {"encoding_method_map": { 213 | "category_default": "ordinal_encoder", 214 | "status": "label_encoder", 215 | "priority": "label_encoder" 216 | }, 217 | "ordinal_encode_map": { 218 | "category_default": ["X", "Y", "Z"] 219 | }}, 220 | ValueError, 221 | "Unique values in 'category_default' do not match with the ones given in ordinal_encode_map" 222 | ), 223 | 224 | ( 225 | "extra_columns_ordinal_map_in_method_map", 226 | {"encoding_method_map": { 227 | "category_default": "ordinal_encoder" 228 | }, 229 | "ordinal_encode_map": { 230 | "category_default": ["A", "B", "C"], 231 | "extra_column": ["X", "Y", "Z"] 232 | }}, 233 | ValueError, 234 | "Ordinal_encode_map includes extra columns not specified for ordinal encoding" 235 | ), 236 | ]) 237 | def test_validate_inputs_errors(self, test_name, params, expected_error, expected_message): 238 | """Test validate_inputs exception raising for invalid parameters""" 239 | with self.assertRaisesRegex(expected_error, expected_message): 240 | validate_inputs( 241 | data=self.df, 242 | target_col='target', 243 | **params 244 | ) 245 | 246 | # helpers/validators.py 247 | @parameterized.expand([ 248 | # Default behavior tests 249 | ( 250 | "regression_default", 251 | {"ml_task_type": "Regression", "eval_metric": None}, 252 | "R2", 253 | None 254 | ), 255 | ( 256 | "classification_default", 257 | {"ml_task_type": "Classification", "eval_metric": None}, 258 | "Accuracy", 259 | None 260 | ), 261 | 262 | # Regression metric tests 263 | ( 264 | "regression_valid_lowercase", 265 | {"ml_task_type": "Regression", "eval_metric": "mae"}, 266 | "MAE", 267 | None 268 | ), 269 | ( 270 | "regression_valid_uppercase", 271 | {"ml_task_type": "Regression", "eval_metric": "RMSE"}, 272 | "RMSE", 273 | None 274 | ), 275 | ( 276 | "regression_invalid_metric", 277 | {"ml_task_type": "Regression", "eval_metric": "invalid"}, 278 | None, 279 | ValueError 280 | ), 281 | 282 | # Classification metric tests 283 | ( 284 | "classification_valid_exact", 285 | {"ml_task_type": "Classification", "eval_metric": "Accuracy"}, 286 | "Accuracy", 287 | None 288 | ), 289 | ( 290 | "classification_valid_flexible", 291 | {"ml_task_type": "Classification", "eval_metric": "roc-auc"}, 292 | "ROC-AUC", 293 | None 294 | ), 295 | ( 296 | "classification_valid_no_special", 297 | {"ml_task_type": "Classification", "eval_metric": "rocauc"}, 298 | "ROC-AUC", 299 | None 300 | ), 301 | ( 302 | "classification_invalid_metric", 303 | {"ml_task_type": "Classification", "eval_metric": "invalid"}, 304 | None, 305 | ValueError 306 | ), 307 | 308 | # Custom metrics list tests 309 | ( 310 | "custom_metrics_valid_classification", 311 | { 312 | "ml_task_type": "Classification", 313 | "eval_metric": "F1 Score", 314 | "all_evaluation_metrics": None, 315 | "default_evaluation_metric": None 316 | }, 317 | "F1 Score", 318 | None 319 | ), 320 | 321 | ( 322 | "custom_metrics_valid_regression", 323 | { 324 | "ml_task_type": "Regression", 325 | "eval_metric": "MAE", 326 | "all_evaluation_metrics": 
None, 327 | "default_evaluation_metric": None 328 | }, 329 | "MAE", 330 | None 331 | ), 332 | 333 | ]) 334 | def test_eval_metric_checker(self, test_name, params, expected_result, expected_error): 335 | """Test eval_metric_checker validation""" 336 | if expected_error: 337 | with self.assertRaises(expected_error): 338 | eval_metric_checker(**params) 339 | else: 340 | result = eval_metric_checker(**params) 341 | self.assertEqual(result, expected_result) 342 | 343 | # helpers/validators.py 344 | @parameterized.expand([ 345 | # Valid cases 346 | ( 347 | "none_value", 348 | None, 349 | None, 350 | None 351 | ), 352 | ( 353 | "zero_value", 354 | 0, 355 | 0, 356 | None 357 | ), 358 | ( 359 | "positive_integer", 360 | 42, 361 | 42, 362 | None 363 | ), 364 | 365 | # Invalid cases 366 | ( 367 | "negative_integer", 368 | -1, 369 | None, 370 | ValueError 371 | ), 372 | ( 373 | "float_value", 374 | 42.0, 375 | None, 376 | ValueError 377 | ) 378 | ]) 379 | def test_random_state_checker(self, test_name, input_value, expected_result, expected_error): 380 | """Test random_state_checker validation""" 381 | if expected_error: 382 | with self.assertRaises(expected_error): 383 | random_state_checker(input_value) 384 | else: 385 | result = random_state_checker(input_value) 386 | self.assertEqual(result, expected_result) 387 | 388 | # helpers/supervised_helpers.py 389 | def test_binary_classification_probabilities(self): 390 | """ 391 | Test binary classification with probability predictions. 392 | """ 393 | from flexml.helpers.supervised_helpers import evaluate_model_perf 394 | 395 | # Setup binary classification data 396 | y_true = np.array([0, 1, 0, 1, 0]) 397 | y_pred_proba = np.array([ 398 | [0.8, 0.2], # Should predict class 0 399 | [0.3, 0.7], # Should predict class 1 400 | [0.6, 0.4], # Should predict class 0 401 | [0.2, 0.8], # Should predict class 1 402 | [0.9, 0.1] # Should predict class 0 403 | ]) 404 | 405 | # Test model performance evaluation 406 | results = evaluate_model_perf("Classification", y_true, y_pred_proba) 407 | 408 | # Verify all metrics are present 409 | expected_metrics = {"Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"} 410 | self.assertEqual(set(results.keys()), expected_metrics) 411 | 412 | def test_multiclass_classification_probabilities(self): 413 | """ 414 | Test multiclass classification with probability predictions for more than two classes. 415 | """ 416 | from flexml.helpers.supervised_helpers import evaluate_model_perf 417 | 418 | # Setup multiclass classification data 419 | y_true = np.array([0, 1, 2, 1, 0]) 420 | y_pred_proba = np.array([ 421 | [0.8, 0.1, 0.1], # Should predict class 0 422 | [0.1, 0.7, 0.2], # Should predict class 1 423 | [0.2, 0.2, 0.6], # Should predict class 2 424 | [0.1, 0.8, 0.1], # Should predict class 1 425 | [0.6, 0.2, 0.2] # Should predict class 0 426 | ]) 427 | 428 | # Test model performance evaluation 429 | results = evaluate_model_perf("Classification", y_true, y_pred_proba) 430 | 431 | # Verify all metrics are present 432 | expected_metrics = {"Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"} 433 | self.assertEqual(set(results.keys()), expected_metrics) 434 | 435 | def test_classification_with_direct_labels(self): 436 | """ 437 | Test classification with direct label predictions (no probabilities). 438 | Tests the handling of predictions when model doesn't output probabilities. 
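For example (illustrative, based on the 1-D branch of evaluate_model_perf):
integer predictions such as np.array([0, 1, 0]) pass through the
(y_pred > 0.5) threshold unchanged, so the metrics are computed directly
on the given labels.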
439 | """ 440 | from flexml.helpers.supervised_helpers import evaluate_model_perf 441 | 442 | # Setup classification data with direct labels 443 | y_true = np.array([0, 1, 0, 1, 0]) 444 | y_pred_labels = np.array([0, 1, 0, 1, 0]) # Direct label predictions 445 | 446 | # Test model performance evaluation 447 | results = evaluate_model_perf("Classification", y_true, y_pred_labels) 448 | 449 | # Verify all metrics are present 450 | expected_metrics = {"Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"} 451 | self.assertEqual(set(results.keys()), expected_metrics) 452 | 453 | def test_evaluate_preds_invalid_metric(self): 454 | """ 455 | Test that _evaluate_preds raises ValueError for invalid metrics. 456 | """ 457 | from flexml.helpers.supervised_helpers import _evaluate_preds 458 | 459 | y_true = np.array([0, 1, 0]) 460 | y_pred = np.array([0, 1, 0]) 461 | 462 | with self.assertRaisesRegex(ValueError, "Error while evaluating the current model"): 463 | _evaluate_preds(y_true, y_pred, "InvalidMetric") 464 | 465 | def test_probability_to_label_conversion(self): 466 | """ 467 | Test the conversion from probability predictions to class labels. 468 | Tests both binary and multiclass cases. 469 | """ 470 | from flexml.helpers.supervised_helpers import evaluate_model_perf 471 | 472 | # Binary case 473 | y_true_binary = np.array([0, 1, 0]) 474 | y_pred_binary_proba = np.array([ 475 | [0.8, 0.2], # Should convert to 0 476 | [0.3, 0.7], # Should convert to 1 477 | [0.6, 0.4] # Should convert to 0 478 | ]) 479 | 480 | binary_results = evaluate_model_perf("Classification", y_true_binary, y_pred_binary_proba) 481 | self.assertEqual(binary_results["Accuracy"], 1.0) # Perfect predictions after conversion 482 | 483 | # Multiclass case 484 | y_true_multi = np.array([0, 1, 2]) 485 | y_pred_multi_proba = np.array([ 486 | [0.8, 0.1, 0.1], # Should convert to 0 487 | [0.1, 0.7, 0.2], # Should convert to 1 488 | [0.2, 0.2, 0.6] # Should convert to 2 489 | ]) 490 | 491 | multi_results = evaluate_model_perf("Classification", y_true_multi, y_pred_multi_proba) 492 | self.assertEqual(multi_results["Accuracy"], 1.0) # Perfect predictions after conversion -------------------------------------------------------------------------------- /flexml/config/ml_models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | 5 | 6 | # TODO: Should be improved 7 | def get_ml_models( 8 | ml_task_type: str, 9 | num_class: Optional[int] = None, 10 | random_state: Optional[int] = None, 11 | n_jobs: Optional[int] = -1 12 | ) -> dict: 13 | """ 14 | Returns a dictionary of quick and wide regression and classification models 15 | 16 | Parameters 17 | ---------- 18 | ml_task_type : str 19 | The type of the machine learning task. It can be "Regression" or "Classification" 20 | 21 | num_class : int, optional (default=None) 22 | The number of classes in the classification task. No need to pass it in regression tasks 23 | It will be set to 2 if None is passed to suppose its binary classification 24 | 25 | random_state : int, optional (default=None) 26 | The random state value for the model training process 27 | 28 | n_jobs : int, optional (default=-1) 29 | The number of jobs to run in parallel. 
-1 means using all processors 30 | 31 | Returns 32 | ------- 33 | dict 34 | A dictionary of quick and wide Regression/Classification models 35 | """ 36 | if ml_task_type not in ["Regression", "Classification"]: 37 | raise ValueError(f"Expected ml_task_type to be either 'Regression' or 'Classification', got {ml_task_type}") 38 | 39 | if ml_task_type == "Regression": 40 | from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor 41 | from sklearn.linear_model import BayesianRidge, OrthogonalMatchingPursuit 42 | from sklearn.tree import DecisionTreeRegressor 43 | from sklearn.ensemble import ( 44 | AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, 45 | ExtraTreesRegressor, HistGradientBoostingRegressor 46 | ) 47 | from sklearn.neighbors import KNeighborsRegressor 48 | from sklearn.neural_network import MLPRegressor 49 | from xgboost import XGBRegressor 50 | from lightgbm import LGBMRegressor 51 | from catboost import CatBoostRegressor 52 | 53 | 54 | # Quick Regression Models 55 | LINEAR_REGRESSION = LinearRegression(n_jobs=n_jobs) 56 | LASSO_REGRESSION = Lasso(random_state=random_state) 57 | RIDGE_REGRESSION = Ridge(random_state=random_state) 58 | XGBOOST_REGRESSION = XGBRegressor(enable_categorical=True, random_state=random_state, n_jobs=n_jobs) 59 | LIGHTGBM_REGRESSION = LGBMRegressor(verbose=-1, enable_categorical=True, random_state=random_state, n_jobs=n_jobs) 60 | CATBOOST_REGRESSION = CatBoostRegressor(allow_writing_files=False, silent=True, random_seed=random_state, thread_count=n_jobs) 61 | DECISION_TREE_REGRESSION = DecisionTreeRegressor(random_state=random_state) 62 | ELASTIC_NET_REGRESSION = ElasticNet(random_state=random_state) 63 | HUBER_REGRESSION = HuberRegressor() 64 | 65 | # Wide Regression Models 66 | KNN_REGRESSION = KNeighborsRegressor(n_jobs=n_jobs) 67 | BAYESIAN_RIDGE_REGRESSION = BayesianRidge() 68 | ADA_BOOST_REGRESSION = AdaBoostRegressor(random_state=random_state) 69 | HIST_GRADIENT_BOOSTING_REGRESSION = HistGradientBoostingRegressor(random_state=random_state) 70 | GRADIENT_BOOSTING_REGRESSION = GradientBoostingRegressor(random_state=random_state) 71 | RANDOM_FOREST_REGRESSION = RandomForestRegressor(random_state=random_state, n_jobs=n_jobs) 72 | EXTRA_TREES_REGRESSION = ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs) 73 | OMP_REGRESSION = OrthogonalMatchingPursuit() 74 | MLP_REGRESSION = MLPRegressor( 75 | solver='lbfgs', 76 | hidden_layer_sizes=(50,), 77 | early_stopping=True, 78 | learning_rate='adaptive', 79 | random_state=random_state 80 | ) 81 | 82 | # Quick Regression Model Configurations 83 | QUICK_REGRESSION_MODELS = [ 84 | { 85 | "name": LINEAR_REGRESSION.__class__.__name__, 86 | "model": LINEAR_REGRESSION, 87 | "tuning_param_grid": { 88 | 'fit_intercept': [True, False] 89 | } 90 | }, 91 | { 92 | "name": LASSO_REGRESSION.__class__.__name__, 93 | "model": LASSO_REGRESSION, 94 | "tuning_param_grid": { 95 | "alpha": [0.1, 0.5, 1.0, 2.0], 96 | "max_iter": [1000, 2000, 3000] 97 | } 98 | }, 99 | { 100 | "name": RIDGE_REGRESSION.__class__.__name__, 101 | "model": RIDGE_REGRESSION, 102 | "tuning_param_grid": { 103 | "alpha": [0.1, 0.5, 1.0, 2.0], 104 | "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"] 105 | } 106 | }, 107 | { 108 | "name": XGBOOST_REGRESSION.__class__.__name__, 109 | "model": XGBOOST_REGRESSION, 110 | "tuning_param_grid": { 111 | "n_estimators": [100, 200, 300, 500, 700, 1000], 112 | "max_depth": [3, 5, 7, 9, 10], 113 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 
0.3], 114 | "subsample": [0.5, 0.7, 1], 115 | "colsample_bytree": [0.5, 0.7, 1], 116 | "gamma": [0, 0.1, 0.2], 117 | "reg_alpha": [0, 0.1, 0.5], 118 | "reg_lambda": [0, 0.1, 0.5], 119 | "min_child_weight": [1, 3, 5], 120 | "scale_pos_weight": [1, 2, 3] 121 | } 122 | }, 123 | { 124 | "name": LIGHTGBM_REGRESSION.__class__.__name__, 125 | "model": LIGHTGBM_REGRESSION, 126 | "tuning_param_grid": { 127 | "n_estimators": [100, 200, 300, 500, 700, 1000], 128 | "max_depth": [3, 5, 7, 9, 10, 12], 129 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 130 | "subsample": [0.5, 0.7, 1], 131 | "colsample_bytree": [0.5, 0.7, 1], 132 | "reg_alpha": [0, 0.1, 0.5], 133 | "reg_lambda": [0, 0.1, 0.5], 134 | "min_child_weight": [1, 3, 5], 135 | "num_leaves": [31, 50, 100] 136 | } 137 | }, 138 | { 139 | "name": CATBOOST_REGRESSION.__class__.__name__, 140 | "model": CATBOOST_REGRESSION, 141 | "tuning_param_grid": { 142 | "iterations": [100, 200, 300, 500, 700, 1000, 1500], 143 | "depth": [3, 5, 7, 9, 10, 12], 144 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 145 | "l2_leaf_reg": [0.1, 1, 3, 5, 10], 146 | "border_count": [32, 50, 75, 100, 150] 147 | } 148 | }, 149 | { 150 | "name": DECISION_TREE_REGRESSION.__class__.__name__, 151 | "model": DECISION_TREE_REGRESSION, 152 | "tuning_param_grid": { 153 | "max_depth": [3, 5, 7, 9, 10], 154 | "min_samples_split": [2, 5, 10], 155 | "min_samples_leaf": [1, 2, 4], 156 | "max_features": ["sqrt", "log2"], 157 | "max_leaf_nodes": [10, 20, 30, 40], 158 | "criterion": ["friedman_mse", "poisson", "absolute_error", "squared_error"] 159 | } 160 | }, 161 | { 162 | "name": ELASTIC_NET_REGRESSION.__class__.__name__, 163 | "model": ELASTIC_NET_REGRESSION, 164 | "tuning_param_grid": { 165 | "alpha": [0.1, 0.5, 1.0, 2.0], 166 | "l1_ratio": [0.1, 0.5, 0.7, 1.0] 167 | } 168 | }, 169 | { 170 | "name": HUBER_REGRESSION.__class__.__name__, 171 | "model": HUBER_REGRESSION, 172 | "tuning_param_grid": { 173 | "epsilon": [1.1, 1.35, 1.5, 1.75, 2.0], 174 | "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0] 175 | } 176 | } 177 | ] 178 | 179 | # Wide Regression Model Configurations 180 | WIDE_REGRESSION_MODELS = QUICK_REGRESSION_MODELS + [ 181 | { 182 | "name": KNN_REGRESSION.__class__.__name__, 183 | "model": KNN_REGRESSION, 184 | "tuning_param_grid": { 185 | "n_neighbors": [3, 5, 7, 9], 186 | "weights": ["uniform", "distance"], 187 | "p": [1, 2] 188 | } 189 | }, 190 | { 191 | "name": ADA_BOOST_REGRESSION.__class__.__name__, 192 | "model": ADA_BOOST_REGRESSION, 193 | "tuning_param_grid": { 194 | "n_estimators": [50, 100, 200, 300], 195 | "learning_rate": [0.01, 0.05, 0.1, 0.5, 1], 196 | "loss": ["linear", "square", "exponential"] 197 | } 198 | }, 199 | { 200 | "name": BAYESIAN_RIDGE_REGRESSION.__class__.__name__, 201 | "model": BAYESIAN_RIDGE_REGRESSION, 202 | "tuning_param_grid": { 203 | "max_iter": [100, 200, 300, 400, 500], 204 | "alpha_1": [1e-6, 1e-5, 1e-4], 205 | "alpha_2": [1e-6, 1e-5, 1e-4], 206 | "lambda_1": [1e-6, 1e-5, 1e-4], 207 | "lambda_2": [1e-6, 1e-5, 1e-4] 208 | } 209 | }, 210 | { 211 | "name": RANDOM_FOREST_REGRESSION.__class__.__name__, 212 | "model": RANDOM_FOREST_REGRESSION, 213 | "tuning_param_grid": { 214 | "n_estimators": [50, 100, 200, 300, 400], 215 | "max_depth": [3, 5, 7, 9, 10], 216 | "min_samples_split": [2, 5, 10], 217 | "min_samples_leaf": [1, 2, 4], 218 | "max_features": ["sqrt", "log2", 0.3, 0.5], 219 | "bootstrap": [True, False] 220 | } 221 | }, 222 | { 223 | "name": EXTRA_TREES_REGRESSION.__class__.__name__, 224 | "model": EXTRA_TREES_REGRESSION, 225 | 
"tuning_param_grid": { 226 | 'n_estimators': [100, 200, 300, 500], 227 | 'max_depth': [3, 5, 7, 9, 10], 228 | 'min_samples_split': [2, 5, 10], 229 | 'min_samples_leaf': [1, 2, 4], 230 | 'max_features': ["sqrt", "log2"], 231 | 'bootstrap': [True, False] 232 | } 233 | }, 234 | { 235 | "name": OMP_REGRESSION.__class__.__name__, 236 | "model": OMP_REGRESSION, 237 | "tuning_param_grid": { 238 | "n_nonzero_coefs": [5, 10, 15, 20], 239 | "tol": [1e-4, 1e-3, 1e-2, 1e-1] 240 | } 241 | }, 242 | { 243 | "name": HIST_GRADIENT_BOOSTING_REGRESSION.__class__.__name__, 244 | "model": HIST_GRADIENT_BOOSTING_REGRESSION, 245 | "tuning_param_grid": { 246 | "max_iter": [100, 200, 300, 500], 247 | "max_depth": [3, 5, 7, 9, 10], 248 | "learning_rate": [0.01, 0.1, 0.3], 249 | "min_samples_leaf": [1, 5, 10], 250 | "l2_regularization": [0, 1.0, 10.0], 251 | "max_bins": [128, 255] 252 | } 253 | }, 254 | { 255 | "name": GRADIENT_BOOSTING_REGRESSION.__class__.__name__, 256 | "model": GRADIENT_BOOSTING_REGRESSION, 257 | "tuning_param_grid": { 258 | "n_estimators": [100, 200, 300, 400, 500], 259 | "max_depth": [3, 5, 7, 9, 10], 260 | "learning_rate": [0.01, 0.02, 0.05, 0.1, 0.2], 261 | "min_samples_split": [2, 5, 10], 262 | "min_samples_leaf": [1, 2, 4], 263 | "alpha": [0.1, 0.5, 0.9] 264 | } 265 | }, 266 | { 267 | "name": MLP_REGRESSION.__class__.__name__, 268 | "model": MLP_REGRESSION, 269 | "tuning_param_grid": { 270 | "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)], 271 | "max_iter": [100, 200, 300, 400], 272 | "activation": ["relu", "tanh"], 273 | "alpha": [0.0001, 0.001, 0.01], 274 | "learning_rate": ["constant", "adaptive"], 275 | "learning_rate_init": [0.001, 0.01] 276 | } 277 | } 278 | ] 279 | 280 | return { 281 | "QUICK": QUICK_REGRESSION_MODELS, 282 | "WIDE": WIDE_REGRESSION_MODELS 283 | } 284 | 285 | else: 286 | from sklearn.linear_model import LogisticRegression 287 | from sklearn.tree import DecisionTreeClassifier 288 | from sklearn.ensemble import ( 289 | AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, 290 | ExtraTreesClassifier, HistGradientBoostingClassifier 291 | ) 292 | from sklearn.neighbors import KNeighborsClassifier 293 | from sklearn.naive_bayes import GaussianNB 294 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis 295 | from sklearn.neural_network import MLPClassifier 296 | from xgboost import XGBClassifier 297 | from lightgbm import LGBMClassifier 298 | from catboost import CatBoostClassifier 299 | 300 | 301 | if num_class is None: # Suppose binary 302 | num_class = 2 303 | 304 | if num_class > 2: 305 | xgb_objective = "multi:softmax" 306 | else: 307 | xgb_objective = "binary:logistic" 308 | 309 | # Quick Classification Models 310 | LOGISTIC_REGRESSION = LogisticRegression(max_iter=1000, random_state=random_state, n_jobs=n_jobs) 311 | XGBOOST_CLASSIFIER = XGBClassifier(objective=xgb_objective, random_state=random_state, n_jobs=n_jobs) 312 | LIGHTGBM_CLASSIFIER = LGBMClassifier(verbose=-1, random_state=random_state, n_jobs=n_jobs) 313 | CATBOOST_CLASSIFIER = CatBoostClassifier(allow_writing_files=False, silent=True, random_seed=random_state, thread_count=n_jobs) 314 | DECISION_TREE_CLASSIFIER = DecisionTreeClassifier(random_state=random_state) 315 | RANDOM_FOREST_CLASSIFIER = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs) 316 | NAIVE_BAYES_CLASSIFIER = GaussianNB() 317 | KNN_CLASSIFIER = KNeighborsClassifier(n_jobs=n_jobs) 318 | 319 | # Wide Classification Models 320 | ADA_BOOST_CLASSIFIER = 
AdaBoostClassifier(random_state=random_state) 321 | HIST_GRADIENT_BOOSTING_CLASSIFIER = HistGradientBoostingClassifier(random_state=random_state) 322 | GRADIENT_BOOSTING_CLASSIFIER = GradientBoostingClassifier(random_state=random_state) 323 | EXTRA_TREES_CLASSIFIER = ExtraTreesClassifier(random_state=random_state, n_jobs=n_jobs) 324 | QDA_CLASSIFIER = QuadraticDiscriminantAnalysis() 325 | LDA_CLASSIFIER = LinearDiscriminantAnalysis() 326 | MLP_CLASSIFIER = MLPClassifier( 327 | hidden_layer_sizes=(100,), 328 | early_stopping=True, 329 | tol=0.001, 330 | learning_rate='adaptive', 331 | random_state=random_state 332 | ) 333 | 334 | # Quick Classification Model Configurations 335 | QUICK_CLASSIFICATION_MODELS = [ 336 | { 337 | "name": LOGISTIC_REGRESSION.__class__.__name__, 338 | "model": LOGISTIC_REGRESSION, 339 | "tuning_param_grid": { 340 | "penalty": ["l2"], 341 | "C": [0.01, 0.1, 1, 10, 100], 342 | "max_iter": [100, 200, 300, 400, 500] 343 | } 344 | }, 345 | { 346 | "name": XGBOOST_CLASSIFIER.__class__.__name__, 347 | "model": XGBOOST_CLASSIFIER, 348 | "tuning_param_grid": { 349 | "n_estimators": [100, 200, 300, 500, 700, 1000], 350 | "max_depth": [3, 5, 7, 9, 10], 351 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 352 | "subsample": [0.5, 0.7, 0.9, 1], 353 | "colsample_bytree": [0.5, 0.7, 0.9, 1], 354 | "gamma": [0, 0.1, 0.2, 0.3], 355 | "reg_alpha": [0, 0.1, 0.5, 1], 356 | "reg_lambda": [0, 0.1, 0.5, 1], 357 | "min_child_weight": [1, 3, 5], 358 | "scale_pos_weight": [1, 2, 3] 359 | } 360 | }, 361 | { 362 | "name": LIGHTGBM_CLASSIFIER.__class__.__name__, 363 | "model": LIGHTGBM_CLASSIFIER, 364 | "tuning_param_grid": { 365 | "n_estimators": [100, 200, 300, 500, 700, 1000], 366 | "max_depth": [3, 5, 7, 9, 10, 12], 367 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 368 | "subsample": [0.5, 0.7, 0.9, 1], 369 | "colsample_bytree": [0.5, 0.7, 0.9, 1], 370 | "reg_alpha": [0, 0.1, 0.5, 1], 371 | "reg_lambda": [0, 0.1, 0.5, 1], 372 | "min_child_weight": [1, 3, 5], 373 | "num_leaves": [31, 50, 75, 100] 374 | } 375 | }, 376 | { 377 | "name": CATBOOST_CLASSIFIER.__class__.__name__, 378 | "model": CATBOOST_CLASSIFIER, 379 | "tuning_param_grid": { 380 | "iterations": [100, 200, 300, 500, 700, 1000], 381 | "depth": [3, 5, 7, 9, 10, 12], 382 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 383 | "l2_leaf_reg": [0.1, 1, 3, 5, 10], 384 | "border_count": [32, 50, 75, 100, 150] 385 | } 386 | }, 387 | { 388 | "name": DECISION_TREE_CLASSIFIER.__class__.__name__, 389 | "model": DECISION_TREE_CLASSIFIER, 390 | "tuning_param_grid": { 391 | "max_depth": [3, 5, 7, 9, 10], 392 | "min_samples_split": [2, 5, 10], 393 | "min_samples_leaf": [1, 2, 4], 394 | "max_features": ["sqrt", "log2"], 395 | "max_leaf_nodes": [10, 20, 30, 40], 396 | "criterion": ["gini", "entropy"] 397 | } 398 | }, 399 | { 400 | "name": RANDOM_FOREST_CLASSIFIER.__class__.__name__, 401 | "model": RANDOM_FOREST_CLASSIFIER, 402 | "tuning_param_grid": { 403 | "n_estimators": [100, 200, 300, 400], 404 | "max_depth": [3, 5, 7, 9, 10], 405 | "min_samples_split": [2, 5, 10], 406 | "min_samples_leaf": [1, 2, 4], 407 | "max_features": ["sqrt", "log2", 0.3, 0.5], 408 | "bootstrap": [True, False] 409 | } 410 | }, 411 | { 412 | "name": NAIVE_BAYES_CLASSIFIER.__class__.__name__, 413 | "model": NAIVE_BAYES_CLASSIFIER, 414 | "tuning_param_grid": { 415 | "var_smoothing": [1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10] 416 | } 417 | }, 418 | { 419 | "name": KNN_CLASSIFIER.__class__.__name__, 420 | "model": KNN_CLASSIFIER, 421 | "tuning_param_grid": { 422 | "n_neighbors": 
[3, 5, 7, 9, 11], 423 | "weights": ["uniform", "distance"], 424 | "algorithm": ["auto", "ball_tree", "kd_tree", "brute"], 425 | "leaf_size": [10, 20, 30, 40, 50], 426 | "p": [1, 2] 427 | } 428 | }, 429 | ] 430 | 431 | # Wide Classification Model Configurations 432 | WIDE_CLASSIFICATION_MODELS = QUICK_CLASSIFICATION_MODELS + [ 433 | { 434 | "name": ADA_BOOST_CLASSIFIER.__class__.__name__, 435 | "model": ADA_BOOST_CLASSIFIER, 436 | "tuning_param_grid": { 437 | "n_estimators": [50, 100, 200, 300], 438 | "learning_rate": [0.01, 0.05, 0.1, 0.5, 1], 439 | "algorithm": ["SAMME", "SAMME.R"] 440 | } 441 | }, 442 | { 443 | "name": HIST_GRADIENT_BOOSTING_CLASSIFIER.__class__.__name__, 444 | "model": HIST_GRADIENT_BOOSTING_CLASSIFIER, 445 | "tuning_param_grid": { 446 | "max_iter": [100, 200, 300, 500], 447 | "max_depth": [3, 5, 7, 9, 10], 448 | "learning_rate": [0.01, 0.1, 0.3], 449 | "min_samples_leaf": [1, 5, 10], 450 | "l2_regularization": [0, 1.0, 10.0], 451 | "max_bins": [128, 255] 452 | } 453 | }, 454 | { 455 | "name": GRADIENT_BOOSTING_CLASSIFIER.__class__.__name__, 456 | "model": GRADIENT_BOOSTING_CLASSIFIER, 457 | "tuning_param_grid": { 458 | 'n_estimators': [100, 200, 300, 500], 459 | 'learning_rate': [0.01, 0.1, 0.3], 460 | 'max_depth': [3, 5, 7, 9, 10], 461 | 'min_samples_split': [2, 5, 10], 462 | 'min_samples_leaf': [1, 2, 4] 463 | } 464 | }, 465 | { 466 | "name": EXTRA_TREES_CLASSIFIER.__class__.__name__, 467 | "model": EXTRA_TREES_CLASSIFIER, 468 | "tuning_param_grid": { 469 | 'n_estimators': [100, 200, 300, 500], 470 | 'max_depth': [3, 5, 7, 9, 10], 471 | 'min_samples_split': [2, 5, 10], 472 | 'min_samples_leaf': [1, 2, 4], 473 | 'max_features': ["sqrt", "log2"], 474 | 'bootstrap': [True, False] 475 | } 476 | }, 477 | { 478 | "name": QDA_CLASSIFIER.__class__.__name__, 479 | "model": QDA_CLASSIFIER, 480 | "tuning_param_grid": { 481 | "reg_param": [0.0, 0.1, 0.5, 1.0], 482 | "tol": [1e-4, 1e-3, 1e-2, 1e-1] 483 | } 484 | }, 485 | { 486 | "name": LDA_CLASSIFIER.__class__.__name__, 487 | "model": LDA_CLASSIFIER, 488 | "tuning_param_grid": { 489 | "solver": ["svd", "lsqr", "eigen"], 490 | "shrinkage": [0.1, 0.5, 1.0] 491 | } 492 | }, 493 | { 494 | "name": MLP_CLASSIFIER.__class__.__name__, 495 | "model": MLP_CLASSIFIER, 496 | "tuning_param_grid": { 497 | "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)], 498 | "max_iter": [100, 200, 300, 400], 499 | "activation": ["relu", "tanh"], 500 | "alpha": [0.0001, 0.001, 0.01], 501 | "learning_rate": ["constant", "adaptive"], 502 | "learning_rate_init": [0.001, 0.01] 503 | } 504 | } 505 | ] 506 | 507 | return { 508 | "QUICK": QUICK_CLASSIFICATION_MODELS, 509 | "WIDE": WIDE_CLASSIFICATION_MODELS 510 | } -------------------------------------------------------------------------------- /flexml/helpers/validators.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import Optional, List 3 | from flexml.config import EVALUATION_METRICS, FEATURE_ENGINEERING_METHODS, CROSS_VALIDATION_METHODS 4 | from flexml.logger import get_logger 5 | import re 6 | 7 | def eval_metric_checker( 8 | ml_task_type: str, 9 | eval_metric: Optional[str] = None, 10 | all_evaluation_metrics: Optional[List[str]] = None, 11 | default_evaluation_metric: Optional[str] = None 12 | ) -> str: 13 | """ 14 | Since eval_metric setting and validation is a common process for both Regression and Classification tasks... 15 | this method is used to set and validate the evaluation metric. 
16 | 17 | Parameters 18 | ---------- 19 | ml_task_type : str 20 | The type of ML task ('Regression' or 'Classification') 21 | 22 | eval_metric : str, optional (default='R2' for Regression, 'Accuracy' for Classification) 23 | The evaluation metric to use for model evaluation 24 | 25 | - Available evaluation metrics for Regression: 26 | - R2, MAE, MSE, RMSE, MAPE 27 | 28 | - Available evaluation metrics for Classification: 29 | - Accuracy, Precision, Recall, F1 Score, ROC-AUC 30 | 31 | all_evaluation_metrics : List[str], (default=None) 32 | All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE', 'MAPE'] for Regression 33 | 34 | If passed as None, they will be fetched from the config file 35 | 36 | default_evaluation_metric : str, (default=None) 37 | The default evaluation metric to use for the current task (Regression or Classification) e.g. 'R2' for Regression, 'Accuracy' for Classification 38 | 39 | If passed as None, it will be fetched from the config file 40 | 41 | Returns 42 | ------- 43 | str 44 | The evaluation metric to use for model evaluation for the current task (Regression or Classification) 45 | """ 46 | logger = get_logger(__name__, "PROD", False) 47 | 48 | if default_evaluation_metric is None or all_evaluation_metrics is None: 49 | default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"] 50 | all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"] 51 | 52 | if eval_metric is None: 53 | return default_evaluation_metric 54 | 55 | if ml_task_type == "Regression": 56 | eval_metric = eval_metric.upper() 57 | else: 58 | # Normalize input for flexible matching 59 | original_metric = eval_metric 60 | normalized_input = re.sub(r'[^a-zA-Z0-9]', '', eval_metric).lower() 61 | normalized_config = {re.sub(r'[^a-zA-Z0-9]', '', m).lower(): m 62 | for m in all_evaluation_metrics} 63 | 64 | if normalized_input in normalized_config: 65 | eval_metric = normalized_config[normalized_input] 66 | else: 67 | error_msg = (f"'{original_metric}' is not a valid evaluation metric for {ml_task_type}, " 68 | f"expected one of: {all_evaluation_metrics}") 69 | logger.error(error_msg) 70 | raise ValueError(error_msg) 71 | 72 | if eval_metric not in all_evaluation_metrics: 73 | error_msg = f"Validation failed for {eval_metric} - not in configured metrics" 74 | logger.error(error_msg) 75 | raise ValueError(error_msg) 76 | 77 | return eval_metric 78 | 79 | def random_state_checker(random_state: Optional[int] = None) -> Optional[int]: 80 | """ 81 | Validates the random_state parameter 82 | 83 | Parameters 84 | ---------- 85 | random_state : int, optional (default=None) 86 | Random state value 87 | 88 | Returns 89 | ------- 90 | int or None 91 | The validated random state value 92 | """ 93 | logger = get_logger(__name__, "PROD", False) 94 | 95 | if random_state is not None and (not isinstance(random_state, int) or random_state < 0): 96 | error_msg = f"random_state should be either None or a non-negative integer, got {random_state}" 97 | logger.error(error_msg) 98 | raise ValueError(error_msg) 99 | 100 | return random_state 101 | 102 | def cross_validation_checker( 103 | df: pd.DataFrame, 104 | cv_method: Optional[str] = None, 105 | n_folds: Optional[int] = None, 106 | test_size: Optional[float] = None, 107 | groups_col: Optional[str] = None, 108 | available_cv_methods: Optional[dict] = None, 109 | ml_task_type: Optional[str] = None 110 | ) -> str: 111 | 112 | """ 113 | df : pd.DataFrame 114 | The DataFrame that cross-validation will be performed on 115 | 116 |
cv_method : str, (default='kfold' for Regression, 'stratified_kfold' for Classification if ml_task_type is not None) 117 | The cross-validation method to use 118 | 119 | If passed as None, the default cross-validation method for the corresponding ml_task_type will be used, as long as ml_task_type is not None 120 | 121 | n_folds : int, optional (default=None) 122 | Number of folds to use for cross-validation 123 | 124 | test_size : float, optional (default=None) 125 | The proportion of the dataset to include in the test split 126 | 127 | groups_col : str, optional (default=None) 128 | The column in the DataFrame that contains the groups for group-based cross-validation methods 129 | 130 | available_cv_methods : dict, optional (default=None) 131 | A dictionary containing the available cross-validation methods 132 | 133 | ml_task_type : str, optional (default=None) 134 | The type of ML task ('Regression' or 'Classification') 135 | 136 | Returns 137 | ------- 138 | str 139 | The cross-validation method to use for the current task (Regression or Classification) 140 | """ 141 | logger = get_logger(__name__, "PROD", False) 142 | 143 | if ml_task_type is not None and ml_task_type not in ['Regression', 'Classification']: 144 | error_msg = f"ml_task_type should be 'Regression' or 'Classification', got {ml_task_type}" 145 | logger.error(error_msg) 146 | raise ValueError(error_msg) 147 | 148 | if available_cv_methods is None: 149 | if ml_task_type is not None: 150 | available_cv_methods = CROSS_VALIDATION_METHODS[ml_task_type] 151 | else: 152 | available_cv_methods = CROSS_VALIDATION_METHODS['all'] 153 | 154 | if cv_method is None: 155 | if ml_task_type is not None: 156 | if ml_task_type == 'Regression': 157 | cv_method = 'kfold' 158 | elif ml_task_type == "Classification": 159 | cv_method = 'stratified_kfold' 160 | 161 | else: 162 | cv_method = cv_method.lower() 163 | if available_cv_methods.get(cv_method) is None: 164 | # If cv_method is not found in the available cv methods, check the version without '_' --> 165 | # e.g.
'stratified_kfold' and 'stratifiedkfold' 166 | if cv_method in available_cv_methods.values(): 167 | cv_method = list(available_cv_methods.keys())[list(available_cv_methods.values()).index(cv_method)] 168 | 169 | # Check if cv_method is still None 170 | if cv_method is None or cv_method not in list(available_cv_methods.keys()): 171 | error_msg = f"cv_method is not found in the available cross-validation methods, expected one of {list(available_cv_methods.keys())}, got {cv_method}" 172 | logger.error(error_msg) 173 | raise ValueError(error_msg) 174 | 175 | if n_folds is not None and (not isinstance(n_folds, int) or n_folds < 2): 176 | error_msg = "`n_folds` must be an integer >= 2 if provided" 177 | logger.error(error_msg) 178 | raise ValueError(error_msg) 179 | 180 | if test_size is not None and (not isinstance(test_size, float) or not 0 < test_size < 1): 181 | error_msg = f"test_size parameter expected to be a float between 0 and 1, got {test_size}" 182 | logger.error(error_msg) 183 | raise ValueError(error_msg) 184 | 185 | if groups_col is not None and groups_col not in df.columns: 186 | error_msg = f"groups_col should be a column in the DataFrame, got {groups_col}" 187 | logger.error(error_msg) 188 | raise ValueError(error_msg) 189 | 190 | if cv_method in ["group_kfold", "group_shuffle_split"] and groups_col is None: 191 | error_msg = "`groups_col` must be provided for group-based methods" 192 | logger.error(error_msg) 193 | raise ValueError(error_msg) 194 | 195 | return cv_method 196 | 197 | def validate_inputs( 198 | data: pd.DataFrame, 199 | target_col: str, 200 | drop_columns=None, 201 | categorical_imputation_method="mode", 202 | numerical_imputation_method="mean", 203 | column_imputation_map=None, 204 | numerical_imputation_constant=0.0, 205 | categorical_imputation_constant="Unknown", 206 | encoding_method="label_encoder", 207 | onehot_limit=25, 208 | encoding_method_map=None, 209 | ordinal_encode_map=None, 210 | normalize=None 211 | ): 212 | """ 213 | Validates the input parameters for the feature engineering process 214 | 215 | Parameters 216 | ---------- 217 | data : pd.DataFrame 218 | The input data for the model training process 219 | 220 | target_col : str 221 | The target column name in the data 222 | 223 | drop_columns : list, default=None 224 | Columns that will be dropped from the data. 225 | 226 | categorical_imputation_method : str, default='mode' 227 | Imputation method for categorical columns. Options: 228 | * 'mode': Replace missing values with the most frequent value. 229 | * 'constant': Replace missing values with a constant value. 230 | * 'drop': Drop rows with missing values. 231 | 232 | numerical_imputation_method : str, default='mean' 233 | Imputation method for numerical columns. Options: 234 | * 'mean': Replace missing values with the column mean. 235 | * 'median': Replace missing values with the column median. 236 | * 'mode': Replace missing values with the column mode. 237 | * 'constant': Replace missing values with a constant value. 238 | * 'drop': Drop rows with missing values. 239 | 240 | column_imputation_map : dict, default=None 241 | Custom mapping of columns to specific imputation methods. 242 | Example usage: {'column_name': 'mean', 'column_name2': 'mode'} 243 | 244 | numerical_imputation_constant : float, default=0.0 245 | The constant value for imputing numerical columns when 'constant' is selected. 
246 | 247 | categorical_imputation_constant : str, default='Unknown' 248 | The constant value for imputing categorical columns when 'constant' is selected. 249 | 250 | encoding_method : str, default='label_encoder' 251 | Encoding method for categorical columns. Options: 252 | * 'label_encoder': Use label encoding 253 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html 254 | * 'onehot_encoder': Use one-hot encoding 255 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html 256 | * 'ordinal_encoder': Use ordinal encoding 257 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html 258 | 259 | onehot_limit : int, default=25 260 | Maximum number of categories to use for one-hot encoding. 261 | 262 | encoding_method_map : dict, default=None 263 | Custom mapping of columns to encoding methods. 264 | Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'} 265 | 266 | ordinal_encode_map : dict, default=None 267 | Custom mapping of columns to category order for ordinal encoding. 268 | Example usage: {'column_name': ['low', 'medium', 'high']} 269 | 270 | normalize : str, default=None 271 | Scaling or normalization method for numerical columns. Options: 272 | * 'standard_scaler': Standardize the data using StandardScaler 273 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 274 | * 'minmax_scaler': Scale the data using MinMaxScaler 275 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html 276 | * 'robust_scaler': Scale the data using RobustScaler 277 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html 278 | * 'quantile_transformer': Transform the data using QuantileTransformer 279 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html 280 | * 'maxabs_scaler': Scale the data using MaxAbsScaler 281 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html 282 | * 'normalize_scaler': Normalize the data to unit length 283 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html 284 | """ 285 | # Check if any of the columns in drop_columns match the target_col 286 | if drop_columns is not None and target_col in drop_columns: 287 | error_msg = f"The target column '{target_col}' cannot be in the drop_columns list" 288 | raise ValueError(error_msg) 289 | 290 | if drop_columns is None: 291 | drop_columns = [] 292 | remaining_columns = set(data.columns) - set(drop_columns) 293 | 294 | # Ensure the target column is in the remaining columns and there's at least one feature column 295 | if target_col not in remaining_columns or len(remaining_columns) < 2: 296 | error_msg = ( 297 | f"After dropping columns, only {remaining_columns} remain. " 298 | f"There should be at least one feature column and the target column '{target_col}' remaining." 299 | ) 300 | raise ValueError(error_msg) 301 | 302 | # Check if categorical_imputation_method is valid 303 | if categorical_imputation_method not in FEATURE_ENGINEERING_METHODS["accepted_categorical_imputations_methods"]: 304 | error_msg = f"The categorical_imputation_method '{categorical_imputation_method}' is not valid.
Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_categorical_imputations_methods']}" 305 | raise ValueError(error_msg) 306 | 307 | # Check if numerical_imputation_method is valid 308 | if numerical_imputation_method not in FEATURE_ENGINEERING_METHODS["accepted_numeric_imputations_methods"]: 309 | error_msg = f"The numerical_imputation_method '{numerical_imputation_method}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_numeric_imputations_methods']}" 310 | raise ValueError(error_msg) 311 | 312 | # Check if encoding_method is valid 313 | if encoding_method not in FEATURE_ENGINEERING_METHODS["accepted_encoding_methods"]: 314 | error_msg = f"The encoding_method '{encoding_method}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_encoding_methods']}" 315 | raise ValueError(error_msg) 316 | 317 | # Check if onehot_limit is a non-negative integer 318 | if not isinstance(onehot_limit, int) or onehot_limit < 0: 319 | error_msg = f"onehot_limit should be a non-negative integer, got {onehot_limit}" 320 | raise ValueError(error_msg) 321 | 322 | # Check if drop_columns columns are in data 323 | if drop_columns is not None: 324 | for col in drop_columns: 325 | if col not in data.columns: 326 | error_msg = f"The column '{col}' in drop_columns is not in the data" 327 | raise ValueError(error_msg) 328 | 329 | # Check if columns in column_imputation_map are in data and methods are valid 330 | if column_imputation_map is not None: 331 | for col, method in column_imputation_map.items(): 332 | if col not in data.columns: 333 | error_msg = f"The column '{col}' in column_imputation_map is not in the data" 334 | raise ValueError(error_msg) 335 | 336 | if col in data.select_dtypes(include=['number']).columns: 337 | if method not in FEATURE_ENGINEERING_METHODS["accepted_numeric_imputations_methods"]: 338 | error_msg = f"The numeric imputation method '{method}' for column '{col}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_numeric_imputations_methods']}" 339 | raise ValueError(error_msg) 340 | else: 341 | if method not in FEATURE_ENGINEERING_METHODS["accepted_categorical_imputations_methods"]: 342 | error_msg = f"The categorical imputation method '{method}' for column '{col}' is not valid.
Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_categorical_imputations_methods']}" 343 | raise ValueError(error_msg) 344 | 345 | # Check if numerical_imputation_constant is a number 346 | if not isinstance(numerical_imputation_constant, (int, float)): 347 | error_msg = f"numerical_imputation_constant should be a number, got {type(numerical_imputation_constant)}" 348 | raise ValueError(error_msg) 349 | 350 | # Check if categorical_imputation_constant is a string 351 | if not isinstance(categorical_imputation_constant, str): 352 | error_msg = f"categorical_imputation_constant should be a string, got {type(categorical_imputation_constant)}" 353 | raise ValueError(error_msg) 354 | 355 | # Check if encoding_method is ordinal_encoder and ordinal_encode_map is provided for every categorical column 356 | if encoding_method == "ordinal_encoder": 357 | if ordinal_encode_map is None: 358 | error_msg = "Ordinal encoding is selected but no ordinal_encode_map is provided" 359 | raise ValueError(error_msg) 360 | # Check if ordinal_encode_map is provided for every categorical column 361 | for col in data.select_dtypes(include=['object', 'category']).columns: 362 | if col not in ordinal_encode_map: 363 | error_msg = f"Ordinal encoding is selected for column '{col}' but no ordinal_encode_map is provided" 364 | raise ValueError(error_msg) 365 | 366 | # Check if methods inside encoding_method_map are valid and columns are in data 367 | if encoding_method_map is not None: 368 | for col, method in encoding_method_map.items(): 369 | if col not in data.columns: 370 | error_msg = f"The column '{col}' in encoding_method_map is not in the data" 371 | raise ValueError(error_msg) 372 | 373 | if col in drop_columns: 374 | error_msg = f"The column '{col}' in encoding_method_map is in drop_columns" 375 | raise ValueError(error_msg) 376 | 377 | if method not in FEATURE_ENGINEERING_METHODS["accepted_encoding_methods"]: 378 | error_msg = f"The encoding method '{method}' for column '{col}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_encoding_methods']}" 379 | raise ValueError(error_msg) 380 | 381 | # Check if there is an ordinal_encoder among the methods and ordinal_encode_map is provided 382 | if method == "ordinal_encoder": 383 | if ordinal_encode_map is None: 384 | error_msg = f"Ordinal encoding is selected for column '{col}' but no ordinal_encode_map is provided" 385 | raise ValueError(error_msg) 386 | # Check if map for col is provided within ordinal_encode_map 387 | if col not in ordinal_encode_map: 388 | error_msg = f"Ordinal encoding is selected for column '{col}' but no ordinal_encode_map is provided" 389 | raise ValueError(error_msg) 390 | 391 | # Check if normalize is valid 392 | if normalize is not None and normalize not in FEATURE_ENGINEERING_METHODS["accepted_standardization_methods"]: 393 | error_msg = f"The normalize method '{normalize}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_standardization_methods']}" 394 | raise ValueError(error_msg) 395 | 396 | # Check if encoding_method is ordinal_encoder 397 | if encoding_method == "ordinal_encoder": 398 | if ordinal_encode_map is None: 399 | error_msg = "Ordinal encoding is selected, but no ordinal_encode_map is provided."
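# Illustrative note: assuming a hypothetical categorical column 'size' in data, a mapping that satisfies the checks below would be ordinal_encode_map={'size': ['low', 'medium', 'high']}, mirroring the 'Example usage' in the docstring above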
400 | raise ValueError(error_msg) 401 | 402 | # Get all categorical columns 403 | categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist() 404 | 405 | # Check that all categorical columns are in ordinal_encode_map 406 | for col in categorical_columns: 407 | if col not in ordinal_encode_map: 408 | error_msg = f"Ordinal encoding is selected, but column '{col}' is missing in ordinal_encode_map." 409 | raise ValueError(error_msg) 410 | 411 | # Get distinct values in the column 412 | distinct_values = set(data[col].dropna().unique()) 413 | map_values = set(ordinal_encode_map[col]) 414 | 415 | # Check if the values in ordinal_encode_map match exactly with the distinct values 416 | if distinct_values != map_values: 417 | error_msg = ( 418 | f"Distinct values in column '{col}' do not match the ones given in ordinal_encode_map. " 419 | f"Ensure they match exactly." 420 | ) 421 | raise ValueError(error_msg) 422 | 423 | # Check that ordinal_encode_map does not include extra columns 424 | extra_columns = set(ordinal_encode_map.keys()) - set(categorical_columns) 425 | if extra_columns: 426 | error_msg = ( 427 | f"Ordinal_encode_map includes extra columns not in the categorical columns: {extra_columns}. " 428 | f"Remove these columns from the mapping." 429 | ) 430 | raise ValueError(error_msg) 431 | 432 | # Check if encoding_method_map is provided and has ordinal_encoder 433 | if encoding_method_map: 434 | ordinal_columns = [ 435 | col for col, method in encoding_method_map.items() if method == "ordinal_encoder" 436 | ] 437 | else: 438 | ordinal_columns = [] 439 | 440 | if ordinal_columns: 441 | if not ordinal_encode_map: 442 | raise ValueError( 443 | "Ordinal encoding is specified in encoding_method_map, but no ordinal_encode_map is provided." 444 | ) 445 | 446 | # Validate only the columns specified for ordinal encoding 447 | for col in ordinal_columns: 448 | if col not in ordinal_encode_map: 449 | raise ValueError( 450 | f"Column '{col}' is marked for ordinal encoding but is missing in ordinal_encode_map." 451 | ) 452 | 453 | # Get distinct values in the column 454 | distinct_values = set(data[col].dropna().unique()) 455 | map_values = set(ordinal_encode_map[col]) 456 | 457 | # Check if the values in ordinal_encode_map match exactly with the distinct values 458 | if distinct_values != map_values: 459 | raise ValueError( 460 | f"Unique values in '{col}' do not match the ones given in ordinal_encode_map. " 461 | f"Ensure they match exactly." 462 | ) 463 | 464 | # Ensure ordinal_encode_map does not include extra columns 465 | extra_columns = set(ordinal_encode_map.keys()) - set(ordinal_columns) 466 | if extra_columns: 467 | raise ValueError( 468 | f"Ordinal_encode_map includes extra columns not specified for ordinal encoding: {extra_columns}. " 469 | f"Remove these columns from the mapping."
470 | ) 471 | 472 | return True -------------------------------------------------------------------------------- /flexml/_feature_engineer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, MaxAbsScaler, normalize 7 | from typing import List, Optional, Dict, Any 8 | from flexml.logger import get_logger 9 | 10 | class ColumnDropper(BaseEstimator, TransformerMixin): 11 | """ 12 | A transformer to drop specified columns from a dataset 13 | """ 14 | def __init__(self, drop_columns: Optional[List[str]] = None): 15 | self.drop_columns = drop_columns or [] 16 | 17 | def fit(self, X, y=None): 18 | return self 19 | 20 | def transform(self, X): 21 | """ 22 | Drops specified columns from the input DataFrame 23 | 24 | Returns 25 | ------- 26 | pd.DataFrame 27 | A DataFrame with the specified columns dropped 28 | """ 29 | return X.drop(columns=self.drop_columns, axis=1, errors='ignore') 30 | 31 | 32 | class ColumnImputer(BaseEstimator, TransformerMixin): 33 | """ 34 | A transformer to impute missing values in a dataset 35 | """ 36 | def __init__( 37 | self, 38 | column_imputation_mapper: Dict[str, str], 39 | numerical_imputation_constant: float = 0.0, 40 | categorical_imputation_constant: str = "Unknown" 41 | ): 42 | self.column_imputation_mapper = column_imputation_mapper 43 | self.numerical_imputation_constant = numerical_imputation_constant 44 | self.categorical_imputation_constant = categorical_imputation_constant 45 | 46 | def fit(self, X, y=None): 47 | return self 48 | 49 | def transform(self, X) -> pd.DataFrame: 50 | # Categorical columns are converted to string 51 | categorical_cols = X.select_dtypes(exclude=['number']).columns 52 | X[categorical_cols] = X[categorical_cols].astype(str) 53 | 54 | for column, method in self.column_imputation_mapper.items(): 55 | X[column] = X[column].replace("nan", pd.NA) 56 | if method == "mean": 57 | mean_value = X[column].mean() 58 | X[column] = X[column].fillna(mean_value) 59 | 60 | elif method == "median": 61 | median_value = X[column].median() 62 | X[column] = X[column].fillna(median_value) 63 | 64 | elif method == "mode": 65 | mode_values = X[column].mode() 66 | if len(mode_values) > 0: 67 | mode_value = mode_values[0] 68 | else: 69 | # TODO: Notify user that mode is not available 70 | mode_value = self.categorical_imputation_constant 71 | X[column] = X[column].replace("nan", np.nan).fillna(mode_value) 72 | 73 | elif method == "constant": 74 | if X[column].dtype != 'object': 75 | constant = self.numerical_imputation_constant 76 | else: 77 | constant = self.categorical_imputation_constant 78 | X[column] = X[column].replace("nan", np.nan).fillna(constant) 79 | 80 | elif method == "drop": 81 | X = X.dropna(subset=[column]) 82 | 83 | else: 84 | raise ValueError(f"Invalid imputation method: {method}") 85 | 86 | return X 87 | 88 | 89 | class CategoricalEncoder(BaseEstimator, TransformerMixin): 90 | """ 91 | A transformer to encode categorical columns in a dataset 92 | """ 93 | def __init__( 94 | self, 95 | encoding_method_mapper: Dict[str, str], 96 | ordinal_map: Dict[str, List[str]], 97 | onehot_limit: int = 25 98 | ): 99 | self.encoding_method_mapper = encoding_method_mapper 100 | self.ordinal_map 
= ordinal_map 101 | self.onehot_limit = onehot_limit 102 | self.label_encoders = {} 103 | self.onehot_encoders = {} 104 | self.ordinal_encoders = {} 105 | 106 | def fit(self, X, y=None): 107 | # Categorical columns are converted to string 108 | categorical_cols = X.select_dtypes(exclude=['number']).columns 109 | X[categorical_cols] = X[categorical_cols].astype(str) 110 | 111 | for col, method in self.encoding_method_mapper.items(): 112 | if method == "label_encoder": 113 | encoder = LabelEncoder() 114 | encoder.fit(X[col].fillna("Unknown")) 115 | self.label_encoders[col] = encoder 116 | 117 | elif method == "onehot_encoder": 118 | encoder = OneHotEncoder( 119 | sparse_output=False, 120 | handle_unknown="ignore", 121 | max_categories=self.onehot_limit 122 | ) 123 | encoder.fit(X[[col]]) 124 | self.onehot_encoders[col] = encoder 125 | 126 | elif method == "ordinal_encoder": 127 | if col in self.ordinal_map: 128 | categories = [self.ordinal_map[col]] 129 | encoder = OrdinalEncoder(categories=categories) 130 | encoder.fit(X[[col]]) 131 | self.ordinal_encoders[col] = encoder 132 | 133 | return self 134 | 135 | def transform(self, X) -> pd.DataFrame: 136 | # Categorical columns are converted to string 137 | categorical_cols = X.select_dtypes(exclude=['number']).columns 138 | X[categorical_cols] = X[categorical_cols].astype(str) 139 | 140 | for col, method in self.encoding_method_mapper.items(): 141 | if method == "label_encoder": 142 | if col in self.label_encoders: 143 | encoder = self.label_encoders[col] 144 | # Identify known and unknown labels 145 | known_mask = X[col].isin(encoder.classes_) 146 | # Transform known labels 147 | if known_mask.any(): 148 | X.loc[known_mask, col] = encoder.transform(X.loc[known_mask, col]) 149 | # Handle unknown labels 150 | X.loc[~known_mask, col] = -1 151 | X[col] = X[col].astype(int) 152 | 153 | elif method == "onehot_encoder": 154 | if col in self.onehot_encoders: 155 | encoder = self.onehot_encoders[col] 156 | one_hot_encoded = encoder.transform(X[[col]]) 157 | one_hot_df = pd.DataFrame( 158 | one_hot_encoded, 159 | columns=encoder.get_feature_names_out([col]), 160 | index=X.index 161 | ) 162 | X = pd.concat([X.drop(columns=[col]), one_hot_df], axis=1) 163 | 164 | elif method == "ordinal_encoder": 165 | if col in self.ordinal_encoders: 166 | encoder = self.ordinal_encoders[col] 167 | # Identify known and unknown categories 168 | known_categories = encoder.categories_[0] 169 | known_mask = X[col].isin(known_categories) 170 | # Transform known categories 171 | if known_mask.any(): 172 | X.loc[known_mask, col] = encoder.transform(X.loc[known_mask, [col]])[:, 0] 173 | # Handle unknown categories 174 | X.loc[~known_mask, col] = -1 175 | X[col] = X[col].astype(int) 176 | 177 | return X 178 | 179 | 180 | class NumericalNormalizer(BaseEstimator, TransformerMixin): 181 | """ 182 | A transformer to normalize numerical columns in a dataset 183 | """ 184 | def __init__(self, normalization_method_map: Dict[str, str]): 185 | self.normalization_method_map = normalization_method_map or {} 186 | self.scalers = {} 187 | self.logger = get_logger(__name__, "PROD") 188 | 189 | def fit(self, X, y=None): 190 | for column, method in self.normalization_method_map.items(): 191 | if method == "standard_scaler": 192 | scaler = StandardScaler() 193 | 194 | elif method == "minmax_scaler": 195 | scaler = MinMaxScaler() 196 | 197 | elif method == "robust_scaler": 198 | scaler = RobustScaler() 199 | 200 | elif method == "quantile_transformer": 201 | scaler = QuantileTransformer() 202 | 
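# Note: QuantileTransformer maps each column to a uniform output distribution by default; like the other scalers built here, it is fit on one column at a time further below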
203 | elif method == "maxabs_scaler": 204 | scaler = MaxAbsScaler() 205 | 206 | elif method == "normalize_scaler": 207 | scaler = None 208 | 209 | else: 210 | self.logger.warning(f"Unknown method '{method}' for column '{column}'. Skipping.") 211 | continue 212 | 213 | if scaler is not None: 214 | scaler.fit(X[[column]]) 215 | self.scalers[column] = scaler 216 | else: 217 | self.scalers[column] = None 218 | 219 | return self 220 | 221 | def transform(self, X): 222 | for column, scaler in self.scalers.items(): 223 | if scaler is None: # Directly use sklearn's normalize method 224 | X[column] = normalize(X[[column]], axis=0).flatten() # Normalize to unit length 225 | else: 226 | X[column] = scaler.transform(X[[column]]) 227 | 228 | return X 229 | 230 | 231 | class FeatureEngineering: 232 | """ 233 | A class for performing feature engineering on a dataset 234 | 235 | Parameters 236 | ---------- 237 | data : pd.DataFrame 238 | The input data for the model training process 239 | 240 | target_col : str 241 | The target column name in the data 242 | 243 | drop_columns : list, default=None 244 | Columns that will be dropped from the data 245 | 246 | categorical_imputation_method : str, default='mode' 247 | Imputation method for categorical columns. Options: 248 | * 'mode': Replace missing values with the most frequent value 249 | * 'constant': Replace missing values with a constant value 250 | * 'drop': Drop rows with missing values 251 | 252 | numerical_imputation_method : str, default='mean' 253 | Imputation method for numerical columns. Options: 254 | * 'mean': Replace missing values with the column mean 255 | * 'median': Replace missing values with the column median 256 | * 'mode': Replace missing values with the column mode 257 | * 'constant': Replace missing values with a constant value 258 | * 'drop': Drop rows with missing values 259 | 260 | column_imputation_map : dict, default=None 261 | Custom mapping of columns to specific imputation methods 262 | Example usage: {'column_name1': 'mean', 'column_name2': 'mode'} 263 | 264 | categorical_imputation_constant : str, default='Unknown' 265 | The constant value for imputing categorical columns when 'constant' is selected 266 | 267 | numerical_imputation_constant : float, default=0.0 268 | The constant value for imputing numerical columns when 'constant' is selected 269 | 270 | encoding_method : str, default='onehot_encoder' 271 | Encoding method for categorical columns. Options: 272 | * 'label_encoder': Use label encoding 273 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html 274 | * 'onehot_encoder': Use one-hot encoding 275 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html 276 | * 'ordinal_encoder': Use ordinal encoding 277 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html 278 | 279 | onehot_limit : int, default=25 280 | Maximum number of categories to use for one-hot encoding 281 | 282 | encoding_method_map : dict, default=None 283 | Custom mapping of columns to encoding methods 284 | Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'} 285 | 286 | ordinal_encode_map : dict, default=None 287 | Custom mapping of columns to category order for ordinal encoding 288 | Example usage: {'column_name': ['low', 'medium', 'high']} 289 | 290 | normalize : str, default=None 291 | Scaling or normalization method for numerical columns.
Options: 292 | * 'standard_scaler': Standardize the data using StandardScaler 293 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 294 | * 'minmax_scaler': Scale the data using MinMaxScaler 295 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html 296 | * 'robust_scaler': Scale the data using RobustScaler 297 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html 298 | * 'quantile_transformer': Transform the data using QuantileTransformer 299 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html 300 | * 'maxabs_scaler': Scale the data using MaxAbsScaler 301 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html 302 | * 'normalize_scaler': Normalize the data to unit length 303 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html 304 | """ 305 | def __init__( 306 | self, 307 | data: pd.DataFrame, 308 | target_col: str, 309 | drop_columns: Optional[List[str]] = None, 310 | categorical_imputation_method: str = "mode", 311 | numerical_imputation_method: str = "mean", 312 | column_imputation_map: Optional[Dict[str, str]] = None, 313 | categorical_imputation_constant: str = "Unknown", 314 | numerical_imputation_constant: float = 0.0, 315 | encoding_method: str = "onehot_encoder", 316 | onehot_limit: int = 25, 317 | encoding_method_map: Optional[Dict[str, str]] = None, 318 | ordinal_encode_map: Optional[Dict[str, List[str]]] = None, 319 | normalize: Optional[str] = None 320 | ): 321 | self.logger = get_logger(__name__, "PROD") 322 | 323 | # Initialize attributes 324 | self.data = data 325 | self.target_col = target_col 326 | self.drop_columns = drop_columns or [] 327 | self.categorical_imputation_method = categorical_imputation_method 328 | self.numerical_imputation_method = numerical_imputation_method 329 | self.column_imputation_map = column_imputation_map or {} 330 | self.numerical_imputation_constant = numerical_imputation_constant 331 | self.categorical_imputation_constant = categorical_imputation_constant 332 | self.encoding_method = encoding_method 333 | self.onehot_limit = onehot_limit 334 | self.encoding_method_map = encoding_method_map or {} 335 | self.ordinal_encode_map = ordinal_encode_map or {} 336 | self.normalize = normalize 337 | self.y_class_mapping = None 338 | 339 | def setup(self, data: Optional[pd.DataFrame] = None): 340 | """ 341 | Setup the feature engineering pipeline 342 | 343 | Parameters 344 | ---------- 345 | data : pd.DataFrame, default=None 346 | The data to override the existing data attribute 347 | """ 348 | if data is not None: 349 | self.data = data 350 | 351 | # Initialize encoder for target column 352 | self.target_encoder = LabelEncoder() 353 | # Separate features and target column 354 | self.feature_data = self.data.drop(columns=[self.target_col, *self.drop_columns], errors='ignore') 355 | self.numerical_columns = self.feature_data.select_dtypes(include=['number']).columns.tolist() 356 | self.categorical_columns = self.feature_data.columns.difference(self.numerical_columns).tolist() 357 | 358 | # Separate imputation mapping for numerical and categorical columns 359 | self.numerical_column_imputation_mapper = { 360 | col: self.numerical_imputation_method for col in self.numerical_columns 361 | } 362 | 363 | # For categorical columns, handle imputation separately 364 | self.categorical_column_imputation_mapper = { 
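# every categorical column defaults to categorical_imputation_method here; per-column overrides from column_imputation_map are applied to the combined mapper below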
365 | col: self.categorical_imputation_method for col in self.categorical_columns 366 | } 367 | 368 | # Combine both mappers to have a comprehensive imputation mapping 369 | self.column_imputation_mapper = {**self.numerical_column_imputation_mapper, 370 | **self.categorical_column_imputation_mapper} 371 | 372 | # Update the mappers with any custom map provided 373 | if self.column_imputation_map: 374 | self.column_imputation_mapper.update(self.column_imputation_map) 375 | 376 | # Initialize encoding method mapper with default value and update with custom map 377 | self.encoding_method_mapper = {col: self.encoding_method for col in self.categorical_columns} 378 | if self.encoding_method_map: 379 | self.encoding_method_mapper.update(self.encoding_method_map) 380 | 381 | if self.ordinal_encode_map: 382 | for col in self.ordinal_encode_map.keys(): 383 | if col in self.encoding_method_mapper: 384 | self.encoding_method_mapper[col] = 'ordinal_encoder' 385 | 386 | # Initialize numerical normalization map 387 | if self.normalize: 388 | self.normalization_map = { 389 | col: self.normalize for col in self.numerical_columns 390 | } 391 | 392 | 393 | pipeline_steps = [] 394 | 395 | # Add drop_columns step if drop_columns is not empty 396 | if self.drop_columns: 397 | pipeline_steps.append(("drop_columns", ColumnDropper(drop_columns=self.drop_columns))) 398 | 399 | # Add imputer step 400 | pipeline_steps.append( 401 | ("imputer", ColumnImputer( 402 | self.column_imputation_mapper, 403 | self.numerical_imputation_constant, 404 | self.categorical_imputation_constant 405 | ) 406 | ) 407 | ) 408 | 409 | # Add normalization step if not None 410 | if self.normalize: 411 | pipeline_steps.append(("normalizer", NumericalNormalizer(self.normalization_map))) 412 | 413 | # Add encoding step 414 | pipeline_steps.append(("encoder", CategoricalEncoder( 415 | self.encoding_method_mapper, 416 | self.ordinal_encode_map, 417 | onehot_limit=self.onehot_limit 418 | ))) 419 | 420 | # Create the pipeline 421 | self.pipeline = Pipeline(pipeline_steps, memory=None) 422 | 423 | def check_column_anomalies(self, threshold: float = 0.5): 424 | """ 425 | Identifies columns that are likely to be ID columns or have too many unique values 426 | 427 | Parameters 428 | ---------- 429 | threshold : float 430 | Threshold for the ratio (default is 0.5, e.g., 50%) 431 | """ 432 | 433 | id_columns = self._id_finder() 434 | if id_columns: 435 | for column in id_columns: 436 | if column not in self.drop_columns: 437 | self.logger.warning(f"Column '{column}' seems like an ID column. Consider dropping it via 'drop_columns' parameter if it is not a feature") 438 | 439 | columns_to_consider = self._anomaly_unique_values_finder(threshold=threshold) 440 | if columns_to_consider: 441 | for column, ratio in columns_to_consider.items(): 442 | self.logger.warning( 443 | f"Column '{column}' has too many unique values ({ratio:.2%}). " 444 | "Recommended to either process or drop this column via 'drop_columns'" 445 | ) 446 | 447 | # Find the columns that exceeds one_hot_limit 448 | columns_exceeding_limit = self._anomaly_onehot_limit_finder() 449 | # remove columns_to_consider from columns_exceeding_limit to avoid duplicate warnings 450 | columns_exceeding_limit = {k: v for k, v in columns_exceeding_limit.items() if k not in columns_to_consider} 451 | if columns_exceeding_limit: 452 | for column, count in columns_exceeding_limit.items(): 453 | self.logger.warning( 454 | f"Column '{column}' has {count} unique values. 
" 455 | "Consider operations like increasing value of 'onehot_limit', " 456 | "changing the encoding method or processing the column" 457 | ) 458 | 459 | def _id_finder(self) -> list: 460 | """ 461 | Identifies potential ID columns by checking if values in the first 100 rows 462 | match their respective index values 463 | 464 | Returns 465 | ------- 466 | list 467 | List of column names that could be ID columns 468 | """ 469 | potential_ids = [] 470 | 471 | for column in self.data.columns: 472 | # Check if the first 100 rows match the index values 473 | if (self.data[column].iloc[:100] == self.data.index[:100]).all(): 474 | potential_ids.append(column) 475 | 476 | return potential_ids 477 | 478 | def _anomaly_unique_values_finder(self, threshold: float = 0.5) -> dict: 479 | """ 480 | Identifies categorical columns where the ratio of unique values to non-null rows 481 | exceeds the given threshold 482 | 483 | Parameters 484 | ---------- 485 | threshold : float 486 | Threshold for the ratio (default is 0.5, e.g., 50%) 487 | 488 | Returns 489 | ------- 490 | dict 491 | Dictionary of column names and their unique value ratios 492 | """ 493 | columns_above_threshold = {} 494 | 495 | for column in self.categorical_columns: 496 | # Calculate the ratio using non-null data 497 | non_null_count = self.data[column].notnull().sum() 498 | if non_null_count > 0: # Avoid division by zero 499 | unique_ratio = self.data[column].nunique() / non_null_count 500 | if unique_ratio > threshold: 501 | columns_above_threshold[column] = unique_ratio 502 | 503 | return columns_above_threshold 504 | 505 | def _anomaly_onehot_limit_finder(self) -> dict: 506 | """ 507 | Identifies categorical columns where the number of unique values exceeds the one_hot_limit 508 | 509 | Returns 510 | ------- 511 | dict 512 | Dictionary of column names and their unique value counts 513 | """ 514 | columns_above_threshold = {} 515 | 516 | for column in self.categorical_columns: 517 | if self.data[column].nunique() > self.onehot_limit: 518 | columns_above_threshold[column] = self.data[column].nunique() 519 | 520 | return columns_above_threshold 521 | 522 | def fit_transform(self) -> pd.DataFrame: 523 | """ 524 | Perform feature engineering on the training data 525 | 526 | Processes features and the target column by: 527 | - Dropping specified columns from the data 528 | - Imputing missing values for numerical and categorical columns 529 | - Encoding categorical features 530 | - Encoding the target column if it is categorical 531 | - Normalizing numerical columns if specified 532 | 533 | Returns 534 | ------- 535 | pd.DataFrame 536 | A DataFrame containing the processed features and target column 537 | """ 538 | # Process features 539 | processed_features = self.pipeline.fit_transform(self.feature_data) 540 | 541 | # Process if target column is categorical 542 | target_data = self.data[self.target_col] 543 | if target_data.dtype in ['object', 'category']: 544 | target_data = self.target_encoder.fit_transform(target_data) 545 | self.y_class_mapping = { # for example: {0: 'male', 1: 'female'} 546 | i: label for i, label in enumerate(self.target_encoder.classes_) 547 | } 548 | processed_features[self.target_col] = target_data 549 | 550 | return processed_features.drop(self.target_col, axis=1), processed_features[self.target_col] 551 | 552 | def transform(self, test_data: pd.DataFrame, y_included: bool = False) -> pd.DataFrame: 553 | """ 554 | Perform feature engineering on test data using the fitted pipeline 555 | 556 | Processes features 
551 | 
552 |     def transform(self, test_data: pd.DataFrame, y_included: bool = False) -> pd.DataFrame:
553 |         """
554 |         Perform feature engineering on test data using the fitted pipeline
555 | 
556 |         Processes features by:
557 |         - Imputing missing values for numerical and categorical columns
558 |         - Encoding categorical features
559 |         - Normalizing numerical columns if specified
560 | 
561 |         Parameters
562 |         ----------
563 |         test_data : pd.DataFrame
564 |             The test dataset to process
565 | 
566 |         y_included : bool, default=False
567 |             Whether the target column is included in the test data; if True, the target column is transformed and returned as well
568 | 
569 |         Returns
570 |         -------
571 |         pd.DataFrame
572 |             A DataFrame containing the processed test features, or a (features, target) tuple when y_included is True
573 |         """
574 |         if y_included:
575 |             test_features = test_data
576 |         else:
577 |             test_features = test_data.drop(columns=[self.target_col], errors='ignore')
578 | 
579 |         processed_test_features = self.pipeline.transform(test_features)
580 | 
581 |         # Add target column if it exists in test data
582 |         if self.target_col in test_data.columns:
583 |             target_data = test_data[self.target_col]
584 |             if target_data.dtype in ['object', 'category']:
585 |                 target_data = self.target_encoder.transform(target_data)
586 |             processed_test_features[self.target_col] = target_data
587 | 
588 |         if not y_included:
589 |             return processed_test_features
590 |         else:
591 |             return processed_test_features.drop(self.target_col, axis=1), processed_test_features[self.target_col]
--------------------------------------------------------------------------------
/flexml/_model_tuner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import optuna
4 | import joblib
5 | from joblib.parallel import BatchCompletionCallBack
6 | from contextlib import contextmanager
7 | from typing import Optional, Union
8 | from time import time
9 | from sklearn.model_selection import ParameterGrid, GridSearchCV, RandomizedSearchCV
10 | from sklearn.pipeline import Pipeline
11 | from sklearn.base import clone
12 | from flexml.config import TUNING_METRIC_TRANSFORMATIONS
13 | from flexml.logger import get_logger
14 | from flexml.helpers import evaluate_model_perf
15 | from copy import deepcopy
16 | from tqdm import tqdm
17 | 
18 | 
19 | class TqdmBatchCompletionCallback(BatchCompletionCallBack):  # unused here; superseded by the local callback defined in tqdm_joblib below
20 |     def __init__(self, *args, **kwargs):
21 |         super().__init__(*args, **kwargs)
22 | 
23 |     def __call__(self, *args, **kwargs):
24 |         self.tqdm_object.update(n=self.batch_size)  # NOTE: relies on a tqdm_object attribute that is never set on this class
25 |         return super().__call__(*args, **kwargs)
26 | 
27 | @contextmanager
28 | def tqdm_joblib(tqdm_object):
29 |     """Context manager to patch joblib to report into tqdm progress bar"""
30 |     class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
31 |         def __init__(self, *args, **kwargs):
32 |             super().__init__(*args, **kwargs)
33 | 
34 |         def __call__(self, *args, **kwargs):
35 |             tqdm_object.update(n=self.batch_size)
36 |             return super().__call__(*args, **kwargs)
37 | 
38 |     old_batch_callback = joblib.parallel.BatchCompletionCallBack
39 |     joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
40 |     try:
41 |         yield tqdm_object
42 |     finally:
43 |         joblib.parallel.BatchCompletionCallBack = old_batch_callback
44 |         tqdm_object.close()
45 | 
46 | 
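`tqdm_joblib` temporarily swaps joblib's batch-completion callback so that any joblib-backed work (such as the scikit-learn searches below) advances a tqdm bar. A minimal usage sketch, assuming `tqdm_joblib` is imported from this module:

```python
import joblib
from math import sqrt
from tqdm import tqdm

# The patch is active only inside the 'with' block; the original callback
# is restored (and the bar closed) on exit, even if an error is raised.
with tqdm_joblib(tqdm(total=100, desc="demo")):
    results = joblib.Parallel(n_jobs=2)(
        joblib.delayed(sqrt)(i ** 2) for i in range(100)
    )
```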
47 | class ModelTuner:
48 |     """
49 |     Implements hyperparameter tuning on machine learning models with one of the following tuning methods:
50 | 
51 |     * 'grid_search' for GridSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
52 |     Note that GridSearch optimization may take too long to finish since it tries all possible combinations of the parameters
53 | 
54 |     * 'randomized_search' for RandomizedSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
55 | 
56 |     * 'optuna' for Optuna (https://optuna.readthedocs.io/en/stable/)
57 | 
58 |     Parameters
59 |     ----------
60 |     ml_problem_type : str
61 |         The type of the machine learning problem. It can be one of the following:
62 | 
63 |         * 'Classification' for classification problems
64 | 
65 |         * 'Regression' for regression problems
66 | 
67 |     logging_to_file : bool, optional (default=False)
68 |         If True, the logs will be saved to a file in the current path, located at /logs/flexml_logs.log; otherwise, they will not be saved
69 |     """
70 |     def __init__(
71 |         self,
72 |         ml_problem_type: str,
73 |         X: Union[pd.DataFrame, np.ndarray],
74 |         y: Union[pd.DataFrame, np.ndarray],
75 |         logging_to_file: bool = False
76 |     ):
77 |         """
78 |         Parameters
79 |         ----------
80 |         ml_problem_type : str
81 |             Type of the ML problem ('Classification' or 'Regression')
82 | 
83 |         X : pd.DataFrame or np.ndarray
84 |             The feature values of the dataset
85 | 
86 |         y : pd.DataFrame or np.ndarray
87 |             The target values of the dataset
88 | 
89 |         logging_to_file : bool, optional (default=False)
90 |             Whether to log to a file
91 |         """
92 |         self.ml_problem_type = ml_problem_type.lower().capitalize()  # Normalize case
93 |         self.X = X
94 |         self.y = y
95 | 
96 |         self.logger = get_logger(__name__, "PROD", logging_to_file)
97 | 
98 |         self.eval_metrics_in_tuning_format = TUNING_METRIC_TRANSFORMATIONS.get(self.ml_problem_type)
99 |         self.reverse_signed_eval_metrics = TUNING_METRIC_TRANSFORMATIONS.get("reverse_signed_eval_metrics")
100 | 
101 |         # Revise classification metrics for multi-class classification
102 |         if self.ml_problem_type == "Classification" and self.y.nunique() > 2:
103 |             self.eval_metrics_in_tuning_format['ROC-AUC'] = 'roc_auc_ovr'
104 |             self.eval_metrics_in_tuning_format['Precision'] = 'precision_macro'
105 |             self.eval_metrics_in_tuning_format['Recall'] = 'recall_macro'
106 |             self.eval_metrics_in_tuning_format['F1 Score'] = 'f1_macro'
107 | 
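For reference, the tuning-format mapping resolved above pairs FlexML metric names with scikit-learn scorer strings. An illustrative (not exhaustive) sketch of what the multi-class override produces; the binary-case values here are assumptions, and the actual contents live in `TUNING_METRIC_TRANSFORMATIONS`:

```python
# Illustrative binary-classification scorers (assumed values)...
eval_metrics_in_tuning_format = {
    "Accuracy": "accuracy",
    "ROC-AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "F1 Score": "f1",
}
# ...which the multi-class branch above rewrites to one-vs-rest / macro scorers:
eval_metrics_in_tuning_format.update({
    "ROC-AUC": "roc_auc_ovr",
    "Precision": "precision_macro",
    "Recall": "recall_macro",
    "F1 Score": "f1_macro",
})
```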
108 |     def _param_grid_validator(
109 |         self,
110 |         model_available_params: dict,
111 |         param_grid: dict,
112 |         prefix_param_grid_flag: bool = True
113 |     ) -> dict:
114 |         """
115 |         Validates the param_grid dictionary for the model
116 | 
117 |         Parameters
118 |         ----------
119 |         model_available_params : dict
120 |             All params that the model has
121 | 
122 |         param_grid : dict
123 |             The dictionary that contains the hyperparameters and their possible values
124 | 
125 |         prefix_param_grid_flag : bool
126 |             Indicates whether param_grid keys will be modified to suit a Pipeline object by adding a 'model__' prefix to the beginning of each key
127 |         """
128 |         param_amount = len(param_grid)
129 |         if param_amount == 0:
130 |             error_msg = "Error while validating the param_grid for the model. The param_grid should not be empty"
131 |             self.logger.error(error_msg)
132 |             raise ValueError(error_msg)
133 | 
134 |         if prefix_param_grid_flag:
135 |             param_grid = {f"model__{key}": value for key, value in param_grid.items()}
136 | 
137 |         # Check that all params in param_grid are available in the model's params
138 |         for param_name in param_grid.keys():
139 |             if param_name not in model_available_params:
140 |                 error_msg = f"Error while validating the param_grid for the model. The '{param_name}' parameter is not available in the model's available params.\n \
141 |                     Available params: {list(model_available_params)}"
142 |                 self.logger.error(error_msg)
143 |                 raise ValueError(error_msg)
144 | 
145 |         return param_grid
146 | 
147 |     def _setup_tuning(
148 |         self,
149 |         tuning_method: str,
150 |         model: Union[object, Pipeline],
151 |         param_grid: dict,
152 |         n_iter: Optional[int] = None,
153 |         n_jobs: int = -1,
154 |         prefix_param_grid_flag: bool = True
155 |     ):
156 |         """
157 |         Sets up the tuning process by creating the model_stats dictionary
158 | 
159 |         Parameters
160 |         ----------
161 |         tuning_method : str
162 |             The tuning method that will be used for the optimization. It can be one of the following:
163 | 
164 |             * 'grid_search' for GridSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
165 | 
166 |             * 'randomized_search' for RandomizedSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
167 | 
168 |             * 'optuna' for Optuna (https://optuna.readthedocs.io/en/stable/)
169 | 
170 |         model : object or Pipeline
171 |             The model or Pipeline object that will be used for tuning
172 | 
173 |         n_iter : int, optional (default=None)
174 |             The number of iterations, if the chosen tuning method uses one
175 | 
176 |         n_jobs : int (default=-1)
177 |             The number of parallel jobs to run. The default is -1.
178 | 
179 |         prefix_param_grid_flag : bool
180 |             Indicates whether param_grid keys will be modified to suit a Pipeline object by adding a 'model__' prefix to the beginning of each key
181 | 
182 |         Returns
183 |         -------
184 |         model_stats: dict
185 |             Dictionary including tuning information and model:
186 | 
187 |             * 'tuning_method': The tuning method that is used for the optimization
188 | 
189 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
190 | 
191 |             * 'n_iter': The number of iterations
192 | 
193 |             * 'n_jobs': The number of parallel jobs to run
194 | 
195 |             * 'tuned_model': The tuned model object
196 | 
197 |             * 'tuned_model_score': The evaluation metric score of the tuned model
198 | 
199 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
200 |         """
201 |         model_params = None
202 | 
203 |         if isinstance(model, Pipeline):
204 |             model = model.named_steps['model']
205 | 
206 |         if "CatBoost" in model.__class__.__name__:
207 |             model_params = model.get_all_params()
208 |         else:
209 |             model_params = model.get_params()
210 | 
211 |         if prefix_param_grid_flag:
212 |             model_params = {f"model__{key}": value for key, value in model_params.items()}
213 | 
214 |         param_grid = self._param_grid_validator(
215 |             model_available_params=model_params,
216 |             param_grid=param_grid,
217 |             prefix_param_grid_flag=prefix_param_grid_flag
218 |         )
219 | 
220 |         model_stats = {
221 |             "tuning_method": tuning_method,
222 |             "tuning_param_grid": param_grid,
223 |             "n_iter": n_iter,
224 |             "n_jobs": n_jobs,
225 |             "tuned_model": None,
226 |             "tuned_model_score": None,
227 |             "tuned_model_evaluation_metric": None
228 |         }
229 | 
230 |         return model_stats
231 | 
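The `model__` prefixing above follows scikit-learn's Pipeline convention: parameters of a named step are addressed as `<step_name>__<param>`. A small sketch of the transformation applied to a user-supplied grid (the grid values are illustrative):

```python
raw_grid = {"max_depth": [3, 5, 7], "n_estimators": [100, 200]}

# With prefix_param_grid_flag=True the keys are rewritten so that
# GridSearchCV / RandomizedSearchCV can route them to the 'model' step:
prefixed = {f"model__{key}": value for key, value in raw_grid.items()}
# {'model__max_depth': [3, 5, 7], 'model__n_estimators': [100, 200]}
```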
232 |     def grid_search(
233 |         self,
234 |         pipeline: Pipeline,
235 |         param_grid: dict,
236 |         eval_metric: str,
237 |         cv: list,
238 |         n_jobs: int = -1,
239 |         verbose: int = 0
240 |     ) -> Optional[dict]:
241 |         """
242 |         Implements grid search hyperparameter optimization on the given machine learning model
243 | 
244 |         Parameters
245 |         ----------
246 |         pipeline : Pipeline
247 |             The pipeline object that includes the feature engineering steps and the model to be tuned
248 | 
249 |         param_grid : dict
250 |             The dictionary that contains the hyperparameters and their possible values
251 | 
252 |         eval_metric : str
253 |             The evaluation metric that will be used to evaluate the model. It can be one of the following:
254 | 
255 |             * 'R2' for R^2 score
256 | 
257 |             * 'MAE' for Mean Absolute Error
258 | 
259 |             * 'MSE' for Mean Squared Error
260 | 
261 |             * 'RMSE' for Root Mean Squared Error
262 | 
263 |             * 'MAPE' for Mean Absolute Percentage Error
264 | 
265 |             * 'Accuracy' for Accuracy
266 | 
267 |             * 'Precision' for Precision
268 | 
269 |             * 'Recall' for Recall
270 | 
271 |             * 'F1 Score' for F1 score
272 | 
273 |         cv : list of tuples
274 |             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
275 |             for the training and test sets for that fold. For example:
276 |             [(array([1,2,4,...]), array([0,3,6,...])), ...]
277 | 
278 |         n_jobs : int (default=-1)
279 |             The number of parallel jobs to run. The default is -1.
280 | 
281 |         verbose: int (default = 0)
282 |             The verbosity level of the tuning process. If set to 0, no logs will be shown during the tuning process; otherwise, logs are shown according to the value of the verbose parameter:
283 | 
284 |             * 1 : the computation time for each fold and parameter candidate is displayed
285 | 
286 |             * 2 : the score is also displayed
287 | 
288 |             * 3 : the fold and candidate parameter indexes are also displayed together with the starting time of the computation
289 | 
290 |         Returns
291 |         -------
292 |         model_stats: dict
293 |             Dictionary including tuning information and model:
294 | 
295 |             * 'tuning_method': The tuning method that is used for the optimization
296 | 
297 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
298 | 
299 |             * 'n_iter': The number of iterations (None for grid search, since every combination is tried)
300 | 
301 |             * 'n_jobs': The number of parallel jobs to run
302 | 
303 |             * 'tuned_model': The tuned model object
304 | 
305 |             * 'tuned_model_score': The evaluation metric score of the tuned model
306 | 
307 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
308 |         """
309 |         model_stats = self._setup_tuning("GridSearchCV", pipeline, param_grid, n_iter=None, n_jobs=n_jobs)
310 |         param_grid = model_stats['tuning_param_grid']
311 | 
312 |         try:
313 |             t_start = time()
314 | 
315 |             # Calculate total fits
316 |             total_params = len(ParameterGrid(param_grid))
317 |             n_splits = len(cv)
318 |             total_fits = total_params * n_splits
319 | 
320 |             # Create GridSearchCV object
321 |             search = GridSearchCV(
322 |                 pipeline,
323 |                 param_grid,
324 |                 scoring=self.eval_metrics_in_tuning_format,
325 |                 refit=eval_metric,
326 |                 cv=cv,
327 |                 n_jobs=n_jobs,
328 |                 verbose=verbose
329 |             )
330 | 
331 |             # Fit with progress bar
332 |             with tqdm_joblib(tqdm(
333 |                 total=total_fits,
334 |                 desc="INFO | Grid Search Progress",
335 |                 bar_format="{desc}: |{bar}| {percentage:.0f}%"
336 |             )):
337 |                 search_result = search.fit(self.X, self.y)
338 | 
339 |             t_end = time()
340 |             time_taken = round(t_end - t_start, 2)
341 | 
342 |             scores = {
343 |                 metric: (
344 |                     -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
345 |                     if metric in self.reverse_signed_eval_metrics else
346 |                     search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
347 |                 )
348 |                 for metric in list(self.eval_metrics_in_tuning_format.keys())
349 |             }
350 | 
351 |             model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model']
352 |             mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_]
353 |             model_stats['tuned_model_score'] = round(mean_score, 6)
354 |             model_stats['model_perf'] = scores
355 |             model_stats['time_taken_sec'] = time_taken
356 |             model_stats['tuned_model_evaluation_metric'] = eval_metric
357 |             return model_stats
358 |         except Exception as e:
359 |             self.logger.error(f"Error while tuning the model with GridSearchCV, Error: {e}")
360 |             return None
361 | 
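A hedged end-to-end sketch of calling this method directly; the pipeline construction, data loading, and CV splitting shown here are assumptions (FlexML builds these internally via its Regression/Classification experiments):

```python
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

# Assumed inputs: X (pd.DataFrame) and y (pd.Series) already loaded
tuner = ModelTuner("Regression", X, y)
pipeline = Pipeline([("model", DecisionTreeRegressor())])
cv = list(KFold(n_splits=5, shuffle=True, random_state=42).split(X, y))

# Keys are given unprefixed; _setup_tuning adds the 'model__' prefix
stats = tuner.grid_search(
    pipeline,
    param_grid={"max_depth": [3, 5, 7]},
    eval_metric="R2",
    cv=cv,
)
if stats is not None:
    print(stats["tuned_model"], stats["tuned_model_score"])
```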
362 |     def randomized_search(
363 |         self,
364 |         pipeline: Pipeline,
365 |         param_grid: dict,
366 |         eval_metric: str,
367 |         cv: list,
368 |         n_iter: int = 10,
369 |         n_jobs: int = -1,
370 |         verbose: int = 0
371 |     ) -> Optional[dict]:
372 |         """
373 |         Implements randomized search hyperparameter optimization on the given machine learning model
374 | 
375 |         Parameters
376 |         ----------
377 |         pipeline : Pipeline
378 |             The pipeline object that includes the feature engineering steps and the model to be tuned
379 | 
380 |         param_grid : dict
381 |             The dictionary that contains the hyperparameters and their possible values
382 | 
383 |         eval_metric : str
384 |             The evaluation metric that will be used to evaluate the model. It can be one of the following:
385 | 
386 |             * 'R2' for R^2 score
387 | 
388 |             * 'MAE' for Mean Absolute Error
389 | 
390 |             * 'MSE' for Mean Squared Error
391 | 
392 |             * 'RMSE' for Root Mean Squared Error
393 | 
394 |             * 'MAPE' for Mean Absolute Percentage Error
395 | 
396 |             * 'Accuracy' for Accuracy
397 | 
398 |             * 'Precision' for Precision
399 | 
400 |             * 'Recall' for Recall
401 | 
402 |             * 'F1 Score' for F1 score
403 | 
404 |         cv : list of tuples
405 |             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
406 |             for the training and test sets for that fold. For example:
407 |             [(array([1,2,4,...]), array([0,3,6,...])), ...]
408 | 
409 |         n_iter : int, optional (default=10)
410 |             The number of trials. The default is 10
411 | 
412 |         n_jobs : int (default=-1)
413 |             The number of parallel jobs to run. The default is -1
414 | 
415 |         Returns
416 |         -------
417 |         model_stats: dict
418 |             Dictionary including tuning information and model:
419 | 
420 |             * 'tuning_method': The tuning method that is used for the optimization
421 | 
422 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
423 | 
424 |             * 'n_jobs': The number of parallel jobs to run
425 | 
426 |             * 'tuned_model': The tuned model object
427 | 
428 |             * 'tuned_model_score': The evaluation metric score of the tuned model
429 | 
430 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
431 |         """
432 |         model_stats = self._setup_tuning("randomized_search", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs)
433 |         param_grid = model_stats['tuning_param_grid']
434 | 
435 |         t_start = time()
436 | 
437 |         # Calculate total fits
438 |         n_splits = len(cv)
439 |         total_fits = n_iter * n_splits
440 | 
441 |         # Create RandomizedSearchCV object
442 |         search = RandomizedSearchCV(
443 |             estimator=pipeline,
444 |             param_distributions=param_grid,
445 |             n_iter=n_iter,
446 |             scoring=self.eval_metrics_in_tuning_format,
447 |             refit=eval_metric,
448 |             cv=cv,
449 |             n_jobs=n_jobs,
450 |             verbose=verbose
451 |         )
452 | 
453 |         # Fit with progress bar
454 |         with tqdm_joblib(tqdm(
455 |             total=total_fits,
456 |             desc="INFO | Randomized Search Progress",
457 |             bar_format="{desc}: |{bar}| {percentage:.0f}%"
458 |         )):
459 |             search_result = search.fit(self.X, self.y)
460 | 
461 |         t_end = time()
462 |         time_taken = round(t_end - t_start, 2)
463 | 
464 |         scores = {
465 |             metric: (
466 |                 -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
467 |                 if metric in self.reverse_signed_eval_metrics else
468 |                 search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
469 |             )
470 |             for metric in list(self.eval_metrics_in_tuning_format.keys())
471 |         }
472 | 
473 |         model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model']
474 |         mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_]
475 |         model_stats['tuned_model_score'] = round(mean_score, 6)
476 |         model_stats['model_perf'] = scores
477 |         model_stats['time_taken_sec'] = time_taken
478 |         model_stats['tuned_model_evaluation_metric'] = eval_metric
479 |         return model_stats
480 | 
481 | 
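The negation in the `scores` dict above exists because scikit-learn always maximizes, so error metrics are exposed as negated scorers (e.g. `'neg_mean_squared_error'`); flipping the sign for the metrics listed in `reverse_signed_eval_metrics` recovers the raw error values. A sketch of the convention (the metric names and values here are illustrative):

```python
# cv_results_-style means as sklearn reports them (errors come back negated):
mean_test = {"R2": 0.87, "MSE": -14.2, "RMSE": -3.77}
reverse_signed = {"MAE", "MSE", "RMSE", "MAPE"}   # illustrative set

raw_scores = {
    metric: -value if metric in reverse_signed else value
    for metric, value in mean_test.items()
}
# {'R2': 0.87, 'MSE': 14.2, 'RMSE': 3.77}
```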
482 |     def optuna_search(
483 |         self,
484 |         pipeline: Pipeline,
485 |         param_grid: dict,
486 |         eval_metric: str,
487 |         cv: list,
488 |         n_iter: int = 10,
489 |         timeout: Optional[int] = None,
490 |         n_jobs: int = -1,
491 |         verbose: int = 0
492 |     ) -> Optional[dict]:
493 |         """
494 |         Implements Optuna hyperparameter optimization on the given machine learning model
495 | 
496 |         Parameters
497 |         ----------
498 |         pipeline : Pipeline
499 |             The pipeline object that includes the feature engineering steps and the model to be tuned
500 | 
501 |         param_grid : dict
502 |             The dictionary that contains the hyperparameters and their possible values
503 | 
504 |         eval_metric : str
505 |             The evaluation metric that will be used to evaluate the model. It can be one of the following:
506 | 
507 |             * 'R2' for R^2 score
508 | 
509 |             * 'MAE' for Mean Absolute Error
510 | 
511 |             * 'MSE' for Mean Squared Error
512 | 
513 |             * 'RMSE' for Root Mean Squared Error
514 | 
515 |             * 'MAPE' for Mean Absolute Percentage Error
516 | 
517 |             * 'Accuracy' for Accuracy
518 | 
519 |             * 'Precision' for Precision
520 | 
521 |             * 'Recall' for Recall
522 | 
523 |             * 'F1 Score' for F1 score
524 | 
525 |         cv : list of tuples
526 |             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
527 |             for the training and test sets for that fold. For example:
528 |             [(array([1,2,4,...]), array([0,3,6,...])), ...]
529 | 
530 |         n_iter : int, optional (default=10)
531 |             The number of trials. The default is 10
532 | 
533 |         timeout : int, optional (default=None)
534 |             The timeout in seconds. The default is None
535 | 
536 |         n_jobs : int, optional (default=-1)
537 |             The number of parallel jobs to run. The default is -1
538 | 
539 |         verbose: int (default = 0)
540 |             The verbosity level of the tuning process. If set to 0, no logs will be shown during the tuning process; otherwise, logs are shown according to the value of the verbose parameter:
541 | 
542 |             * DEBUG (equal to 4): Most detailed logging (prints almost everything)
543 | 
544 |             * INFO (equal to 3): Standard informational output
545 | 
546 |             * WARNING (equal to 2): Only warnings and errors
547 | 
548 |             * ERROR (equal to 1): Only error messages
549 | 
550 |             * CRITICAL (equal to 0): Only critical errors
551 | 
552 |         Returns
553 |         -------
554 |         model_stats: dict
555 |             Dictionary including tuning information and model:
556 | 
557 |             * 'tuning_method': The tuning method that is used for the optimization
558 | 
559 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
560 | 
561 |             * 'n_iter': The number of trials
562 | 
563 |             * 'n_jobs': The number of parallel jobs to run
564 | 
565 |             * 'tuned_model': The tuned model object
566 | 
567 |             * 'tuned_model_score': The evaluation metric score of the tuned model
568 | 
569 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
570 |         """
571 |         model_stats = self._setup_tuning("optuna", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs, prefix_param_grid_flag=False)
572 |         param_grid = model_stats['tuning_param_grid']
573 | 
574 |         # Set verbosity levels
575 |         if verbose == 0:
576 |             optuna.logging.set_verbosity(optuna.logging.CRITICAL)
577 |         elif verbose == 1:
578 |             optuna.logging.set_verbosity(optuna.logging.ERROR)
579 |         elif verbose == 2:
580 |             optuna.logging.set_verbosity(optuna.logging.WARNING)
581 |         elif verbose == 3:
582 |             optuna.logging.set_verbosity(optuna.logging.INFO)
583 |         elif verbose == 4:
584 |             optuna.logging.set_verbosity(optuna.logging.DEBUG)
585 | 
586 |         study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'] else "minimize"
587 | 
588 |         def objective(trial):
589 |             # Generate parameters for the trial
590 |             params = pipeline.named_steps['model'].get_params()
591 |             for param_name, param_values in param_grid.items():
592 |                 first_element = param_values[0]  # the first value's type decides which Optuna suggest_* call to use
593 | 
594 |                 if isinstance(first_element, (str, bool)):
595 |                     params[param_name] = trial.suggest_categorical(param_name, param_values)
596 |                 elif isinstance(first_element, int):
597 |                     # Numeric grids are treated as [low, high] ranges; assumes param_values is sorted ascending
598 |                     params[param_name] = trial.suggest_int(param_name, param_values[0], param_values[-1])
599 |                 elif isinstance(first_element, float):
600 |                     params[param_name] = trial.suggest_float(param_name, param_values[0], param_values[-1])
601 |                 else:
602 |                     info_msg = f"{param_name} parameter is not added to tuning since its type is not supported by Optuna."
603 |                     self.logger.info(info_msg)
604 | 
605 |             # Clone the entire pipeline and its steps to avoid shared state between trials
606 |             preprocessing_steps = [(name, clone(step)) for name, step in pipeline.steps[:-1]]
607 |             new_pipeline = Pipeline(
608 |                 steps=preprocessing_steps + [
609 |                     ('model', clone(pipeline.named_steps['model']).set_params(**params))
610 |                 ]
611 |             )
612 | 
613 |             # Perform cross-validation and calculate the score
614 |             scores = []
615 |             for train_idx, test_idx in cv:
616 |                 X_train, X_test = self.X.iloc[train_idx], self.X.iloc[test_idx]
617 |                 y_train, y_test = self.y.iloc[train_idx], self.y.iloc[test_idx]
618 | 
619 |                 new_pipeline.fit(X_train, y_train)
620 | 
621 |                 if self.ml_problem_type == "Classification" and hasattr(new_pipeline, 'predict_proba'):
622 |                     y_pred = new_pipeline.predict_proba(X_test)
623 |                 else:
624 |                     y_pred = new_pipeline.predict(X_test)
625 | 
626 |                 # Evaluate performance
627 |                 scores.append(evaluate_model_perf(self.ml_problem_type, y_test, y_pred))
628 | 
629 |             # Calculate the mean score across all folds
630 |             avg_metrics = {k: np.mean([m[k] if m[k] is not None else -1 for m in scores]) for k in scores[0]}
631 |             mean_score = avg_metrics.get(eval_metric, float('inf'))
632 | 
633 |             # Update the best score and model
634 |             if model_stats['tuned_model_score'] is None or (study_direction == "maximize" and mean_score > model_stats['tuned_model_score']) or (study_direction == "minimize" and mean_score < model_stats['tuned_model_score']):
635 |                 model_stats['tuned_model_score'] = round(mean_score, 6)
636 |                 model_stats['tuned_model'] = new_pipeline.named_steps['model']
637 |                 model_stats['model_perf'] = avg_metrics
638 | 
639 |             return mean_score
640 | 
641 |         try:
642 |             # Perform Optuna optimization
643 |             t_start = time()
644 |             study = optuna.create_study(direction=study_direction)
645 |             study.optimize(objective, n_trials=n_iter, timeout=timeout, n_jobs=n_jobs, show_progress_bar=True)
646 |             t_end = time()
647 | 
648 |             # Update model stats
649 |             model_stats['time_taken_sec'] = round(t_end - t_start, 2)
650 |             return model_stats
651 | 
652 |         except Exception as e:
653 |             self.logger.error(f"Error while tuning the model with Optuna, Error: {e}")
654 |             return None
--------------------------------------------------------------------------------
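Unlike the two scikit-learn searches above, optuna_search drives cross-validation manually inside `objective` and tracks the best pipeline itself, so it expects pandas inputs (note the `.iloc` indexing) and unprefixed parameter names (`prefix_param_grid_flag=False`). A hedged usage sketch, reusing the assumed `tuner`, `pipeline`, and `cv` from the earlier grid-search example:

```python
# Hypothetical usage; numeric two-element lists act as [low, high] bounds
stats = tuner.optuna_search(
    pipeline,
    param_grid={"max_depth": [3, 10]},   # int bounds -> trial.suggest_int
    eval_metric="R2",
    cv=cv,
    n_iter=25,       # number of Optuna trials
    timeout=120,     # stop after 120 seconds even if trials remain
)
if stats is not None:
    print(stats["tuned_model_score"], stats["time_taken_sec"])
```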