├── img
│   ├── .gitkeep
│   ├── flexml_banner.jpeg
│   ├── flexml_logo.jpeg
│   ├── start_guide_reg_output.jpg
│   └── start_guide_reg_tuning_output.jpg
├── tests
│   ├── __init__.py
│   ├── test_performance.py
│   ├── test_ml_models.py
│   ├── test_cross_validation.py
│   ├── test_feature_engineering.py
│   ├── test_supervised.py
│   └── test_helpers.py
├── flexml
│   ├── logs
│   │   ├── .gitkeep
│   │   └── __init__.py
│   ├── structures
│   │   └── __init__.py
│   ├── logger
│   │   ├── __init__.py
│   │   └── logger.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── supervised_config.py
│   │   └── ml_models.py
│   ├── __init__.py
│   ├── helpers
│   │   ├── __init__.py
│   │   ├── tools.py
│   │   ├── cross_validation.py
│   │   ├── supervised_helpers.py
│   │   ├── plot_model_graphs.py
│   │   └── validators.py
│   ├── classification.py
│   ├── regression.py
│   ├── _feature_engineer.py
│   └── _model_tuner.py
├── codecov.yml
├── MANIFEST.in
├── .github
│   └── workflows
│       └── tests.yml
├── pyproject.toml
├── README.md
├── .gitignore
└── LICENSE

--------------------------------------------------------------------------------
/img/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/flexml/logs/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/flexml/logs/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
comment: false

--------------------------------------------------------------------------------
/flexml/structures/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/flexml/logger/__init__.py:
--------------------------------------------------------------------------------
from flexml.logger.logger import get_logger

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE README.md
recursive-include tests *.py

--------------------------------------------------------------------------------
/img/flexml_banner.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/flexml_banner.jpeg

--------------------------------------------------------------------------------
/img/flexml_logo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/flexml_logo.jpeg

--------------------------------------------------------------------------------
/img/start_guide_reg_output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/start_guide_reg_output.jpg

--------------------------------------------------------------------------------
/img/start_guide_reg_tuning_output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozguraslank/flexml/HEAD/img/start_guide_reg_tuning_output.jpg
--------------------------------------------------------------------------------
/flexml/config/__init__.py:
--------------------------------------------------------------------------------
from flexml.config.ml_models import (
    get_ml_models
)

from flexml.config.supervised_config import (
    EVALUATION_METRICS,
    TUNING_METRIC_TRANSFORMATIONS,
    CROSS_VALIDATION_METHODS,
    FEATURE_ENGINEERING_METHODS,
    PLOT_TYPES
)

--------------------------------------------------------------------------------
/flexml/__init__.py:
--------------------------------------------------------------------------------
"""
FlexML: Easy-to-use and flexible AutoML library for Python
"""

from flexml.helpers.tools import check_numpy_dtype_error
check_numpy_dtype_error()  # Check for the chronic Colab version issue

from .regression import Regression
from .classification import Classification

__version__ = "1.1.0"

__all__ = ["Regression", "Classification"]

--------------------------------------------------------------------------------
/flexml/helpers/__init__.py:
--------------------------------------------------------------------------------
from flexml.helpers.tools import check_numpy_dtype_error
check_numpy_dtype_error()  # Check for the chronic Colab version issue

from flexml.helpers.validators import (
    eval_metric_checker,
    random_state_checker,
    cross_validation_checker,
    validate_inputs
)
from flexml.helpers.cross_validation import get_cv_splits
from flexml.helpers.supervised_helpers import evaluate_model_perf
from flexml.helpers.plot_model_graphs import (
    plot_feature_importance,
    plot_confusion_matrix,
    plot_roc_curve,
    plot_shap,
    plot_residuals,
    plot_prediction_error,
    plot_calibration_curve
)
from flexml.helpers.tools import is_interactive_notebook

--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
name: Run Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

permissions:
  id-token: write
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install uv
          python -m uv pip install .[test]

      - name: Run tests
        run: pytest --cov --cov-branch --cov-report=xml

      - name: Upload results to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}

--------------------------------------------------------------------------------
/flexml/helpers/tools.py:
--------------------------------------------------------------------------------
from IPython import get_ipython
from flexml.logger import get_logger
import warnings
warnings.filterwarnings("ignore")


def is_interactive_notebook():
    """Detects interactive environments, including Jupyter and Colab"""
    try:
        # Get the shell class name
        shell = get_ipython().__class__.__name__
        # Both Jupyter and Colab have specific shell names
        if shell in ['ZMQInteractiveShell', 'Shell']:  # ZMQ is for Jupyter, Shell is for Colab
            return True
        return False
    except Exception:
        # get_ipython() will not be defined in non-interactive environments
        return False


def check_numpy_dtype_error():
    """
    Checks if the numpy version is compatible with the pandas version in Colab
    """
    logger = get_logger(__name__, "PROD")
    try:
        shell = get_ipython().__class__.__name__
        if shell != "Shell":  # If the environment is not Colab, there is no need for this check since the issue only happens in Colab
            return

        import pandas
    except ValueError as e:  # Catch ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
        if 'numpy.dtype size changed' in str(e):
            logger.warning("Colab has a chronic version issue, restarting the kernel... (details: https://shorturl.at/ZMJBh)")
            try:
                import os
                os.kill(os.getpid(), 9)
            except Exception:  # If it fails, try to exit the program
                exit()
        else:
            raise e

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "flexml"
version = "1.1.0"
authors = [
    { name="Ozgur Aslan", email="ozguraslank@gmail.com"},
]
description = "Easy-to-use and flexible AutoML library for Python"
readme = "README.md"
requires-python = ">=3.9,<3.13"
license = { file = "LICENSE" }
keywords = ["AutoML", "Machine Learning", "Data Science", "Regression", "Classification"]
classifiers = [
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Operating System :: OS Independent",
    "License :: OSI Approved :: Apache Software License",
]

dependencies = [
    "numpy>=1.21,<=1.26.4",
    "pandas>=2.0.1,<2.2",
    "scikit-learn>=1.5.0,<=1.5.2",
    "xgboost>=2.0.0,<3.0.0",
    "lightgbm>=4.0.0",
    "catboost>=1.2.5",
    "tqdm>=4.60.0",
    "optuna>=3.0.0",
    "ipython>=7.11.0",
    "jinja2>=3.1.0",
    "nbformat>=5.10.0",
    "plotly>=6.0.0",
    "yellowbrick>=1.5",
    "shap>=0.46.0",
    "rich>=13.9.0",
    'setuptools; python_version>="3.12"',
]

[project.optional-dependencies]
test = [
    "pytest>=8.0.1",
    "parameterized>=0.8.1",
    "pytest-cov>=6.0.0",
    "seaborn>=0.13.0",
]

[project.urls]
Repository = "https://github.com/ozguraslank/flexml"
Issues = "https://github.com/ozguraslank/flexml/issues"

[tool.setuptools.packages.find]
where = ["."]
include = ["flexml*"]
exclude = ["tests*"]
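The `[test]` extra defined above is what the CI workflow installs; a minimal sketch of the same setup locally (assuming a checkout of the repository root):

```bash
pip install --upgrade pip
pip install ".[test]"   # pulls pytest, parameterized, pytest-cov and seaborn
pytest --cov            # run the suite the same way the workflow does
```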
--------------------------------------------------------------------------------
/tests/test_performance.py:
--------------------------------------------------------------------------------
from seaborn import load_dataset
import unittest
from flexml import Classification, Regression
import numpy as np

import warnings
warnings.filterwarnings("ignore")


class PerformanceTest(unittest.TestCase):
    """
    Test cases for the performance of the Classification and Regression classes
    """

    df_class = load_dataset('diamonds')
    # Set seed for reproducibility
    np.random.seed(42)
    # Randomly select 20% of the data (excluding the 'cut' target) and set it to NaN
    mask = np.random.rand(*df_class.shape) < 0.2
    mask[:, df_class.columns.get_loc('cut')] = False
    df_class.where(~mask, np.nan, inplace=True)
    fml_class = Classification(df_class, target_col='cut')

    df_regression = load_dataset('diamonds')
    # Randomly select 20% of the data (excluding the 'price' target) and set it to NaN
    mask = np.random.rand(*df_regression.shape) < 0.2
    mask[:, df_regression.columns.get_loc('price')] = False
    df_regression.where(~mask, np.nan, inplace=True)
    fml_regression = Regression(df_regression, target_col='price')

    def test_performance_classification(self):
        """
        Performance test for the Classification class
        """
        self.fml_class.start_experiment(experiment_size="wide", cv_method="holdout")

        # Calculate the average Accuracy score
        avg_accuracy = self.fml_class._model_stats_df["Accuracy"].mean()
        self.assertGreater(
            avg_accuracy,
            0.55,
            f"Average Accuracy score {avg_accuracy:.4f} is not greater than 0.55"
        )

    def test_performance_regression(self):
        """
        Performance test for the Regression class
        """
        self.fml_regression.start_experiment(experiment_size="wide", cv_method="holdout")

        # Calculate the average R2 score
        avg_r2 = self.fml_regression._model_stats_df["R2"].mean()
        self.assertGreater(
            avg_r2,
            0.75,
            f"Average R² score {avg_r2:.4f} is not greater than 0.75"
        )
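The NaN-injection idiom used in the test above is useful on its own; a self-contained sketch of injecting roughly 20% missingness into every column except the target (`target_col` is a placeholder name):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame({"a": range(10), "b": range(10), "target_col": range(10)})

mask = rng.random(df.shape) < 0.2                  # True in ~20% of the cells
mask[:, df.columns.get_loc("target_col")] = False  # never mask the target
df = df.where(~mask, np.nan)                       # replace masked cells with NaN
```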
--------------------------------------------------------------------------------
/flexml/logger/logger.py:
--------------------------------------------------------------------------------
import os
import logging

__LOG_DIR_PATH = "logs"
__LOG_FILE_PATH = os.path.join(__LOG_DIR_PATH, "flexml_logs.log")

def _logger_configuration(log_level: str, logging_to_file: bool = False):
    """
    Configures the logger to save logs to a file or not.

    Parameters
    ----------
    log_level: str,
        The log level to set for the logger. It can be either "TEST" or "PROD"

    logging_to_file : bool, (default=False)
        If True, logs are saved to /logs/flexml_logs.log. Otherwise, logs are not saved to a file.
    """
    handlers = [logging.StreamHandler()]
    log_format = None

    if log_level == "TEST":
        log_format = '%(levelname)s | %(asctime)-3s | %(name)s.%(funcName)s | %(message)-3s'
    elif log_level == "PROD":
        log_format = '%(levelname)s | %(asctime)-3s | %(message)-3s'
    else:
        raise ValueError("Invalid log level. It should be either 'TEST' or 'PROD'.")

    if logging_to_file:
        os.makedirs(__LOG_DIR_PATH, exist_ok=True)
        handlers.append(logging.FileHandler(__LOG_FILE_PATH))

    logging.basicConfig(
        level="INFO",
        format=log_format,
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=handlers,
        force=True
    )

    # Set some of the libraries' logging to ERROR level to reduce verbosity
    logging.getLogger('shap').setLevel(logging.ERROR)
    logging.getLogger('sklearn').setLevel(logging.ERROR)
    logging.getLogger("numexpr").setLevel(logging.ERROR)

def get_logger(
    name: str,
    log_level: str,
    logging_to_file: bool = False
) -> logging.Logger:
    """
    Returns a logger object with the given name

    Parameters
    ----------
    name : str
        The name of the logger (It's always the name of the class or the module)

    log_level: str
        The log level to set for the logger. It can be either "TEST" or "PROD"

        Example output for TEST
        >>> logger = get_logger("test_logger", "TEST")
        >>> logger.info("This is a test message")
        >>> INFO | 2021-07-07 12:00:00 | test_logger.<module> | This is a test message

        Example output for PROD
        >>> logger = get_logger("test_logger", "PROD")
        >>> logger.info("This is a test message")
        >>> INFO | 2021-07-07 12:00:00 | This is a test message

    logging_to_file : bool, (default=False)
        If True, logs are saved to /logs/flexml_logs.log. Otherwise, logs are not saved to a file

    Returns
    -------
    logger : logging.Logger
        The logger object with the given name
    """
    _logger_configuration(log_level, logging_to_file)
    return logging.getLogger(name)
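A short sketch of how the logger above behaves with file logging enabled (the message text and timestamp are illustrative):

```python
from flexml.logger import get_logger

# Writes to the stream handler and, because logging_to_file=True,
# also to logs/flexml_logs.log in the current path
logger = get_logger(__name__, "PROD", logging_to_file=True)
logger.info("Experiment started")  # -> INFO | 2025-01-01 12:00:00 | Experiment started
```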
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![](https://img.shields.io/pypi/v/flexml.svg) ![Python versions](https://img.shields.io/pypi/pyversions/flexml) [![Code Coverage](https://codecov.io/gh/ozguraslank/flexml/branch/main/graph/badge.svg)](https://codecov.io/gh/ozguraslank/flexml)
# FlexML

<div align="center">
<img src="/img/flexml_banner.jpeg" alt="drawing"/>
</div>

## Introduction

FlexML is an easy-to-use and flexible AutoML library for Python that simplifies the process of building machine learning models. It automates model selection and hyperparameter tuning, and it lets you customize the size of your experiments: you can train all available models in the library, or only a subset of them for faster results. FlexML adapts to your needs!

At the moment, FlexML supports only regression and classification tasks and offers two experiment modes, 'quick' and 'wide', allowing users to choose between fitting the most commonly used machine learning models in the field or the full range of models available in the library.

## How to Install

```bash
pip install flexml
```

## Start Guide with Regression Experiment

```python
# Experiment for a Regression problem on the California House Value Prediction dataset in Quick mode

from flexml import Regression
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True)['frame']

reg_exp = Regression(df, target_col="MedHouseVal")
reg_exp.start_experiment()
```
--> Once the **start_experiment()** process finishes, you will see the model leaderboard as below:

<div align="center">
<img src="/img/start_guide_reg_output.jpg" alt="drawing"/>
</div>

```python
# Get the best model
best_model = reg_exp.get_best_models()

# Get the best model by name (alternative)
best_model = reg_exp.get_model_by_name("CatBoostRegressor")

# Tune a model (by default, the best model is tuned with randomized search)
reg_exp.tune_model()
```

--> Once the **tune_model()** process finishes, you will see the updated model leaderboard as below:

<div align="center">
<img src="/img/start_guide_reg_tuning_output.jpg" alt="drawing"/>
</div>

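Predictions can then be generated straight from the experiment object; a small sketch based on the library's test suite (`new_df` is a placeholder for your own feature frame, without the target column):

```python
# Predict with the experiment's best pipeline
preds = reg_exp.predict(new_df)
```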
You can also take a look at the Jupyter notebook files in the 'notebooks' folder of the repository for more detailed explanations of the usage

## How to Contribute:

1. **Fork the repository:** Click on the 'Fork' button at the top right corner of the GitHub repository page
2. **Create a new branch:** Name your branch descriptively based on the feature or fix you're working on
3. **Make your changes:** Write code and tests to add your feature or fix the issue
   - You can take a look at the **tests** folder in the repository to see the current unit tests
4. **Run tests:** Ensure all existing and new tests pass
5. **Submit a pull request:** Open a pull request with a clear description of your changes

--------------------------------------------------------------------------------
/flexml/config/supervised_config.py:
--------------------------------------------------------------------------------
# Regression & Classification Evaluation Metrics
EVALUATION_METRICS = {
    "Regression": {"DEFAULT": "R2",
                   "ALL": ["R2", "MAE", "MSE", "RMSE", "MAPE"]},

    "Classification": {"DEFAULT": "Accuracy",
                       "ALL": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]}
}

# Model Tuning Metric Transformations
TUNING_METRIC_TRANSFORMATIONS = {
    "Regression": {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'MSE': 'neg_mean_squared_error',
        'RMSE': 'neg_root_mean_squared_error',
        'MAPE': 'neg_mean_absolute_percentage_error'
    },

    "Classification": {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1 Score': 'f1_weighted',
        'ROC-AUC': 'roc_auc'
    },

    "reverse_signed_eval_metrics": ['MAE', 'MSE', 'RMSE', 'MAPE']
    # These metrics are used in negative form during optimization, so we need to reverse the sign later, e.g. from -0.42 to 0.42
}

# Supported Cross-Validation Methods
CROSS_VALIDATION_METHODS = {
    # 'all' is used by the get_cv_splits() function at helpers/cross_validation.py, while the others are used for SupervisedBase's validations
    'all': {
        'kfold': 'kfold',
        'stratified_kfold': 'stratifiedkfold',
        'holdout': 'holdout',
        'stratified_shuffle_split': 'stratifiedshufflesplit',
        'shuffle_split': 'shufflesplit',
        'group_kfold': 'groupkfold',
        'group_shuffle_split': 'groupshufflesplit'
    },

    'Regression': {
        'kfold': 'kfold',
        'holdout': 'holdout',
        'shuffle_split': 'shufflesplit',
        'group_kfold': 'groupkfold',
        'group_shuffle_split': 'groupshufflesplit'
    },

    'Classification': {
        'kfold': 'kfold',
        'stratified_kfold': 'stratifiedkfold',
        'holdout': 'holdout',
        'shuffle_split': 'shufflesplit',
        'stratified_shuffle_split': 'stratifiedshufflesplit',
        'group_kfold': 'groupkfold',
        'group_shuffle_split': 'groupshufflesplit'
    }
}

# Feature Engineering Methods That Can Be Used
FEATURE_ENGINEERING_METHODS = {
    "accepted_numeric_imputations_methods": ['median', 'mean', 'mode', 'constant', 'drop'],
    "accepted_categorical_imputations_methods": ['mode', 'constant', 'drop'],
    "accepted_encoding_methods": ['label_encoder', 'onehot_encoder', 'ordinal_encoder'],
    "accepted_standardization_methods": ['standard_scaler', 'normalize_scaler', 'robust_scaler', 'quantile_transformer', 'minmax_scaler', 'maxabs_scaler']
}

# Supported Plot Types
PLOT_TYPES = {
    "Regression": ["feature_importance", "residuals", "prediction_error", "shap_summary", "shap_violin"],
    "Classification": ["feature_importance", "confusion_matrix", "roc_curve", "shap_summary", "shap_violin", "calibration_curve"]
}
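A small sketch of how these config dictionaries can be consumed, e.g. turning FlexML's metric names into the scikit-learn scorer strings used for tuning (variable names are illustrative):

```python
from flexml.config import EVALUATION_METRICS, TUNING_METRIC_TRANSFORMATIONS

task = "Regression"
default_metric = EVALUATION_METRICS[task]["DEFAULT"]  # 'R2'
scorer = TUNING_METRIC_TRANSFORMATIONS[task]["RMSE"]  # 'neg_root_mean_squared_error'

# Metrics optimized in negative form get their sign flipped before reporting
needs_sign_flip = "RMSE" in TUNING_METRIC_TRANSFORMATIONS["reverse_signed_eval_metrics"]  # True
```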
--------------------------------------------------------------------------------
/tests/test_ml_models.py:
--------------------------------------------------------------------------------
import unittest
import numpy as np
from parameterized import parameterized
from sklearn.datasets import load_diabetes, load_breast_cancer
from flexml.regression import Regression
from flexml.classification import Classification
from flexml.helpers import get_cv_splits
from flexml.logger import get_logger
from flexml.config import get_ml_models

import warnings
warnings.filterwarnings("ignore")

class TestMLModels(unittest.TestCase):
    logger = get_logger(__name__, "TEST", logging_to_file=False)
    logger.setLevel("DEBUG")

    test_config = {
        'Regression': {
            'data': load_diabetes(as_frame=True)['frame'],
            'target_col': 'target',
            'exp_class': Regression,
            'models': get_ml_models(ml_task_type="Regression")['WIDE']
        },
        'Classification': {
            'data': load_breast_cancer(as_frame=True)['frame'],
            'target_col': 'target',
            'exp_class': Classification,
            'models': get_ml_models(ml_task_type="Classification")['WIDE']
        }
    }

    experiments = {}
    cv_splitters = {}

    for objective, config in test_config.items():
        exp = config['exp_class'](
            data=config['data'],
            target_col=config['target_col'],
            logging_to_file=False
        )
        experiments[objective] = exp

        cv_splitters[objective] = get_cv_splits(
            df=config['data'],
            cv_method="holdout",
            test_size=0.5  # Keeping test_size high to make the training faster
        )

    @parameterized.expand([
        (objective, model_pack['name'], model_pack['model'], model_pack['tuning_param_grid'])
        for objective, config in test_config.items()
        for model_pack in config['models']
    ])
    def test_ml_models(self, objective, model_name, model, model_tuning_params):
        exp = self.experiments[objective]
        cv_splitter = self.cv_splitters[objective]

        X, y = exp.X, exp.y
        train_idx = cv_splitter[0][0]  # holdout validation returns splits in [(train_index, test_index)] format

        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]

        model.fit(X_train, y_train)

        # If it's a classification problem
        if objective == 'Classification':
            predictions = model.predict_proba(X_train)
        else:
            predictions = model.predict(X_train)

        self.assertIsInstance(predictions, np.ndarray)

        try:
            exp.tune_model(
                model=model,
                tuning_method="randomized_search",
                param_grid=model_tuning_params,
                n_iter=3,
                n_folds=3,
                n_jobs=-1
            )

        except Exception as e:
            if 'Model leaderboard is empty!' in str(e):
                # Since we don't use the start_experiment() function, there are no saved models and this error will be raised --
                # because we call _show_tuning_report when the tune_model operation is done, and that function calls get_best_models(), which calls __top_n_models_checker(), where the error is raised :)
                pass
            else:
                # Handle other exceptions
                error_msg = f"An error occurred while tuning the {model_name} model with the following param_grid {model_tuning_params}. Error: {e}"
                self.logger.error(error_msg)
                raise Exception(error_msg)
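A brief sketch of inspecting the model pool these tests iterate over (the key names follow the usage above):

```python
from flexml.config import get_ml_models

wide_models = get_ml_models(ml_task_type="Regression")["WIDE"]
for model_pack in wide_models:
    # Each pack bundles a display name, an estimator instance and its tuning grid
    print(model_pack["name"])  # estimator in model_pack["model"], grid in model_pack["tuning_param_grid"]
```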
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Addition to below
*.xml
*.iml
*.tsv
*.json
*.txt
*.tfevents
*.csv
*.idea

# generated docs items
docs/site/
docs/docs/_partials/termynal.md
docs/docs/_partials/*/*.html

# test cache
manual_test/

# other local dev info
.vscode/

# Mac OS-specific storage files
.DS_Store

# vim
*.swp
*.swo

## https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/flexml/classification.py:
--------------------------------------------------------------------------------
from flexml.structures.supervised_base import SupervisedBase

class Classification(SupervisedBase):
    """
    A class to train and evaluate different classification models.

    Parameters
    ----------
    data : pd.DataFrame
        The input data for the model training process

    target_col : str
        The target column name in the data

    random_state : int, (default=42)
        The random state for data processing processes

    drop_columns : list, default=None
        Columns that will be dropped from the data

    categorical_imputation_method : str, default='mode'
        Imputation method for categorical columns. Options:
        * 'mode': Replace missing values with the most frequent value
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    numerical_imputation_method : str, default='mean'
        Imputation method for numerical columns. Options:
        * 'mean': Replace missing values with the column mean
        * 'median': Replace missing values with the column median
        * 'mode': Replace missing values with the column mode
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    column_imputation_map : dict, default=None
        Custom mapping of columns to specific imputation methods
        Example usage: {'column_name': 'mean', 'column_name2': 'mode'}

    categorical_imputation_constant : str, default='Unknown'
        The constant value for imputing categorical columns when 'constant' is selected

    numerical_imputation_constant : float, default=0.0
        The constant value for imputing numerical columns when 'constant' is selected

    encoding_method : str, default='onehot_encoder'
        Encoding method for categorical columns. Options:
        * 'label_encoder': Use label encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
        * 'onehot_encoder': Use one-hot encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
        * 'ordinal_encoder': Use ordinal encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

    onehot_limit : int, default=25
        Maximum number of categories to use for one-hot encoding

    encoding_method_map : dict, default=None
        Custom mapping of columns to encoding methods
        Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'}

    ordinal_encode_map : dict, default=None
        Custom mapping of columns to category order for ordinal encoding
        Example usage: {'column_name': ['low', 'medium', 'high']}

    normalize : str, default=None
        Scaling method for the numerical columns. Options:
        * 'standard_scaler': Standardize the data using StandardScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        * 'minmax_scaler': Scale the data using MinMaxScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        * 'robust_scaler': Scale the data using RobustScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
        * 'quantile_transformer': Transform the data using QuantileTransformer
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
        * 'maxabs_scaler': Scale the data using MaxAbsScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
        * 'normalize_scaler': Normalize the data to unit length
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html

    shuffle: bool, (default=True)
        If True, the data will be shuffled before the model training process

    logging_to_file: bool, (default=False)
        If True, the logs will be saved to a file in the current path, located at /logs/flexml_logs.log. Otherwise, they will not be saved

    Example
    -------
    >>> from flexml import Classification
    >>> df = pd.read_csv("MY_DATA.csv")
    >>> classification_exp = Classification(data=df, target_col='target_col')
    >>> classification_exp.start_experiment(experiment_size = 'quick')
    >>> classification_exp.show_model_stats(eval_metric='accuracy')

    ------------------------------------------------------------
    | model_name            |accuracy|precision|recall|f1_score|
    ------------------------|--------|---------|------|--------|
    | LogisticRegression    | 0.7863 | 0.6721  |0.5921| 0.2469 |
    | DecisionTreeClassifier| 0.7725 | 0.6441  |0.4642| 0.4347 |
    | LGBMClassifier        | 0.7521 | 0.4751  |0.3531| 0.1445 |
    | RidgeClassifier       | 0.7011 | 0.7590  |0.6155| 0.3411 |
    | XGBClassifier         | 0.6213 | 0.4701  |0.2923| 0.4039 |
    ------------------------------------------------------------
    >>> best_model = classification_exp.get_best_models(eval_metric = 'accuracy')
    """
    pass
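A hedged sketch that exercises the preprocessing parameters documented above (the dataframe and column names are placeholders):

```python
from flexml import Classification

exp = Classification(
    data=df,
    target_col="churn",
    numerical_imputation_method="median",
    encoding_method_map={"plan": "ordinal_encoder"},
    ordinal_encode_map={"plan": ["basic", "standard", "premium"]},
    normalize="robust_scaler",
)
exp.start_experiment(experiment_size="quick")
```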
--------------------------------------------------------------------------------
/flexml/regression.py:
--------------------------------------------------------------------------------
from flexml.structures.supervised_base import SupervisedBase

class Regression(SupervisedBase):
    """
    A class to train and evaluate different regression models

    Parameters
    ----------
    data : pd.DataFrame
        The input data for the model training process

    target_col : str
        The target column name in the data

    random_state : int, (default=42)
        The random state for data processing processes

    drop_columns : list, default=None
        Columns that will be dropped from the data

    categorical_imputation_method : str, default='mode'
        Imputation method for categorical columns. Options:
        * 'mode': Replace missing values with the most frequent value
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    numerical_imputation_method : str, default='mean'
        Imputation method for numerical columns. Options:
        * 'mean': Replace missing values with the column mean
        * 'median': Replace missing values with the column median
        * 'mode': Replace missing values with the column mode
        * 'constant': Replace missing values with a constant value
        * 'drop': Drop rows with missing values

    column_imputation_map : dict, default=None
        Custom mapping of columns to specific imputation methods
        Example usage: {'column_name': 'mean', 'column_name2': 'mode'}

    categorical_imputation_constant : str, default='Unknown'
        The constant value for imputing categorical columns when 'constant' is selected

    numerical_imputation_constant : float, default=0.0
        The constant value for imputing numerical columns when 'constant' is selected

    encoding_method : str, default='onehot_encoder'
        Encoding method for categorical columns. Options:
        * 'label_encoder': Use label encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
        * 'onehot_encoder': Use one-hot encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
        * 'ordinal_encoder': Use ordinal encoding
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

    onehot_limit : int, default=25
        Maximum number of categories to use for one-hot encoding

    encoding_method_map : dict, default=None
        Custom mapping of columns to encoding methods
        Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'}

    ordinal_encode_map : dict, default=None
        Custom mapping of columns to category order for ordinal encoding
        Example usage: {'column_name': ['low', 'medium', 'high']}

    normalize : str, default=None
        Scaling method for the numerical columns. Options:
        * 'standard_scaler': Standardize the data using StandardScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        * 'minmax_scaler': Scale the data using MinMaxScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        * 'robust_scaler': Scale the data using RobustScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
        * 'quantile_transformer': Transform the data using QuantileTransformer
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
        * 'maxabs_scaler': Scale the data using MaxAbsScaler
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
        * 'normalize_scaler': Normalize the data to unit length
            * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html

    shuffle: bool, (default=True)
        If True, the data will be shuffled before the model training process

    logging_to_file: bool, (default=False)
        If True, the logs will be saved to a file in the current path, located at /logs/flexml_logs.log. Otherwise, they will not be saved

    Example
    -------
    >>> from flexml import Regression
    >>> df = pd.read_csv("MY_DATA.csv")
    >>> reg_exp = Regression(data=df, target_col='target_col')
    >>> reg_exp.start_experiment(experiment_size = 'quick')
    >>> reg_exp.show_model_stats(eval_metric='r2')

    ---------------------------------------------------------------------
    | model_name            |   r2   |   mae   | mse  |  rmse  |  mape  |
    ------------------------|--------|---------|------|--------|--------|
    | LinearRegression      | 0.7863 | 0.6721  |0.5921| 0.2469 | 0.2011 |
    | DecisionTreeRegressor | 0.7725 | 0.6441  |0.4642| 0.4347 | 0.3011 |
    | LGBMRegressor         | 0.7521 | 0.4751  |0.3531| 0.1445 | 0.1011 |
    | Ridge                 | 0.7011 | 0.7590  |0.6155| 0.3411 | 0.2011 |
    | XGBRegressor          | 0.6213 | 0.4701  |0.2923| 0.4039 | 0.3011 |
    | DecisionTreeRegressor | 0.6096 | 0.4541  |0.2821| 0.4011 | 0.3011 |
    | ElasticNet            | 0.5812 | 0.4201  |0.2111| 0.3011 | 0.2011 |
    | Lasso                 | 0.5209 | 0.4101  |0.2011| 0.2911 | 0.2011 |
    ---------------------------------------------------------------------
    >>> best_model = reg_exp.get_best_models(eval_metric = 'r2')
    """
    pass
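A hedged sketch of a group-aware regression run; the parameter names follow the library's test suite, and `store_id` is a placeholder column:

```python
from flexml import Regression

reg = Regression(df, target_col="sales")
# Keep all rows of the same store in the same fold
reg.start_experiment(
    experiment_size="quick",
    cv_method="group_kfold",
    n_folds=5,
    groups_col="store_id",
)
```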
--------------------------------------------------------------------------------
/flexml/helpers/cross_validation.py:
--------------------------------------------------------------------------------
import pandas as pd
from typing import Optional, Any, Iterator
from sklearn.model_selection import (KFold, StratifiedKFold, ShuffleSplit,
                                     StratifiedShuffleSplit, train_test_split,
                                     GroupKFold, GroupShuffleSplit)
from flexml.config import CROSS_VALIDATION_METHODS
from flexml.helpers import cross_validation_checker
from flexml.logger import get_logger


def get_cv_splits(
    df: pd.DataFrame,
    cv_method: str = "kfold",
    n_folds: Optional[int] = None,
    test_size: Optional[float] = None,
    y_array: Optional[pd.Series] = None,
    groups_col: Optional[str] = None,
    random_state: Optional[int] = None,
    shuffle: bool = True,
    ml_task_type: Optional[str] = None,
    logging_to_file: bool = False
) -> Iterator[Any]:
    """
    Returns indices for cross-validation splits according to the specified method and parameters.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset (features and target combined)

    cv_method : str, (default='kfold' for Regression and 'stratified_kfold' for Classification if `ml_task_type` is provided, else 'kfold')
        Cross-validation method to use. Options:
        - For Regression:
            - "kfold" (default) (Provide `n_folds`)
            - "holdout" (Provide `test_size`)
            - "shuffle_split" (Provide `n_folds` and `test_size`)
            - "group_kfold" (Provide `n_folds` and `groups_col`)
            - "group_shuffle_split" (Provide `n_folds`, `test_size`, and `groups_col`)

        - For Classification:
            - "kfold" (Provide `n_folds`)
            - "stratified_kfold" (default) (Provide `n_folds`)
            - "holdout" (Provide `test_size`)
            - "stratified_shuffle_split" (Provide `n_folds` and `test_size`)
            - "group_kfold" (Provide `n_folds` and `groups_col`)
            - "group_shuffle_split" (Provide `n_folds`, `test_size`, and `groups_col`)

    n_folds : int, optional (default=None for hold-out validation, 5 for the other cv methods)
        Number of splits/folds for methods that use folds. Default is 5

    test_size : float, optional
        The test size to use for holdout, shuffle-based methods, or group shuffle split

    y_array : pd.Series or array-like, optional
        The target variable. Required for stratified splits to ensure class balance in each fold

    groups_col : str, optional
        The name of the column in `df` that contains group labels. Required for group-based methods

    random_state : int, optional (default=None)
        The random state value for the data processing process (Ignored if 'shuffle' is set to False)

    shuffle: bool, (default=True)
        If True, the data will be shuffled before the model training process

    ml_task_type : str, optional
        The type of ML task. Options: "Regression" or "Classification"

        If you don't pass a value, the function won't accept a None value for cv_method, since it won't know the default cv method for your task

        If you pass a value, the default `cv_method` will be set based on the task type:
        - "Regression" => "kfold"
        - "Classification" => "stratified_kfold"

    logging_to_file : bool, optional
        Whether to log to file or not. Default is False

    Returns
    -------
    generator
        A generator that yields (train_index, test_index) for each split (a single-element list for holdout)
    """
    logger = get_logger(__name__, "PROD", logging_to_file)
    valid_methods = CROSS_VALIDATION_METHODS.get('all')

    cv_method = cross_validation_checker(
        df=df,
        cv_method=cv_method,
        n_folds=n_folds,
        test_size=test_size,
        groups_col=groups_col,
        available_cv_methods=valid_methods,
        ml_task_type=ml_task_type
    )

    if cv_method == 'holdout' and not test_size:
        test_size = 0.25

    if cv_method == 'holdout' and test_size and n_folds:
        logger.warning(f"Both 'n_folds' and 'test_size' provided for the {cv_method} validation method. Ignoring 'n_folds'")
        n_folds = None

    if cv_method == 'kfold' and test_size:
        logger.warning(f"Both 'n_folds' and 'test_size' provided for the {cv_method} method. Ignoring 'test_size'")
        test_size = None

    if cv_method != 'holdout' and not n_folds:
        n_folds = 5

    if cv_method in ["stratified_kfold", "stratified_shuffle_split"] and y_array is None:
        error_msg = "`y_array` must be provided for stratified methods"
        logger.error(error_msg)
        raise ValueError(error_msg)

    groups = df[groups_col].values if groups_col else None
    if groups is not None and cv_method not in ["group_kfold", "group_shuffle_split"]:
        logger.warning(f"'groups_col' provided even though 'cv_method' is {cv_method}. Ignoring 'groups_col'")
        groups = None

    if cv_method == "kfold":
        splitter = KFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle)
        return splitter.split(df)

    elif cv_method == "shuffle_split":
        splitter = ShuffleSplit(n_splits=n_folds, test_size=test_size, random_state=random_state)
        return splitter.split(df)

    elif cv_method == "stratified_shuffle_split":
        splitter = StratifiedShuffleSplit(n_splits=n_folds, test_size=test_size, random_state=random_state)
        return splitter.split(df, y_array)

    elif cv_method == "group_shuffle_split":
        splitter = GroupShuffleSplit(n_splits=n_folds, test_size=test_size, random_state=random_state)
        return splitter.split(df, groups=groups)

    elif cv_method == "stratified_kfold":
        splitter = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle)
        return splitter.split(df, y_array)

    elif cv_method == "group_kfold":
        splitter = GroupKFold(n_splits=n_folds)
        return splitter.split(df, groups=groups)

    elif cv_method == "holdout":
        train_index, test_index = train_test_split(
            df.index,
            test_size=test_size,
            shuffle=shuffle,
            random_state=random_state,
            stratify=y_array if cv_method == "stratified_kfold" else None
        )
        return [(train_index, test_index)]
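A hedged sketch of consuming `get_cv_splits()` directly (`df` is a placeholder frame):

```python
from flexml.helpers import get_cv_splits

# K-fold: iterate over (train_index, test_index) pairs
for train_idx, test_idx in get_cv_splits(df, cv_method="kfold", n_folds=5, random_state=42):
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

# Holdout: a single (train_index, test_index) pair wrapped in a list
(train_idx, test_idx), = get_cv_splits(df, cv_method="holdout", test_size=0.25)
```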
"group_kfold", {"n_splits": 3, "groups_col": "group"}), 31 | ("Classification", "group_shuffle_split", {"n_splits": 3, "test_size": 0.25, "groups_col": "group"}), 32 | ("Classification", "holdout", {"test_size": 0.25}), 33 | ("Regression", "kfold", {"n_splits": 3}), 34 | ("Regression", "shuffle_split", {"n_splits": 3, "test_size": 0.25}), 35 | ("Regression", "group_kfold", {"n_splits": 3, "groups_col": "group"}), 36 | ("Regression", "group_shuffle_split", {"n_splits": 3, "test_size": 0.2, "groups_col": "group"}), 37 | ("Regression", "holdout", {"test_size": 0.25}), 38 | 39 | # Edge cases 40 | ("Regression", "holdout", {"test_size": None, "n_splits": 3}), # holdout but no test_size given 41 | ("Regression", "holdout", {"test_size": 0.25, "n_splits": 3}), # holdout but n_splits given 42 | ("Regression", "kfold", {"test_size": 0.25, "n_splits": 3}), # kfold but test_size given 43 | ("Regression", "kfold", {"n_splits": None}), # kfold but no n_splits given 44 | ("Regression", "holdout", {"groups_col": "group"}) # not a group cross-validation but groups_col given 45 | ]) 46 | def test_cross_validation( 47 | self, 48 | ml_task_type: str, 49 | cv_method: Optional[str], 50 | params: dict 51 | ): 52 | target_col = "target" 53 | 54 | if ml_task_type == "Classification": 55 | df = self.classification_data.copy() 56 | 57 | # Skip Stratified methods if classes are not sufficiently populated 58 | has_sufficient_class_instances = not ("Stratified" in cv_method and (df["target"].value_counts() < 2).any()) 59 | self.assertTrue( 60 | has_sufficient_class_instances, 61 | f"{cv_method} couldn't be executed due to insufficient class instances, please take a look to data used for the test" 62 | ) 63 | 64 | experiment_object = Classification(df, target_col) 65 | 66 | else: # Classification 67 | self.assertNotIn( 68 | "Stratified", 69 | cv_method, 70 | f"Stratified methods are for Classification only. 
You've passed {cv_method} for Regression" 71 | ) 72 | 73 | df = self.regression_data.copy() 74 | experiment_object = Regression(df, target_col) 75 | 76 | experiment_object.start_experiment( 77 | experiment_size="wide", 78 | cv_method=cv_method, 79 | n_folds=params.get("n_splits"), 80 | test_size=params.get("test_size"), 81 | groups_col=params.get("groups_col") 82 | ) 83 | 84 | predictions = experiment_object.predict(df.drop(columns=[target_col]), full_train=False) 85 | self.assertIsInstance(predictions, np.ndarray) 86 | 87 | @parameterized.expand([ 88 | ("test_invalid_cv_method", "X", {}, ValueError), 89 | ("test_invalid_n_folds", "kfold", {"n_folds": 1}, ValueError), 90 | ("test_invalid_test_size", "holdout", {"test_size": 1.1}, ValueError), 91 | ("test_invalid_groups_col", "group_kfold", {"n_folds": 3, "groups_col": "X"}, ValueError), 92 | ("test_missing_groups_col_for_group_shuffle_split", "group_shuffle_split", {"n_folds": 3}, ValueError), 93 | ("test_missing_groups_col_for_group_kfold", "group_kfold", {"n_folds": 3, "test_size": 0.25}, ValueError), 94 | ("test_default_cv_for_classification", None, {"ml_task_type": "Classification"}, "stratified_kfold"), 95 | ("test_invalid_ml_task_type", "kfold", {"ml_task_type": "X"}, ValueError), 96 | ("test_normalize_stratified_kfold_name", "stratifiedkfold", {"ml_task_type": "Classification"}, "stratified_kfold") 97 | ]) 98 | def test_expected_results(self, test_name: str, cv_method: str, params: dict, expected_result: Union[str, Exception]): 99 | if isinstance(expected_result, type) and issubclass(expected_result, BaseException): # Your IDE might say 'code is not reachable' here, but Its 100 | with self.assertRaises(expected_result): 101 | cross_validation_checker( 102 | df=self.regression_data, 103 | cv_method=cv_method, 104 | **params 105 | ) 106 | else: 107 | result = cross_validation_checker( 108 | df=self.regression_data, 109 | cv_method=cv_method, 110 | **params 111 | ) 112 | self.assertEqual(result, expected_result) 113 | 114 | @parameterized.expand([ 115 | ("test_cv_with_none_nfolds", "kfold", {"n_folds": None}, GeneratorType), 116 | ("test_stratified_without_y_array", "stratified_kfold", {}, ValueError), 117 | ("test_holdout_returns_generator", "holdout", {"test_size": 0.25}, list) 118 | ]) 119 | def test_get_cv_splits(self, test_name: str, cv_method: str, params: dict, expected_result: Union[str, Exception]): 120 | if issubclass(expected_result, BaseException): # Your IDE might say 'code is not reachable' here, but Its 121 | with self.assertRaises(expected_result): 122 | get_cv_splits( 123 | df=self.regression_data, 124 | cv_method=cv_method, 125 | **params 126 | ) 127 | else: 128 | splits = get_cv_splits( 129 | df=self.regression_data, 130 | cv_method=cv_method, 131 | **params 132 | ) 133 | self.assertIsInstance(splits, expected_result) -------------------------------------------------------------------------------- /flexml/helpers/supervised_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from typing import Union 4 | 5 | from sklearn.metrics import ( 6 | r2_score, 7 | mean_absolute_error, 8 | mean_squared_error, 9 | accuracy_score, 10 | precision_score, 11 | recall_score, 12 | f1_score, 13 | roc_auc_score) 14 | 15 | 16 | def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float: 17 | """ 18 | Computes the Mean Absolute Percentage Error (MAPE) while ignoring zero values in y_true since MAPE is 
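A small sketch of the validator these tests exercise; it normalizes and validates a cv method name for a task (usage mirrors the tests above, `df` is a placeholder frame):

```python
from flexml.helpers import cross_validation_checker

# Returns the canonical method name, or raises ValueError for invalid input
method = cross_validation_checker(
    df=df,
    cv_method="stratifiedkfold",   # normalized to 'stratified_kfold'
    ml_task_type="Classification",
)
```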
--------------------------------------------------------------------------------
/flexml/helpers/supervised_helpers.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from typing import Union

from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score)


def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float:
    """
    Computes the Mean Absolute Percentage Error (MAPE) while ignoring zero values in y_true, since MAPE is undefined for zero values.

    Parameters
    ----------
    y_true : pd.Series or np.ndarray
        The actual values of the target column

    y_pred : pd.Series or np.ndarray
        The predicted values of the target column

    Returns
    -------
    float
        The MAPE score
    """
    mask = y_true != 0  # Ignore zero values in y_true
    return round(np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])), 6)

def _evaluate_preds(
    y_true: Union[pd.Series, np.ndarray],
    y_pred: Union[pd.Series, np.ndarray],
    eval_metric: str,
    average: str = 'macro'
) -> float:
    """
    Evaluates the model with the given evaluation metric by using the test set

    Parameters
    ----------
    y_true : pd.Series or np.ndarray
        The actual values of the target column

    y_pred : pd.Series or np.ndarray
        The predicted values/probabilities of the target column

    eval_metric : str
        The evaluation metric that will be used to evaluate the model

        - Available evaluation metrics for Regression:
            - R2, MAE, MSE, RMSE, MAPE

        - Available evaluation metrics for Classification:
            - Accuracy, Precision, Recall, F1 Score, ROC-AUC

    average : str, default='macro'
        The averaging method to use for multiclass classification metrics.
        Options are ['binary', 'micro', 'macro', 'weighted'].
        For binary classification, 'binary' is recommended.
        For multiclass, 'macro' treats all classes equally.

    Returns
    -------
    float
        The evaluation metric score for the desired eval metric
    """
    if eval_metric == 'R2':
        return round(r2_score(y_true, y_pred), 6)
    elif eval_metric == 'MAE':
        return round(mean_absolute_error(y_true, y_pred), 6)
    elif eval_metric == 'MSE':
        return round(mean_squared_error(y_true, y_pred), 6)
    elif eval_metric == 'RMSE':
        return round(np.sqrt(mean_squared_error(y_true, y_pred)), 6)
    elif eval_metric == 'MAPE':
        return _safe_mape(y_true, y_pred)
    elif eval_metric == 'Accuracy':
        return round(accuracy_score(y_true, y_pred), 6)
    elif eval_metric == 'Precision':
        return round(precision_score(y_true, y_pred, average=average), 6)
    elif eval_metric == 'Recall':
        return round(recall_score(y_true, y_pred, average=average), 6)
    elif eval_metric == 'F1 Score':
        return round(f1_score(y_true, y_pred, average=average), 6)
    elif eval_metric == 'ROC-AUC':
        if len(y_pred.shape) > 1:  # If probabilities are returned
            if y_pred.shape[1] >= 3:  # If there are 3 or more classes
                return round(roc_auc_score(y_true, y_pred, average=average, multi_class='ovr'), 6)
            elif y_pred.shape[1] == 2:  # If there are 2 classes
                return round(roc_auc_score(y_true, y_pred[:, 1]), 6)
        else:  # If class labels are returned, ROC-AUC is not applicable (some models don't have a predict_proba method)
            return -1.0
    else:
        raise ValueError(f"Error while evaluating the current model. The eval_metric should be one of the following: 'R2', 'MAE', 'MSE', 'RMSE', 'MAPE', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'. Got {eval_metric}")

def evaluate_model_perf(
    ml_task_type,
    y_test,
    y_pred
) -> dict:
    """
    Evaluates how good the predictions are by comparing them with the actual values, and returns the evaluation scores for the current task

    Parameters
    ----------
    ml_task_type : str
        The type of the machine learning task. It can be either 'Regression' or 'Classification'

    y_test : np.ndarray
        The actual values of the target column.

    y_pred : np.ndarray
        For regression tasks: The predicted values of the target column.
        For classification tasks: The predicted probabilities for each class.
        Note: Some models like Perceptron, PassiveAggressiveClassifier, etc. don't have a predict_proba method, so they return class labels directly.

    Returns
    -------
    dict
        A dictionary containing the evaluation metrics of the current task

        * R2, MAE, MSE, RMSE, MAPE for Regression tasks

        * Accuracy, Precision, Recall, F1 Score, ROC-AUC for Classification tasks
    """

    if ml_task_type == "Regression":
        r2 = _evaluate_preds(y_test, y_pred, 'R2')
        mae = _evaluate_preds(y_test, y_pred, 'MAE')
        mse = _evaluate_preds(y_test, y_pred, 'MSE')
        rmse = _evaluate_preds(y_test, y_pred, 'RMSE')
        mape = _evaluate_preds(y_test, y_pred, 'MAPE')
        return {
            "R2": r2,
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "MAPE": mape
        }

    else:  # Classification
        # Convert probabilities to class labels for all metrics except ROC-AUC if y_pred contains probabilities
        if len(y_pred.shape) > 1:
            y_pred_labels = np.argmax(y_pred, axis=1)
        else:
            y_pred_labels = (y_pred > 0.5).astype(int)

        # Determine the appropriate averaging method based on the number of classes
        n_classes = len(np.unique(y_test))
        avg_method = 'binary' if n_classes == 2 else 'macro'

        # Use labels for the standard classification metrics
        accuracy = _evaluate_preds(y_test, y_pred_labels, 'Accuracy')
        precision = _evaluate_preds(y_test, y_pred_labels, 'Precision', average=avg_method)
        recall = _evaluate_preds(y_test, y_pred_labels, 'Recall', average=avg_method)
        f1 = _evaluate_preds(y_test, y_pred_labels, 'F1 Score', average=avg_method)

        # Use probabilities for ROC-AUC
        roc_auc = _evaluate_preds(y_test, y_pred, 'ROC-AUC', average=avg_method)

        return {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC-AUC": roc_auc
        }
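A tiny sketch of calling the scorer above directly (toy arrays, illustrative only):

```python
import numpy as np
from flexml.helpers import evaluate_model_perf

y_test = np.array([3.1, 2.4, 5.0])
y_pred = np.array([3.0, 2.5, 4.8])

scores = evaluate_model_perf("Regression", y_test, y_pred)
print(scores["RMSE"], scores["MAPE"])  # the dict also holds R2, MAE and MSE
```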
np.random.choice(['High', 'Medium', 'Low'], n_rows),
24 | 'score': np.random.randint(0, 100, n_rows),
25 | 'amount': np.random.uniform(10, 1000, n_rows),
26 | 'target': np.random.choice([0, 1], n_rows)
27 | })
28 | 
29 | # Create artificial null values within the dataframe
30 | for column in df.columns:
31 | if column not in ['id', 'target']:
32 | mask = np.random.random(n_rows) < 0.2
33 | df.loc[mask, column] = np.nan
34 | 
35 | encoding_methods = ['label_encoder', 'onehot_encoder', 'ordinal_encoder']
36 | imputation_methods = ['mean', 'median', 'mode', 'constant', 'drop']
37 | normalization_methods = ['standard_scaler', 'minmax_scaler', 'robust_scaler', 'quantile_transformer', 'maxabs_scaler', 'normalize_scaler']
38 | 
39 | def test_feature_engineering_with_inputs(self):
40 | """
41 | End-to-end test for the FeatureEngineering pipeline with explicit user inputs
42 | """
43 | feature_exp = FeatureEngineering(
44 | self.df,
45 | target_col='target',
46 | drop_columns=['id'],
47 | column_imputation_map={'status': 'constant','amount': 'constant'},
48 | categorical_imputation_constant='test_constant',
49 | numerical_imputation_constant=-1,
50 | encoding_method_map={'category_default': 'ordinal_encoder', 'priority': 'onehot_encoder'},
51 | ordinal_encode_map={'category_default': ['A', 'C', 'B']},
52 | onehot_limit=3,
53 | normalize='normalize_scaler'
54 | )
55 | 
56 | feature_exp.setup()
57 | 
58 | X_train, y_train = feature_exp.fit_transform()
59 | lr = LogisticRegression(max_iter=500).fit(X_train, y_train)
60 | 
61 | # Check if all columns are numerical, including target
62 | self.assertFalse(
63 | X_train.select_dtypes(exclude=[np.number]).columns.tolist(),
64 | "Not all columns are numerical"
65 | )
66 | 
67 | # Check if there are any null values
68 | self.assertFalse(
69 | X_train.isnull().any().any(),
70 | "There are null values in the processed data"
71 | )
72 | 
73 | def test_feature_engineering_without_inputs(self):
74 | """
75 | End-to-end test for the FeatureEngineering pipeline with default settings
76 | """
77 | feature_exp = FeatureEngineering(self.df, target_col='target')
78 | feature_exp.setup()
79 | 
80 | X_train, y_train = feature_exp.fit_transform()
81 | lr = LogisticRegression(max_iter=500).fit(X_train, y_train)
82 | 
83 | # Check if all columns are numerical, including target
84 | self.assertFalse(
85 | X_train.select_dtypes(exclude=[np.number]).columns.tolist(),
86 | "Not all columns are numerical"
87 | )
88 | 
89 | # Check if there are any null values
90 | self.assertFalse(
91 | X_train.isnull().any().any(),
92 | "There are null values in the processed data"
93 | )
94 | 
95 | def test_feature_engineering_with_dynamic_inputs(self):
96 | """
97 | Dynamic end-to-end test for the FeatureEngineering pipeline across all encoding, imputation, and normalization methods
98 | """
99 | # Nested loops for encoding, imputation, and normalization methods
100 | for encoding_method in self.encoding_methods:
101 | for imputation_method in self.imputation_methods:
102 | for normalization_method in self.normalization_methods:
103 | encoding_method_map = {'category_default': encoding_method, 'priority': encoding_method}
104 | ordinal_encode_map = None
105 | 
106 | # Handle specific cases for encoding methods
107 | if encoding_method == 'ordinal_encoder':
108 | ordinal_encode_map = {'priority': ['Low', 'Medium', 'High'], 'category_default':['A','C','B']}
109 | 
110 | # Distinguish between categorical and numerical imputation methods
111 | if imputation_method in ['mode', 'constant', 'drop']:
112 | column_imputation_map = 
{'status': imputation_method, 'amount': 'mean'} 113 | elif imputation_method in ['mean', 'median']: 114 | column_imputation_map = {'status': 'mode', 'amount': imputation_method} 115 | 116 | with self.subTest(encoding_method=encoding_method, imputation_method=imputation_method, normalization_method=normalization_method): 117 | feature_test = FeatureEngineering( 118 | data=self.df, 119 | target_col='target', 120 | drop_columns=['id'], 121 | column_imputation_map=column_imputation_map, 122 | categorical_imputation_constant='test_constant', 123 | numerical_imputation_constant=-1, 124 | encoding_method_map=encoding_method_map, 125 | ordinal_encode_map=ordinal_encode_map, 126 | onehot_limit=3, 127 | normalize=normalization_method 128 | ) 129 | feature_test.setup() 130 | 131 | X_train, y_train = feature_test.fit_transform() 132 | lr = LogisticRegression(max_iter=500).fit(X_train, y_train) 133 | 134 | # Check if all columns are numerical, including target 135 | self.assertFalse( 136 | X_train.select_dtypes(exclude=[np.number]).columns.tolist(), 137 | f"Not all columns are numerical. Failed parameters are: " 138 | f"Encoding method: {encoding_method}, " 139 | f"Imputation method: {imputation_method}, " 140 | f"Normalization method: {normalization_method}" 141 | ) 142 | 143 | # Check if there are any null values 144 | self.assertFalse( 145 | X_train.isnull().any().any(), 146 | f"There are null values in the processed data. Failed parameters are: " 147 | f"Encoding method: {encoding_method}, " 148 | f"Imputation method: {imputation_method}, " 149 | f"Normalization method: {normalization_method}" 150 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. -------------------------------------------------------------------------------- /tests/test_supervised.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import unittest 4 | import numpy as np 5 | from parameterized import parameterized 6 | from sklearn.datasets import load_diabetes, load_breast_cancer, load_iris 7 | from flexml import Regression, Classification 8 | from flexml.logger import get_logger 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | 12 | 13 | class TestRegression(unittest.TestCase): 14 | logger = get_logger(__name__, "TEST") 15 | logger.setLevel("DEBUG") 16 | 17 | test_config = { 18 | 'Regression': { 19 | 'data': load_diabetes(as_frame=True)['frame'], 20 | 'target_col': 'target', 21 | 'exp_obj': None 22 | }, 23 | 'BinaryClassification': { 24 | 'data': load_breast_cancer(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'No', 1: 'Yes'})), 25 | 'target_col': 'target', 26 | 'exp_obj': None 27 | }, 28 | 'MulticlassClassification': { 29 | 'data': load_iris(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'Iris-Setosa', 1: 'Iris-Versicolor', 2: 'Iris-Virginica'})), 30 | 'target_col': 'target', 31 | 'exp_obj': None 32 | } 33 | } 34 | 35 | n_folds = 3 36 | 37 | @parameterized.expand(list(test_config.keys())) 38 | def test_01_supervised(self, objective: str): 39 | df = self.test_config[objective].get('data') 40 | target_col = self.test_config[objective].get('target_col') 41 | exp_size = "wide" 42 | 43 | if objective == 'Regression': 44 | exp_obj = Regression( 45 | data = df, 46 | target_col = target_col 47 | ) 48 | else: # BinaryClassification or MulticlassClassification 49 | exp_obj = Classification( 50 | data = df, 51 | target_col = target_col 52 | ) 53 | 54 | exp_obj.start_experiment( 55 | 
experiment_size = exp_size,
56 | n_folds = self.n_folds,
57 | eval_metric = "RMSE" if objective == "Regression" else "Accuracy"
58 | )
59 | 
60 | top_x_models = exp_obj.get_best_models(top_n_models = 3)
61 | self.assertEqual(
62 | len(top_x_models), 3,
63 | f"An error occurred while retrieving the best models in {exp_size} {objective}, expected 3, got {len(top_x_models)}"
64 | )
65 | 
66 | exp_obj.show_model_stats()
67 | 
68 | tuning_methods = ["grid_search", "randomized_search", "optuna"]
69 | for method in tuning_methods:
70 | if method == "grid_search":
71 | model = "LGBMRegressor" if objective == "Regression" else "LGBMClassifier"
72 | param_grid = {
73 | "n_estimators": [100, 200],
74 | "max_depth": [3, 5],
75 | "learning_rate": [0.5, 0.1]
76 | }
77 | exp_obj.tune_model(model=model, tuning_method=method, param_grid=param_grid, n_folds=self.n_folds, n_iter=3)
78 | else:
79 | exp_obj.tune_model(tuning_method=method, n_folds=self.n_folds, n_iter=3)
80 | self.assertIsNotNone(exp_obj.tuned_model, f"An error occurred while tuning the model with {method} in {exp_size} {objective}, tuned model is None")
81 | self.assertIsNotNone(exp_obj.tuned_model_score, f"An error occurred while calculating the tuned model's score with {method} in {exp_size} {objective}, tuned model score is None")
82 | 
83 | # Save experiment objects to config
84 | self.test_config[objective]['exp_obj'] = exp_obj
85 | 
86 | def test_02_save_regression_model(self):
87 | exp_obj = self.test_config['Regression']['exp_obj']
88 | 
89 | # Test saving model with full_train=True and model_only=True (only the model object, not a pipeline)
90 | save_path = "test_regression_model_full_train_model_only.pkl"
91 | exp_obj.save_model(save_path=save_path, full_train=True, model_only=True)
92 | self.assertTrue(os.path.exists(save_path))
93 | 
94 | # Load the saved model and check if it's the model object (not a pipeline)
95 | with open(save_path, 'rb') as f:
96 | saved_model = pickle.load(f)
97 | self.assertFalse(hasattr(saved_model, 'named_steps'))
98 | os.remove(save_path) # Clean up saved model
99 | 
100 | # Test saving model with full_train=False and model_only=False (should return a pipeline)
101 | save_path = "test_regression_model_no_full_train_model_only_false.pkl"
102 | exp_obj.save_model(save_path=save_path, full_train=False, model_only=False)
103 | self.assertTrue(os.path.exists(save_path))
104 | 
105 | # Load the saved model and check if it's a pipeline
106 | with open(save_path, 'rb') as f:
107 | saved_model = pickle.load(f)
108 | self.assertTrue(hasattr(saved_model, 'named_steps'))
109 | os.remove(save_path) # Clean up saved model
110 | 
111 | def test_03_save_binary_classification_model(self):
112 | exp_obj = self.test_config['BinaryClassification']['exp_obj']
113 | 
114 | # Test saving model with full_train=True and model_only=True (only the model object, not a pipeline)
115 | save_path = "test_binary_classification_model_full_train_model_only.pkl"
116 | exp_obj.save_model(save_path=save_path, full_train=True, model_only=True)
117 | self.assertTrue(os.path.exists(save_path))
118 | 
119 | # Load the saved model and check if it's the model object (not a pipeline)
120 | with open(save_path, 'rb') as f:
121 | saved_model = pickle.load(f)
122 | self.assertFalse(hasattr(saved_model, 'named_steps'))
123 | os.remove(save_path) # Clean up saved model
124 | 
125 | # Test saving model with full_train=False and model_only=False (should return a pipeline)
126 | save_path = "test_binary_classification_model_no_full_train_model_only_false.pkl"
127 | 
exp_obj.save_model(save_path=save_path, full_train=False, model_only=False) 128 | self.assertTrue(os.path.exists(save_path)) 129 | 130 | # Load the saved model and check if it's a pipeline 131 | with open(save_path, 'rb') as f: 132 | saved_model = pickle.load(f) 133 | self.assertTrue(hasattr(saved_model, 'named_steps')) 134 | os.remove(save_path) # Clean up saved model 135 | 136 | def test_04_save_multiclass_classification_model(self): 137 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 138 | 139 | # Test saving model with full_train=True and model_only=True (only the model object, not a pipeline) 140 | save_path = "test_multiclass_classification_model_full_train_model_only.pkl" 141 | exp_obj.save_model(save_path=save_path, full_train=True, model_only=True) 142 | self.assertTrue(os.path.exists(save_path)) 143 | 144 | # Load the saved model and check if it's the model object (not a pipeline) 145 | with open(save_path, 'rb') as f: 146 | saved_model = pickle.load(f) 147 | self.assertFalse(hasattr(saved_model, 'named_steps')) 148 | os.remove(save_path) # Clean up saved model 149 | 150 | # Test saving model with full_train=False and model_only=False (should return a pipeline) 151 | save_path = "test_multiclass_classification_model_no_full_train_model_only_false.pkl" 152 | exp_obj.save_model(save_path=save_path, full_train=False, model_only=False) 153 | self.assertTrue(os.path.exists(save_path)) 154 | 155 | # Load the saved model and check if it's a pipeline 156 | with open(save_path, 'rb') as f: 157 | saved_model = pickle.load(f) 158 | self.assertTrue(hasattr(saved_model, 'named_steps')) 159 | os.remove(save_path) # Clean up saved model 160 | 161 | def test_05_predict_model_regression(self): 162 | # Test regression predictions 163 | exp_obj = self.test_config['Regression']['exp_obj'] 164 | test_data = self.test_config['Regression'].get('data').drop(columns=['target']) 165 | 166 | predictions = exp_obj.predict( 167 | test_data=test_data, 168 | model=exp_obj.get_model_by_name("LGBMRegressor"), 169 | full_train=True, 170 | ) 171 | self.assertIsInstance(predictions, np.ndarray) 172 | 173 | def test_06_predict_model_binary_classification(self): 174 | # Test binary classification predictions 175 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 176 | test_data = self.test_config['BinaryClassification'].get('data').drop(columns=['target']) 177 | 178 | predictions = exp_obj.predict(test_data, full_train=False) 179 | predictions_probabilities = exp_obj.predict_proba(test_data, full_train=False) 180 | self.assertIsInstance(predictions, np.ndarray) 181 | self.assertIsInstance(predictions_probabilities, np.ndarray) 182 | self.assertEqual(predictions_probabilities.shape[1], 2) # Binary classification should have 2 probability columns 183 | 184 | def test_07_predict_model_multiclass(self): 185 | # Test multiclass classification predictions 186 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 187 | test_data = self.test_config['MulticlassClassification'].get('data').drop(columns=['target']) 188 | 189 | predictions = exp_obj.predict(test_data, full_train=False) 190 | predictions_probabilities = exp_obj.predict_proba(test_data, full_train=False) 191 | self.assertIsInstance(predictions, np.ndarray) 192 | self.assertIsInstance(predictions_probabilities, np.ndarray) 193 | self.assertEqual(predictions_probabilities.shape[1], 3) # Iris has 3 classes 194 | 195 | def test_08_plot_regression_feature_importance(self): 196 | exp_obj = 
self.test_config['Regression']['exp_obj'] 197 | exp_obj.plot("CatBoostRegressor", kind="feature_importance") 198 | 199 | def test_09_plot_binary_classification_feature_importance(self): 200 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 201 | exp_obj.plot("XGBClassifier", kind="feature_importance") 202 | 203 | def test_10_plot_multiclass_classification_feature_importance(self): 204 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 205 | exp_obj.plot("LogisticRegression", kind="feature_importance") 206 | 207 | def test_11_plot_regression_residuals(self): 208 | exp_obj = self.test_config['Regression']['exp_obj'] 209 | exp_obj.plot("LinearRegression", kind="residuals") 210 | 211 | def test_12_plot_regression_prediction_error(self): 212 | exp_obj = self.test_config['Regression']['exp_obj'] 213 | exp_obj.plot("LGBMRegressor", kind="prediction_error") 214 | 215 | def test_13_plot_regression_shap_summary(self): 216 | exp_obj = self.test_config['Regression']['exp_obj'] 217 | exp_obj.plot("XGBRegressor", kind="shap_summary") 218 | 219 | def test_14_plot_regression_shap_violin(self): 220 | exp_obj = self.test_config['Regression']['exp_obj'] 221 | exp_obj.plot("RandomForestRegressor", kind="shap_violin") 222 | 223 | def test_15_plot_binary_classification_confusion_matrix(self): 224 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 225 | exp_obj.plot("LogisticRegression", kind="confusion_matrix") 226 | 227 | def test_16_plot_binary_classification_roc_curve(self): 228 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 229 | exp_obj.plot("RandomForestClassifier", kind="roc_curve") 230 | 231 | def test_17_plot_binary_classification_calibration_uniform(self): 232 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 233 | exp_obj.plot("XGBClassifier", kind="calibration_curve", strategy='uniform', n_bins=10) 234 | 235 | def test_18_plot_binary_classification_calibration_quantile(self): 236 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 237 | exp_obj.plot("LGBMClassifier", kind="calibration_curve", strategy='quantile', n_bins=8) 238 | 239 | def test_19_plot_binary_classification_shap_summary(self): 240 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 241 | exp_obj.plot("CatBoostClassifier", kind="shap_summary") 242 | 243 | def test_20_plot_binary_classification_shap_violin(self): 244 | exp_obj = self.test_config['BinaryClassification']['exp_obj'] 245 | exp_obj.plot("XGBClassifier", kind="shap_violin") 246 | 247 | def test_21_plot_multiclass_classification_confusion_matrix(self): 248 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 249 | exp_obj.plot("RandomForestClassifier", kind="confusion_matrix") 250 | 251 | def test_22_plot_multiclass_classification_roc_curve(self): 252 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 253 | exp_obj.plot("LogisticRegression", kind="roc_curve") 254 | 255 | def test_23_plot_multiclass_calibration_uniform(self): 256 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 257 | exp_obj.plot("XGBClassifier", kind="calibration_curve", strategy='uniform', n_bins=10) 258 | 259 | def test_24_plot_multiclass_calibration_quantile(self): 260 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 261 | exp_obj.plot("CatBoostClassifier", kind="calibration_curve", strategy='quantile', n_bins=12) 262 | 263 | def test_25_plot_multiclass_classification_shap_summary(self): 264 | exp_obj = 
self.test_config['MulticlassClassification']['exp_obj'] 265 | exp_obj.plot("LGBMClassifier", kind="shap_summary") 266 | 267 | def test_26_plot_multiclass_classification_shap_violin(self): 268 | exp_obj = self.test_config['MulticlassClassification']['exp_obj'] 269 | exp_obj.plot("RandomForestClassifier", kind="shap_violin") -------------------------------------------------------------------------------- /flexml/helpers/plot_model_graphs.py: -------------------------------------------------------------------------------- 1 | import plotly.graph_objects as go 2 | import numpy as np 3 | import shap 4 | from typing import Union, Optional, Dict 5 | from sklearn.metrics import confusion_matrix 6 | from sklearn.metrics import roc_curve, auc 7 | from yellowbrick.regressor import ResidualsPlot, PredictionError 8 | 9 | 10 | def plot_feature_importance( 11 | model: object, 12 | feature_names: list[str], 13 | top_x_features: int = 20, 14 | width: int = 800, 15 | height: int = 600, 16 | ) -> Union[go.Figure, str]: 17 | """ 18 | Create a plotly figure showing feature importance for a given model 19 | 20 | Parameters 21 | ---------- 22 | model: object 23 | Machine learning model to display its feature importance 24 | 25 | feature_names: list[str] 26 | List of feature names to display in the plot 27 | 28 | top_x_features: int (default = 20), optional 29 | Number of top features to display in the plot 30 | 31 | width: int (default = 800), optional 32 | Width of the plot 33 | 34 | height: int (default = 600), optional 35 | Height of the plot 36 | 37 | Returns 38 | ------- 39 | plotly.graph_objects.Figure or str 40 | A plotly figure object containing the feature importance visualization, 41 | or an error message if an error occurs during the process. 42 | """ 43 | try: 44 | model_name = model.__class__.__name__ 45 | importance = None 46 | 47 | # Check if the model has 'feature_importances_' attribute (tree-based models) 48 | if hasattr(model, 'feature_importances_'): 49 | importance = model.feature_importances_ 50 | 51 | # Check if the model has coefficients (linear models) 52 | elif hasattr(model, 'coef_'): 53 | importance = np.abs(model.coef_) 54 | if importance.ndim > 1: # Handle multi-output models (e.g., LogisticRegression with multiple classes) 55 | importance = np.mean(importance, axis=0) 56 | 57 | if importance is not None and len(importance) == len(feature_names): 58 | indices = np.argsort(importance)[::-1] # Sort in descending order 59 | 60 | # Limit to top 20 features 61 | indices = indices[:top_x_features] 62 | sorted_importance = importance[indices] 63 | sorted_features = np.array(feature_names)[indices] 64 | 65 | fig = go.Figure() 66 | fig.add_trace(go.Bar( 67 | y=sorted_features, 68 | x=sorted_importance, 69 | orientation='h', 70 | marker=dict( 71 | color=sorted_importance, 72 | colorscale='Viridis' 73 | ) 74 | )) 75 | 76 | fig.update_layout( 77 | title=f"Feature Importance for {model_name} (Top {top_x_features} Features)", 78 | xaxis_title="Importance", 79 | yaxis_title="Features", 80 | height=height, 81 | width=width, 82 | yaxis=dict(autorange="reversed") 83 | ) 84 | 85 | return fig 86 | else: 87 | return f"Feature importance is not available or mismatched for {model_name}" 88 | 89 | except Exception as e: 90 | return f"Could not calculate feature importance for the model {model_name}. 
Error: {e}" 91 | 92 | 93 | def plot_confusion_matrix( 94 | y_true: np.array, 95 | y_pred: np.array, 96 | class_mapping: dict = None, 97 | width: int = 800, 98 | height: int = 600 99 | ) -> Union[go.Figure, str]: 100 | """ 101 | Create a plotly figure showing confusion matrix. 102 | 103 | Parameters 104 | ---------- 105 | y_true : np.array 106 | Array of true (correct) labels 107 | 108 | y_pred : np.array 109 | Array of predicted labels 110 | 111 | class_mapping : dict, optional 112 | Dictionary mapping encoded values to class labels (e.g., {0: 'male', 1: 'female'}) 113 | 114 | width: int (default = 800), optional 115 | Width of the plot 116 | 117 | height: int (default = 600), optional 118 | Height of the plot 119 | 120 | Returns 121 | ------- 122 | plotly.graph_objects.Figure or str 123 | A plotly figure object containing the confusion matrix visualization, 124 | or an error message if an error occurs during the process. 125 | """ 126 | try: 127 | cm = confusion_matrix(y_true, y_pred) 128 | 129 | # Convert class indices to labels using the provided mapping 130 | class_names = [class_mapping[i] for i in range(cm.shape[0])] if class_mapping else list(range(cm.shape[0])) 131 | 132 | fig = go.Figure(data=go.Heatmap( 133 | z=cm, 134 | x=class_names, 135 | y=class_names, 136 | colorscale='Viridis', 137 | text=cm, 138 | texttemplate="%{text}", 139 | textfont={"size": 16}, 140 | hoverongaps=False)) 141 | 142 | fig.update_layout( 143 | title='Confusion Matrix', 144 | xaxis_title='Predicted label', 145 | yaxis_title='True label', 146 | yaxis=dict(autorange="reversed"), 147 | width=width, 148 | height=height 149 | ) 150 | 151 | return fig 152 | except Exception as e: 153 | return f"Error creating confusion matrix plot: {str(e)}" 154 | 155 | 156 | def plot_roc_curve( 157 | y_true: np.array, 158 | y_prob: np.array, 159 | class_names: list = None, 160 | width: int = 800, 161 | height: int = 600 162 | ) -> Union[go.Figure, str]: 163 | """ 164 | Create a plotly figure showing ROC curve. 165 | 166 | Parameters 167 | ---------- 168 | y_true : np.array 169 | Array of true (correct) labels 170 | 171 | y_prob : np.array 172 | Array of predicted probabilities 173 | 174 | class_names : list, optional 175 | List of class names for multiple classes 176 | 177 | width: int (default = 800), optional 178 | Width of the plot 179 | 180 | height: int (default = 600), optional 181 | Height of the plot 182 | 183 | Returns 184 | ------- 185 | plotly.graph_objects.Figure or str 186 | A plotly figure object containing the ROC curve visualization, 187 | or an error message if an error occurs during the process. 
188 | """ 189 | try: 190 | fig = go.Figure() 191 | 192 | # Handle binary classification 193 | if y_prob.ndim == 1 or y_prob.shape[1] == 2: 194 | if y_prob.ndim == 2: 195 | y_prob = y_prob[:, 1] 196 | fpr, tpr, _ = roc_curve(y_true, y_prob) 197 | auc_score = auc(fpr, tpr) 198 | 199 | fig.add_trace(go.Scatter( 200 | x=fpr, y=tpr, 201 | name=f'ROC curve (AUC = {auc_score:.3f})', 202 | mode='lines' 203 | )) 204 | 205 | # Handle multi-class 206 | else: 207 | if class_names is None: 208 | class_names = [f'Class {i}' for i in range(y_prob.shape[1])] 209 | 210 | for i in range(y_prob.shape[1]): 211 | fpr, tpr, _ = roc_curve(y_true == i, y_prob[:, i]) 212 | auc_score = auc(fpr, tpr) 213 | 214 | fig.add_trace(go.Scatter( 215 | x=fpr, y=tpr, 216 | name=f'{class_names[i]} (AUC = {auc_score:.3f})', 217 | mode='lines' 218 | )) 219 | 220 | fig.add_trace(go.Scatter( 221 | x=[0, 1], y=[0, 1], 222 | name='Random', 223 | mode='lines', 224 | line=dict(dash='dash', color='gray') 225 | )) 226 | 227 | fig.update_layout( 228 | title='Receiver Operating Characteristic (ROC) Curve', 229 | xaxis_title='False Positive Rate', 230 | yaxis_title='True Positive Rate', 231 | width=width, 232 | height=height, 233 | showlegend=True 234 | ) 235 | 236 | return fig 237 | except Exception as e: 238 | return f"Error creating ROC curve plot: {str(e)}" 239 | 240 | 241 | def plot_calibration_curve( 242 | y_true: np.array, 243 | y_prob: np.array, 244 | class_mapping: Optional[Dict[int, str]] = None, 245 | n_bins: int = 10, 246 | strategy: str = 'uniform', 247 | width: int = 800, 248 | height: int = 600, 249 | ) -> Union[go.Figure, str]: 250 | """ 251 | Create a plotly figure showing probability calibration curve. 252 | 253 | Parameters 254 | ---------- 255 | y_true : np.array 256 | True labels (binary or multiclass) 257 | 258 | y_prob : np.array 259 | Predicted probabilities (shape [n_samples, n_classes] for multiclass) 260 | 261 | n_bins : int (default = 10), optional 262 | Number of bins to discretize the [0, 1] interval 263 | 264 | strategy : {'uniform', 'quantile'} (default = 'uniform'), optional 265 | Strategy used to define the widths of the bins 266 | 267 | width: int (default = 800), optional 268 | Width of the plot 269 | 270 | height: int (default = 600), optional 271 | Height of the plot 272 | 273 | class_mapping: Dict[int, str] (default = None), optional 274 | Dictionary mapping class indices to class names 275 | 276 | Returns 277 | ------- 278 | plotly.graph_objects.Figure or str 279 | A plotly figure object containing the calibration curve visualization, 280 | or an error message if an error occurs during the process. 
281 | """ 282 | try: 283 | from sklearn.calibration import calibration_curve 284 | from sklearn.preprocessing import LabelBinarizer 285 | 286 | fig = go.Figure() 287 | 288 | # Handle binary classification 289 | if y_prob.ndim == 1 or y_prob.shape[1] == 2: 290 | if y_prob.ndim == 2: 291 | y_prob = y_prob[:, 1] 292 | 293 | prob_true, prob_pred = calibration_curve(y_true, y_prob, 294 | n_bins=n_bins, 295 | strategy=strategy) 296 | 297 | class_name = class_mapping.get(1, 'Positive Class') if class_mapping else 'Calibration Curve' 298 | fig.add_trace(go.Scatter( 299 | x=prob_pred, 300 | y=prob_true, 301 | name=class_name, 302 | mode='lines+markers', 303 | marker=dict(size=8) 304 | )) 305 | 306 | # Handle multiclass using one-vs-rest approach 307 | else: 308 | lb = LabelBinarizer().fit(y_true) 309 | y_onehot = lb.transform(y_true) 310 | 311 | for class_idx in range(y_prob.shape[1]): 312 | prob_true, prob_pred = calibration_curve(y_onehot[:, class_idx], 313 | y_prob[:, class_idx], 314 | n_bins=n_bins, 315 | strategy=strategy) 316 | 317 | class_name = class_mapping.get(class_idx, f'Class {class_idx}') if class_mapping else f'Class {class_idx}' 318 | 319 | # Apply class mapping here 320 | if class_mapping and class_idx in class_mapping: 321 | class_name = class_mapping[class_idx] 322 | 323 | fig.add_trace(go.Scatter( 324 | x=prob_pred, 325 | y=prob_true, 326 | name=class_name, 327 | mode='lines+markers', 328 | marker=dict(size=8) 329 | )) 330 | 331 | # Add perfect calibration line 332 | fig.add_trace(go.Scatter( 333 | x=[0, 1], 334 | y=[0, 1], 335 | name='Perfect Calibration', 336 | line=dict(dash='dash', color='gray'), 337 | mode='lines' 338 | )) 339 | 340 | fig.update_layout( 341 | title='Calibration Curve (Reliability Diagram)', 342 | xaxis_title='Mean Predicted Probability', 343 | yaxis_title='Fraction of Positives', 344 | width=width, 345 | height=height, 346 | showlegend=True, 347 | legend=dict(x=0.7, y=0.1), 348 | xaxis=dict(range=[0, 1]), 349 | yaxis=dict(range=[0, 1]) 350 | ) 351 | 352 | return fig 353 | 354 | except Exception as e: 355 | return f"Error creating calibration curve plot: {str(e)}" 356 | 357 | 358 | def plot_shap( 359 | model: object, 360 | X_test: np.array, 361 | shap_type: str = 'shap_summary' 362 | ) -> Union[go.Figure, str]: 363 | """ 364 | Create a plotly figure showing SHAP values visualization. 365 | 366 | Parameters 367 | ---------- 368 | model : object 369 | Trained model 370 | 371 | X_test : np.array 372 | Feature data for explanation 373 | 374 | shap_type : str 375 | Type of SHAP plot to generate: 376 | - 'shap_summary': shap.summary_plot 377 | - 'shap_violin': shap.plots.violin 378 | 379 | Returns 380 | ------- 381 | plotly.graph_objects.Figure or str 382 | A plotly figure object containing the SHAP values visualization, 383 | or an error message if an error occurs during the process. 
384 | """ 385 | try: 386 | # Check if model is a tree-based model 387 | model_type = str(type(model)) 388 | 389 | tree_based_models = [ 390 | "RandomForest", "GradientBoosting", "AdaBoost", 391 | "HistGradientBoosting", "DecisionTree", "ExtraTrees", 392 | "XGB", "CatBoost", "LGBM" 393 | ] 394 | is_tree_based = any(model_name in model_type for model_name in tree_based_models) 395 | 396 | if is_tree_based: 397 | explainer = shap.TreeExplainer(model) 398 | shap_values = explainer.shap_values(X_test) 399 | else: 400 | explainer = shap.KernelExplainer(model.predict, X_test) 401 | shap_values = explainer.shap_values(X_test, silent=True) 402 | 403 | if len(shap_values.shape) == 3: # Models like DecisionTree, RandomForest return probabilities for each class, Let's downgrade to 2D array 404 | shap_values = shap_values[:, :, 1] 405 | # Convert SHAP values to appropriate format if needed 406 | if isinstance(shap_values, list) and shap_type != 'shap_dependence': 407 | shap_values = np.array(shap_values).mean(axis=0) 408 | 409 | # Generate the appropriate SHAP plot based on shap_type 410 | if shap_type == 'shap_summary': 411 | shap.summary_plot(shap_values, X_test) 412 | elif shap_type == 'shap_violin': 413 | shap.plots.violin(shap_values, X_test) 414 | else: 415 | return f"Invalid shap_type: {shap_type}" 416 | 417 | return True 418 | 419 | except Exception as e: 420 | return f"Error creating SHAP plot: {str(e)}" 421 | 422 | 423 | def plot_residuals( 424 | model: object, 425 | X_train: np.array, 426 | y_train: np.array, 427 | X_test: np.array, 428 | y_test: np.array 429 | ) -> object: 430 | """ 431 | Create a residuals plot using Yellowbrick. 432 | 433 | Parameters 434 | ---------- 435 | model : object 436 | Trained regressor 437 | 438 | X_train : np.array 439 | Training features 440 | 441 | y_train : np.array 442 | Training targets 443 | 444 | X_test : np.array 445 | Test features 446 | 447 | y_test : np.array 448 | Test targets 449 | 450 | Returns 451 | ------- 452 | object 453 | Visualizer object from Yellowbrick 454 | """ 455 | try: 456 | if model.__class__.__name__ == "CatBoostRegressor": # https://github.com/DistrictDataLabs/yellowbrick/issues/1099 457 | from yellowbrick.contrib.wrapper import regressor as wrap_regressor 458 | model = wrap_regressor(model) 459 | 460 | visualizer = ResidualsPlot(model) 461 | visualizer.fit(X_train, y_train) 462 | visualizer.score(X_test, y_test) 463 | return visualizer 464 | 465 | except Exception as e: 466 | return f"Error creating residuals plot: {str(e)}" 467 | 468 | 469 | def plot_prediction_error( 470 | model: object, 471 | X_train: np.array, 472 | y_train: np.array, 473 | X_test: np.array, 474 | y_test: np.array 475 | ) -> object: 476 | """ 477 | Create a prediction error plot using Yellowbrick. 
478 | 
479 | Parameters
480 | ----------
481 | model : object
482 | Trained regressor
483 | 
484 | X_train : np.array
485 | Training features
486 | 
487 | y_train : np.array
488 | Training targets
489 | 
490 | X_test : np.array
491 | Test features
492 | 
493 | y_test : np.array
494 | Test targets
495 | 
496 | Returns
497 | -------
498 | object or str
499 | A Yellowbrick visualizer object, or an error message if an error occurs during the process
500 | """
501 | try:
502 | if model.__class__.__name__ == "CatBoostRegressor": # https://github.com/DistrictDataLabs/yellowbrick/issues/1099
503 | from yellowbrick.contrib.wrapper import regressor as wrap_regressor
504 | model = wrap_regressor(model)
505 | 
506 | visualizer = PredictionError(model)
507 | visualizer.fit(X_train, y_train)
508 | visualizer.score(X_test, y_test)
509 | return visualizer
510 | 
511 | except Exception as e:
512 | return f"Error creating prediction error plot: {str(e)}"
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | import numpy as np
4 | from parameterized import parameterized
5 | from flexml.helpers import validate_inputs, eval_metric_checker, random_state_checker
6 | import warnings
7 | warnings.filterwarnings("ignore")
8 | 
9 | 
10 | class TestHelpers(unittest.TestCase):
11 | """
12 | Test cases for FlexML's helper functions (input validators, metric checkers, and model evaluation)
13 | """
14 | np.random.seed(42)
15 | n_rows = 100
16 | 
17 | df = pd.DataFrame({
18 | 'id': range(1, n_rows + 1),
19 | 'category_default': np.random.choice(['A', 'B', 'C'], n_rows),
20 | 'value_default': np.random.normal(100, 15, n_rows),
21 | 'status': np.random.choice(['Active', 'Pending', 'Closed'], n_rows),
22 | 'priority': np.random.choice(['High', 'Medium', 'Low'], n_rows),
23 | 'score': np.random.randint(0, 100, n_rows),
24 | 'amount': np.random.uniform(10, 1000, n_rows),
25 | 'target': np.random.choice([0, 1], n_rows)
26 | })
27 | 
28 | # Create artificial null values within the dataframe
29 | for column in df.columns:
30 | if column not in ['id', 'target']:
31 | mask = np.random.random(n_rows) < 0.2
32 | df.loc[mask, column] = np.nan
33 | 
34 | 
35 | @parameterized.expand([
36 | # Basic validation errors
37 | (
38 | "target_in_drop_columns",
39 | {"drop_columns": ["target"]},
40 | ValueError,
41 | "target column 'target' cannot be in the drop_columns list"
42 | ),
43 | 
44 | (
45 | "no_features_after_drop",
46 | {"drop_columns": ["category_default", "value_default", "status", "priority", "score", "amount", "id"]},
47 | ValueError,
48 | "After dropping columns, only {'target'} remain"
49 | ),
50 | 
51 | # Imputation method errors
52 | (
53 | "invalid_cat_imputation",
54 | {"categorical_imputation_method": "invalid"},
55 | ValueError,
56 | "categorical_imputation_method 'invalid' is not valid"
57 | ),
58 | 
59 | (
60 | "invalid_num_imputation",
61 | {"numerical_imputation_method": "invalid"},
62 | ValueError,
63 | "numerical_imputation_method 'invalid' is not valid"
64 | ),
65 | 
66 | # Column imputation map errors
67 | (
68 | "column_imputation_invalid_column",
69 | {"column_imputation_map": {"nonexistent_column": "mean"}},
70 | ValueError,
71 | "column 'nonexistent_column' in column_imputation_map is not in the data"
72 | ),
73 | 
74 | (
75 | "column_imputation_invalid_numeric_method",
76 | {"column_imputation_map": {"value_default": "invalid"}},
77 | ValueError,
78 | "numeric imputation method 'invalid' for column 'value_default' is not valid"
79 | ), 80 | 81 | ( 82 | "column_imputation_invalid_categorical_method", 83 | {"column_imputation_map": {"category_default": "invalid"}}, 84 | ValueError, 85 | "categorical imputation method 'invalid' for column 'category_default' is not valid" 86 | ), 87 | 88 | # Constant type errors 89 | ( 90 | "invalid_numerical_constant", 91 | {"numerical_imputation_constant": "invalid"}, 92 | ValueError, 93 | "numerical_imputation_constant should be a number" 94 | ), 95 | 96 | ( 97 | "invalid_categorical_constant", 98 | {"categorical_imputation_constant": 123}, 99 | ValueError, 100 | "categorical_imputation_constant should be a string" 101 | ), 102 | 103 | # Encoding method errors 104 | ( 105 | "invalid_encoding_method", 106 | {"encoding_method": "invalid_encoder"}, 107 | ValueError, 108 | "encoding_method 'invalid_encoder' is not valid" 109 | ), 110 | 111 | ( 112 | "invalid_onehot_limit", 113 | {"onehot_limit": -5}, 114 | ValueError, 115 | "onehot_limit should be a positive integer" 116 | ), 117 | 118 | # Encoding method map errors 119 | ( 120 | "encoding_map_invalid_column", 121 | {"encoding_method_map": {"nonexistent_column": "label_encoder"}}, 122 | ValueError, 123 | "column 'nonexistent_column' in encoding_method_map is not in the data" 124 | ), 125 | 126 | ( 127 | "encoding_map_dropped_column", 128 | {"drop_columns": ["category_default"], "encoding_method_map": {"category_default": "label_encoder"}}, 129 | ValueError, 130 | "column 'category_default' in encoding_method_map is in drop_columns" 131 | ), 132 | 133 | ( 134 | "encoding_map_invalid_method", 135 | {"encoding_method_map": {"category_default": "invalid"}}, 136 | ValueError, 137 | "encoding method 'invalid' for column 'category_default' is not valid" 138 | ), 139 | 140 | # Ordinal encoding errors 141 | ( 142 | "missing_ordinal_map", 143 | {"encoding_method": "ordinal_encoder"}, 144 | ValueError, 145 | "Ordinal encoding is selected but no ordinal_encode_map is provided" 146 | ), 147 | 148 | ( 149 | "missing_column_ordinal_map", 150 | {"encoding_method": "ordinal_encoder", "ordinal_encode_map": {}}, 151 | ValueError, 152 | "Ordinal encoding is selected for column 'category_default' but no ordinal_encode_map is provided" 153 | ), 154 | 155 | ( 156 | "mismatched_ordinal_values", 157 | {"encoding_method": "ordinal_encoder", 158 | "ordinal_encode_map": { 159 | "category_default": ["X", "Y", "Z"], 160 | "status": ["Active", "Pending", "Closed"], 161 | "priority": ["Low", "Medium", "High"]}}, 162 | ValueError, 163 | "Distinct values in column 'category_default' do not match" 164 | ), 165 | 166 | ( 167 | "extra_columns_ordinal_map", 168 | {"encoding_method": "ordinal_encoder", 169 | "ordinal_encode_map": { 170 | "category_default": ["A", "B", "C"], 171 | "status": ["Active", "Pending", "Closed"], 172 | "priority": ["Low", "Medium", "High"], 173 | "extra_column": ["X", "Y", "Z"]}}, 174 | ValueError, 175 | "Ordinal_encode_map includes extra columns not in the categorical columns" 176 | ), 177 | 178 | # Normalization errors 179 | ( 180 | "invalid_normalization", 181 | {"normalize": "invalid_scaler"}, 182 | ValueError, 183 | "normalize method 'invalid_scaler' is not valid" 184 | ), 185 | 186 | # Drop columns validation 187 | ( 188 | "drop_column_not_in_data", 189 | {"drop_columns": ["nonexistent_column"]}, 190 | ValueError, 191 | "column 'nonexistent_column' in drop_columns is not in the data" 192 | ), 193 | 194 | # Ordinal encoding in method map errors 195 | ( 196 | "missing_ordinal_map_in_method_map", 197 | {"encoding_method_map": 
{"category_default": "ordinal_encoder"}}, 198 | ValueError, 199 | "Ordinal encoding is selected for column 'category_default' but no ordinal_encode_map is provided" 200 | ), 201 | 202 | ( 203 | "missing_column_ordinal_map_in_method_map", 204 | {"encoding_method_map": {"category_default": "ordinal_encoder"}, 205 | "ordinal_encode_map": {}}, 206 | ValueError, 207 | "Ordinal encoding is selected for column 'category_default' but no ordinal_encode_map is provided" 208 | ), 209 | 210 | ( 211 | "mismatched_ordinal_values_in_method_map", 212 | {"encoding_method_map": { 213 | "category_default": "ordinal_encoder", 214 | "status": "label_encoder", 215 | "priority": "label_encoder" 216 | }, 217 | "ordinal_encode_map": { 218 | "category_default": ["X", "Y", "Z"] 219 | }}, 220 | ValueError, 221 | "Unique values in 'category_default' do not match with the ones given in ordinal_encode_map" 222 | ), 223 | 224 | ( 225 | "extra_columns_ordinal_map_in_method_map", 226 | {"encoding_method_map": { 227 | "category_default": "ordinal_encoder" 228 | }, 229 | "ordinal_encode_map": { 230 | "category_default": ["A", "B", "C"], 231 | "extra_column": ["X", "Y", "Z"] 232 | }}, 233 | ValueError, 234 | "Ordinal_encode_map includes extra columns not specified for ordinal encoding" 235 | ), 236 | ]) 237 | def test_validate_inputs_errors(self, test_name, params, expected_error, expected_message): 238 | """Test validate_inputs exception raising for invalid parameters""" 239 | with self.assertRaisesRegex(expected_error, expected_message): 240 | validate_inputs( 241 | data=self.df, 242 | target_col='target', 243 | **params 244 | ) 245 | 246 | # helpers/validators.py 247 | @parameterized.expand([ 248 | # Default behavior tests 249 | ( 250 | "regression_default", 251 | {"ml_task_type": "Regression", "eval_metric": None}, 252 | "R2", 253 | None 254 | ), 255 | ( 256 | "classification_default", 257 | {"ml_task_type": "Classification", "eval_metric": None}, 258 | "Accuracy", 259 | None 260 | ), 261 | 262 | # Regression metric tests 263 | ( 264 | "regression_valid_lowercase", 265 | {"ml_task_type": "Regression", "eval_metric": "mae"}, 266 | "MAE", 267 | None 268 | ), 269 | ( 270 | "regression_valid_uppercase", 271 | {"ml_task_type": "Regression", "eval_metric": "RMSE"}, 272 | "RMSE", 273 | None 274 | ), 275 | ( 276 | "regression_invalid_metric", 277 | {"ml_task_type": "Regression", "eval_metric": "invalid"}, 278 | None, 279 | ValueError 280 | ), 281 | 282 | # Classification metric tests 283 | ( 284 | "classification_valid_exact", 285 | {"ml_task_type": "Classification", "eval_metric": "Accuracy"}, 286 | "Accuracy", 287 | None 288 | ), 289 | ( 290 | "classification_valid_flexible", 291 | {"ml_task_type": "Classification", "eval_metric": "roc-auc"}, 292 | "ROC-AUC", 293 | None 294 | ), 295 | ( 296 | "classification_valid_no_special", 297 | {"ml_task_type": "Classification", "eval_metric": "rocauc"}, 298 | "ROC-AUC", 299 | None 300 | ), 301 | ( 302 | "classification_invalid_metric", 303 | {"ml_task_type": "Classification", "eval_metric": "invalid"}, 304 | None, 305 | ValueError 306 | ), 307 | 308 | # Custom metrics list tests 309 | ( 310 | "custom_metrics_valid_classification", 311 | { 312 | "ml_task_type": "Classification", 313 | "eval_metric": "F1 Score", 314 | "all_evaluation_metrics": None, 315 | "default_evaluation_metric": None 316 | }, 317 | "F1 Score", 318 | None 319 | ), 320 | 321 | ( 322 | "custom_metrics_valid_regression", 323 | { 324 | "ml_task_type": "Regression", 325 | "eval_metric": "MAE", 326 | "all_evaluation_metrics": 
None, 327 | "default_evaluation_metric": None 328 | }, 329 | "MAE", 330 | None 331 | ), 332 | 333 | ]) 334 | def test_eval_metric_checker(self, test_name, params, expected_result, expected_error): 335 | """Test eval_metric_checker validation""" 336 | if expected_error: 337 | with self.assertRaises(expected_error): 338 | eval_metric_checker(**params) 339 | else: 340 | result = eval_metric_checker(**params) 341 | self.assertEqual(result, expected_result) 342 | 343 | # helpers/validators.py 344 | @parameterized.expand([ 345 | # Valid cases 346 | ( 347 | "none_value", 348 | None, 349 | None, 350 | None 351 | ), 352 | ( 353 | "zero_value", 354 | 0, 355 | 0, 356 | None 357 | ), 358 | ( 359 | "positive_integer", 360 | 42, 361 | 42, 362 | None 363 | ), 364 | 365 | # Invalid cases 366 | ( 367 | "negative_integer", 368 | -1, 369 | None, 370 | ValueError 371 | ), 372 | ( 373 | "float_value", 374 | 42.0, 375 | None, 376 | ValueError 377 | ) 378 | ]) 379 | def test_random_state_checker(self, test_name, input_value, expected_result, expected_error): 380 | """Test random_state_checker validation""" 381 | if expected_error: 382 | with self.assertRaises(expected_error): 383 | random_state_checker(input_value) 384 | else: 385 | result = random_state_checker(input_value) 386 | self.assertEqual(result, expected_result) 387 | 388 | # helpers/supervised_helpers.py 389 | def test_binary_classification_probabilities(self): 390 | """ 391 | Test binary classification with probability predictions. 392 | """ 393 | from flexml.helpers.supervised_helpers import evaluate_model_perf 394 | 395 | # Setup binary classification data 396 | y_true = np.array([0, 1, 0, 1, 0]) 397 | y_pred_proba = np.array([ 398 | [0.8, 0.2], # Should predict class 0 399 | [0.3, 0.7], # Should predict class 1 400 | [0.6, 0.4], # Should predict class 0 401 | [0.2, 0.8], # Should predict class 1 402 | [0.9, 0.1] # Should predict class 0 403 | ]) 404 | 405 | # Test model performance evaluation 406 | results = evaluate_model_perf("Classification", y_true, y_pred_proba) 407 | 408 | # Verify all metrics are present 409 | expected_metrics = {"Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"} 410 | self.assertEqual(set(results.keys()), expected_metrics) 411 | 412 | def test_multiclass_classification_probabilities(self): 413 | """ 414 | Test multiclass classification with probability predictions for more than two classes. 415 | """ 416 | from flexml.helpers.supervised_helpers import evaluate_model_perf 417 | 418 | # Setup multiclass classification data 419 | y_true = np.array([0, 1, 2, 1, 0]) 420 | y_pred_proba = np.array([ 421 | [0.8, 0.1, 0.1], # Should predict class 0 422 | [0.1, 0.7, 0.2], # Should predict class 1 423 | [0.2, 0.2, 0.6], # Should predict class 2 424 | [0.1, 0.8, 0.1], # Should predict class 1 425 | [0.6, 0.2, 0.2] # Should predict class 0 426 | ]) 427 | 428 | # Test model performance evaluation 429 | results = evaluate_model_perf("Classification", y_true, y_pred_proba) 430 | 431 | # Verify all metrics are present 432 | expected_metrics = {"Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"} 433 | self.assertEqual(set(results.keys()), expected_metrics) 434 | 435 | def test_classification_with_direct_labels(self): 436 | """ 437 | Test classification with direct label predictions (no probabilities). 438 | Tests the handling of predictions when model doesn't output probabilities. 
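For example (illustrative, based on the 1-D branch of evaluate_model_perf):
integer predictions such as np.array([0, 1, 0]) pass through the
(y_pred > 0.5) threshold unchanged, so the metrics are computed directly
on the given labels.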
439 | """ 440 | from flexml.helpers.supervised_helpers import evaluate_model_perf 441 | 442 | # Setup classification data with direct labels 443 | y_true = np.array([0, 1, 0, 1, 0]) 444 | y_pred_labels = np.array([0, 1, 0, 1, 0]) # Direct label predictions 445 | 446 | # Test model performance evaluation 447 | results = evaluate_model_perf("Classification", y_true, y_pred_labels) 448 | 449 | # Verify all metrics are present 450 | expected_metrics = {"Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"} 451 | self.assertEqual(set(results.keys()), expected_metrics) 452 | 453 | def test_evaluate_preds_invalid_metric(self): 454 | """ 455 | Test that _evaluate_preds raises ValueError for invalid metrics. 456 | """ 457 | from flexml.helpers.supervised_helpers import _evaluate_preds 458 | 459 | y_true = np.array([0, 1, 0]) 460 | y_pred = np.array([0, 1, 0]) 461 | 462 | with self.assertRaisesRegex(ValueError, "Error while evaluating the current model"): 463 | _evaluate_preds(y_true, y_pred, "InvalidMetric") 464 | 465 | def test_probability_to_label_conversion(self): 466 | """ 467 | Test the conversion from probability predictions to class labels. 468 | Tests both binary and multiclass cases. 469 | """ 470 | from flexml.helpers.supervised_helpers import evaluate_model_perf 471 | 472 | # Binary case 473 | y_true_binary = np.array([0, 1, 0]) 474 | y_pred_binary_proba = np.array([ 475 | [0.8, 0.2], # Should convert to 0 476 | [0.3, 0.7], # Should convert to 1 477 | [0.6, 0.4] # Should convert to 0 478 | ]) 479 | 480 | binary_results = evaluate_model_perf("Classification", y_true_binary, y_pred_binary_proba) 481 | self.assertEqual(binary_results["Accuracy"], 1.0) # Perfect predictions after conversion 482 | 483 | # Multiclass case 484 | y_true_multi = np.array([0, 1, 2]) 485 | y_pred_multi_proba = np.array([ 486 | [0.8, 0.1, 0.1], # Should convert to 0 487 | [0.1, 0.7, 0.2], # Should convert to 1 488 | [0.2, 0.2, 0.6] # Should convert to 2 489 | ]) 490 | 491 | multi_results = evaluate_model_perf("Classification", y_true_multi, y_pred_multi_proba) 492 | self.assertEqual(multi_results["Accuracy"], 1.0) # Perfect predictions after conversion -------------------------------------------------------------------------------- /flexml/config/ml_models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | 5 | 6 | # TODO: Should be improved 7 | def get_ml_models( 8 | ml_task_type: str, 9 | num_class: Optional[int] = None, 10 | random_state: Optional[int] = None, 11 | n_jobs: Optional[int] = -1 12 | ) -> dict: 13 | """ 14 | Returns a dictionary of quick and wide regression and classification models 15 | 16 | Parameters 17 | ---------- 18 | ml_task_type : str 19 | The type of the machine learning task. It can be "Regression" or "Classification" 20 | 21 | num_class : int, optional (default=None) 22 | The number of classes in the classification task. No need to pass it in regression tasks 23 | It will be set to 2 if None is passed to suppose its binary classification 24 | 25 | random_state : int, optional (default=None) 26 | The random state value for the model training process 27 | 28 | n_jobs : int, optional (default=-1) 29 | The number of jobs to run in parallel. 
-1 means using all processors 30 | 31 | Returns 32 | ------- 33 | dict 34 | A dictionary of quick and wide Regression/Classification models 35 | """ 36 | if ml_task_type not in ["Regression", "Classification"]: 37 | raise ValueError(f"Expected ml_task_type to be either 'Regression' or 'Classification', got {ml_task_type}") 38 | 39 | if ml_task_type == "Regression": 40 | from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor 41 | from sklearn.linear_model import BayesianRidge, OrthogonalMatchingPursuit 42 | from sklearn.tree import DecisionTreeRegressor 43 | from sklearn.ensemble import ( 44 | AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, 45 | ExtraTreesRegressor, HistGradientBoostingRegressor 46 | ) 47 | from sklearn.neighbors import KNeighborsRegressor 48 | from sklearn.neural_network import MLPRegressor 49 | from xgboost import XGBRegressor 50 | from lightgbm import LGBMRegressor 51 | from catboost import CatBoostRegressor 52 | 53 | 54 | # Quick Regression Models 55 | LINEAR_REGRESSION = LinearRegression(n_jobs=n_jobs) 56 | LASSO_REGRESSION = Lasso(random_state=random_state) 57 | RIDGE_REGRESSION = Ridge(random_state=random_state) 58 | XGBOOST_REGRESSION = XGBRegressor(enable_categorical=True, random_state=random_state, n_jobs=n_jobs) 59 | LIGHTGBM_REGRESSION = LGBMRegressor(verbose=-1, enable_categorical=True, random_state=random_state, n_jobs=n_jobs) 60 | CATBOOST_REGRESSION = CatBoostRegressor(allow_writing_files=False, silent=True, random_seed=random_state, thread_count=n_jobs) 61 | DECISION_TREE_REGRESSION = DecisionTreeRegressor(random_state=random_state) 62 | ELASTIC_NET_REGRESSION = ElasticNet(random_state=random_state) 63 | HUBER_REGRESSION = HuberRegressor() 64 | 65 | # Wide Regression Models 66 | KNN_REGRESSION = KNeighborsRegressor(n_jobs=n_jobs) 67 | BAYESIAN_RIDGE_REGRESSION = BayesianRidge() 68 | ADA_BOOST_REGRESSION = AdaBoostRegressor(random_state=random_state) 69 | HIST_GRADIENT_BOOSTING_REGRESSION = HistGradientBoostingRegressor(random_state=random_state) 70 | GRADIENT_BOOSTING_REGRESSION = GradientBoostingRegressor(random_state=random_state) 71 | RANDOM_FOREST_REGRESSION = RandomForestRegressor(random_state=random_state, n_jobs=n_jobs) 72 | EXTRA_TREES_REGRESSION = ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs) 73 | OMP_REGRESSION = OrthogonalMatchingPursuit() 74 | MLP_REGRESSION = MLPRegressor( 75 | solver='lbfgs', 76 | hidden_layer_sizes=(50,), 77 | early_stopping=True, 78 | learning_rate='adaptive', 79 | random_state=random_state 80 | ) 81 | 82 | # Quick Regression Model Configurations 83 | QUICK_REGRESSION_MODELS = [ 84 | { 85 | "name": LINEAR_REGRESSION.__class__.__name__, 86 | "model": LINEAR_REGRESSION, 87 | "tuning_param_grid": { 88 | 'fit_intercept': [True, False] 89 | } 90 | }, 91 | { 92 | "name": LASSO_REGRESSION.__class__.__name__, 93 | "model": LASSO_REGRESSION, 94 | "tuning_param_grid": { 95 | "alpha": [0.1, 0.5, 1.0, 2.0], 96 | "max_iter": [1000, 2000, 3000] 97 | } 98 | }, 99 | { 100 | "name": RIDGE_REGRESSION.__class__.__name__, 101 | "model": RIDGE_REGRESSION, 102 | "tuning_param_grid": { 103 | "alpha": [0.1, 0.5, 1.0, 2.0], 104 | "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"] 105 | } 106 | }, 107 | { 108 | "name": XGBOOST_REGRESSION.__class__.__name__, 109 | "model": XGBOOST_REGRESSION, 110 | "tuning_param_grid": { 111 | "n_estimators": [100, 200, 300, 500, 700, 1000], 112 | "max_depth": [3, 5, 7, 9, 10], 113 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 
0.3], 114 | "subsample": [0.5, 0.7, 1], 115 | "colsample_bytree": [0.5, 0.7, 1], 116 | "gamma": [0, 0.1, 0.2], 117 | "reg_alpha": [0, 0.1, 0.5], 118 | "reg_lambda": [0, 0.1, 0.5], 119 | "min_child_weight": [1, 3, 5], 120 | "scale_pos_weight": [1, 2, 3] 121 | } 122 | }, 123 | { 124 | "name": LIGHTGBM_REGRESSION.__class__.__name__, 125 | "model": LIGHTGBM_REGRESSION, 126 | "tuning_param_grid": { 127 | "n_estimators": [100, 200, 300, 500, 700, 1000], 128 | "max_depth": [3, 5, 7, 9, 10, 12], 129 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 130 | "subsample": [0.5, 0.7, 1], 131 | "colsample_bytree": [0.5, 0.7, 1], 132 | "reg_alpha": [0, 0.1, 0.5], 133 | "reg_lambda": [0, 0.1, 0.5], 134 | "min_child_weight": [1, 3, 5], 135 | "num_leaves": [31, 50, 100] 136 | } 137 | }, 138 | { 139 | "name": CATBOOST_REGRESSION.__class__.__name__, 140 | "model": CATBOOST_REGRESSION, 141 | "tuning_param_grid": { 142 | "iterations": [100, 200, 300, 500, 700, 1000, 1500], 143 | "depth": [3, 5, 7, 9, 10, 12], 144 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 145 | "l2_leaf_reg": [0.1, 1, 3, 5, 10], 146 | "border_count": [32, 50, 75, 100, 150] 147 | } 148 | }, 149 | { 150 | "name": DECISION_TREE_REGRESSION.__class__.__name__, 151 | "model": DECISION_TREE_REGRESSION, 152 | "tuning_param_grid": { 153 | "max_depth": [3, 5, 7, 9, 10], 154 | "min_samples_split": [2, 5, 10], 155 | "min_samples_leaf": [1, 2, 4], 156 | "max_features": ["sqrt", "log2"], 157 | "max_leaf_nodes": [10, 20, 30, 40], 158 | "criterion": ["friedman_mse", "poisson", "absolute_error", "squared_error"] 159 | } 160 | }, 161 | { 162 | "name": ELASTIC_NET_REGRESSION.__class__.__name__, 163 | "model": ELASTIC_NET_REGRESSION, 164 | "tuning_param_grid": { 165 | "alpha": [0.1, 0.5, 1.0, 2.0], 166 | "l1_ratio": [0.1, 0.5, 0.7, 1.0] 167 | } 168 | }, 169 | { 170 | "name": HUBER_REGRESSION.__class__.__name__, 171 | "model": HUBER_REGRESSION, 172 | "tuning_param_grid": { 173 | "epsilon": [1.1, 1.35, 1.5, 1.75, 2.0], 174 | "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0] 175 | } 176 | } 177 | ] 178 | 179 | # Wide Regression Model Configurations 180 | WIDE_REGRESSION_MODELS = QUICK_REGRESSION_MODELS + [ 181 | { 182 | "name": KNN_REGRESSION.__class__.__name__, 183 | "model": KNN_REGRESSION, 184 | "tuning_param_grid": { 185 | "n_neighbors": [3, 5, 7, 9], 186 | "weights": ["uniform", "distance"], 187 | "p": [1, 2] 188 | } 189 | }, 190 | { 191 | "name": ADA_BOOST_REGRESSION.__class__.__name__, 192 | "model": ADA_BOOST_REGRESSION, 193 | "tuning_param_grid": { 194 | "n_estimators": [50, 100, 200, 300], 195 | "learning_rate": [0.01, 0.05, 0.1, 0.5, 1], 196 | "loss": ["linear", "square", "exponential"] 197 | } 198 | }, 199 | { 200 | "name": BAYESIAN_RIDGE_REGRESSION.__class__.__name__, 201 | "model": BAYESIAN_RIDGE_REGRESSION, 202 | "tuning_param_grid": { 203 | "max_iter": [100, 200, 300, 400, 500], 204 | "alpha_1": [1e-6, 1e-5, 1e-4], 205 | "alpha_2": [1e-6, 1e-5, 1e-4], 206 | "lambda_1": [1e-6, 1e-5, 1e-4], 207 | "lambda_2": [1e-6, 1e-5, 1e-4] 208 | } 209 | }, 210 | { 211 | "name": RANDOM_FOREST_REGRESSION.__class__.__name__, 212 | "model": RANDOM_FOREST_REGRESSION, 213 | "tuning_param_grid": { 214 | "n_estimators": [50, 100, 200, 300, 400], 215 | "max_depth": [3, 5, 7, 9, 10], 216 | "min_samples_split": [2, 5, 10], 217 | "min_samples_leaf": [1, 2, 4], 218 | "max_features": ["sqrt", "log2", 0.3, 0.5], 219 | "bootstrap": [True, False] 220 | } 221 | }, 222 | { 223 | "name": EXTRA_TREES_REGRESSION.__class__.__name__, 224 | "model": EXTRA_TREES_REGRESSION, 225 | 
"tuning_param_grid": { 226 | 'n_estimators': [100, 200, 300, 500], 227 | 'max_depth': [3, 5, 7, 9, 10], 228 | 'min_samples_split': [2, 5, 10], 229 | 'min_samples_leaf': [1, 2, 4], 230 | 'max_features': ["sqrt", "log2"], 231 | 'bootstrap': [True, False] 232 | } 233 | }, 234 | { 235 | "name": OMP_REGRESSION.__class__.__name__, 236 | "model": OMP_REGRESSION, 237 | "tuning_param_grid": { 238 | "n_nonzero_coefs": [5, 10, 15, 20], 239 | "tol": [1e-4, 1e-3, 1e-2, 1e-1] 240 | } 241 | }, 242 | { 243 | "name": HIST_GRADIENT_BOOSTING_REGRESSION.__class__.__name__, 244 | "model": HIST_GRADIENT_BOOSTING_REGRESSION, 245 | "tuning_param_grid": { 246 | "max_iter": [100, 200, 300, 500], 247 | "max_depth": [3, 5, 7, 9, 10], 248 | "learning_rate": [0.01, 0.1, 0.3], 249 | "min_samples_leaf": [1, 5, 10], 250 | "l2_regularization": [0, 1.0, 10.0], 251 | "max_bins": [128, 255] 252 | } 253 | }, 254 | { 255 | "name": GRADIENT_BOOSTING_REGRESSION.__class__.__name__, 256 | "model": GRADIENT_BOOSTING_REGRESSION, 257 | "tuning_param_grid": { 258 | "n_estimators": [100, 200, 300, 400, 500], 259 | "max_depth": [3, 5, 7, 9, 10], 260 | "learning_rate": [0.01, 0.02, 0.05, 0.1, 0.2], 261 | "min_samples_split": [2, 5, 10], 262 | "min_samples_leaf": [1, 2, 4], 263 | "alpha": [0.1, 0.5, 0.9] 264 | } 265 | }, 266 | { 267 | "name": MLP_REGRESSION.__class__.__name__, 268 | "model": MLP_REGRESSION, 269 | "tuning_param_grid": { 270 | "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)], 271 | "max_iter": [100, 200, 300, 400], 272 | "activation": ["relu", "tanh"], 273 | "alpha": [0.0001, 0.001, 0.01], 274 | "learning_rate": ["constant", "adaptive"], 275 | "learning_rate_init": [0.001, 0.01] 276 | } 277 | } 278 | ] 279 | 280 | return { 281 | "QUICK": QUICK_REGRESSION_MODELS, 282 | "WIDE": WIDE_REGRESSION_MODELS 283 | } 284 | 285 | else: 286 | from sklearn.linear_model import LogisticRegression 287 | from sklearn.tree import DecisionTreeClassifier 288 | from sklearn.ensemble import ( 289 | AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, 290 | ExtraTreesClassifier, HistGradientBoostingClassifier 291 | ) 292 | from sklearn.neighbors import KNeighborsClassifier 293 | from sklearn.naive_bayes import GaussianNB 294 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis 295 | from sklearn.neural_network import MLPClassifier 296 | from xgboost import XGBClassifier 297 | from lightgbm import LGBMClassifier 298 | from catboost import CatBoostClassifier 299 | 300 | 301 | if num_class is None: # Suppose binary 302 | num_class = 2 303 | 304 | if num_class > 2: 305 | xgb_objective = "multi:softmax" 306 | else: 307 | xgb_objective = "binary:logistic" 308 | 309 | # Quick Classification Models 310 | LOGISTIC_REGRESSION = LogisticRegression(max_iter=1000, random_state=random_state, n_jobs=n_jobs) 311 | XGBOOST_CLASSIFIER = XGBClassifier(objective=xgb_objective, random_state=random_state, n_jobs=n_jobs) 312 | LIGHTGBM_CLASSIFIER = LGBMClassifier(verbose=-1, random_state=random_state, n_jobs=n_jobs) 313 | CATBOOST_CLASSIFIER = CatBoostClassifier(allow_writing_files=False, silent=True, random_seed=random_state, thread_count=n_jobs) 314 | DECISION_TREE_CLASSIFIER = DecisionTreeClassifier(random_state=random_state) 315 | RANDOM_FOREST_CLASSIFIER = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs) 316 | NAIVE_BAYES_CLASSIFIER = GaussianNB() 317 | KNN_CLASSIFIER = KNeighborsClassifier(n_jobs=n_jobs) 318 | 319 | # Wide Classification Models 320 | ADA_BOOST_CLASSIFIER = 
AdaBoostClassifier(random_state=random_state) 321 | HIST_GRADIENT_BOOSTING_CLASSIFIER = HistGradientBoostingClassifier(random_state=random_state) 322 | GRADIENT_BOOSTING_CLASSIFIER = GradientBoostingClassifier(random_state=random_state) 323 | EXTRA_TREES_CLASSIFIER = ExtraTreesClassifier(random_state=random_state, n_jobs=n_jobs) 324 | QDA_CLASSIFIER = QuadraticDiscriminantAnalysis() 325 | LDA_CLASSIFIER = LinearDiscriminantAnalysis() 326 | MLP_CLASSIFIER = MLPClassifier( 327 | hidden_layer_sizes=(100,), 328 | early_stopping=True, 329 | tol=0.001, 330 | learning_rate='adaptive', 331 | random_state=random_state 332 | ) 333 | 334 | # Quick Classification Model Configurations 335 | QUICK_CLASSIFICATION_MODELS = [ 336 | { 337 | "name": LOGISTIC_REGRESSION.__class__.__name__, 338 | "model": LOGISTIC_REGRESSION, 339 | "tuning_param_grid": { 340 | "penalty": ["l2"], 341 | "C": [0.01, 0.1, 1, 10, 100], 342 | "max_iter": [100, 200, 300, 400, 500] 343 | } 344 | }, 345 | { 346 | "name": XGBOOST_CLASSIFIER.__class__.__name__, 347 | "model": XGBOOST_CLASSIFIER, 348 | "tuning_param_grid": { 349 | "n_estimators": [100, 200, 300, 500, 700, 1000], 350 | "max_depth": [3, 5, 7, 9, 10], 351 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 352 | "subsample": [0.5, 0.7, 0.9, 1], 353 | "colsample_bytree": [0.5, 0.7, 0.9, 1], 354 | "gamma": [0, 0.1, 0.2, 0.3], 355 | "reg_alpha": [0, 0.1, 0.5, 1], 356 | "reg_lambda": [0, 0.1, 0.5, 1], 357 | "min_child_weight": [1, 3, 5], 358 | "scale_pos_weight": [1, 2, 3] 359 | } 360 | }, 361 | { 362 | "name": LIGHTGBM_CLASSIFIER.__class__.__name__, 363 | "model": LIGHTGBM_CLASSIFIER, 364 | "tuning_param_grid": { 365 | "n_estimators": [100, 200, 300, 500, 700, 1000], 366 | "max_depth": [3, 5, 7, 9, 10, 12], 367 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 368 | "subsample": [0.5, 0.7, 0.9, 1], 369 | "colsample_bytree": [0.5, 0.7, 0.9, 1], 370 | "reg_alpha": [0, 0.1, 0.5, 1], 371 | "reg_lambda": [0, 0.1, 0.5, 1], 372 | "min_child_weight": [1, 3, 5], 373 | "num_leaves": [31, 50, 75, 100] 374 | } 375 | }, 376 | { 377 | "name": CATBOOST_CLASSIFIER.__class__.__name__, 378 | "model": CATBOOST_CLASSIFIER, 379 | "tuning_param_grid": { 380 | "iterations": [100, 200, 300, 500, 700, 1000], 381 | "depth": [3, 5, 7, 9, 10, 12], 382 | "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], 383 | "l2_leaf_reg": [0.1, 1, 3, 5, 10], 384 | "border_count": [32, 50, 75, 100, 150] 385 | } 386 | }, 387 | { 388 | "name": DECISION_TREE_CLASSIFIER.__class__.__name__, 389 | "model": DECISION_TREE_CLASSIFIER, 390 | "tuning_param_grid": { 391 | "max_depth": [3, 5, 7, 9, 10], 392 | "min_samples_split": [2, 5, 10], 393 | "min_samples_leaf": [1, 2, 4], 394 | "max_features": ["sqrt", "log2"], 395 | "max_leaf_nodes": [10, 20, 30, 40], 396 | "criterion": ["gini", "entropy"] 397 | } 398 | }, 399 | { 400 | "name": RANDOM_FOREST_CLASSIFIER.__class__.__name__, 401 | "model": RANDOM_FOREST_CLASSIFIER, 402 | "tuning_param_grid": { 403 | "n_estimators": [100, 200, 300, 400], 404 | "max_depth": [3, 5, 7, 9, 10], 405 | "min_samples_split": [2, 5, 10], 406 | "min_samples_leaf": [1, 2, 4], 407 | "max_features": ["sqrt", "log2", 0.3, 0.5], 408 | "bootstrap": [True, False] 409 | } 410 | }, 411 | { 412 | "name": NAIVE_BAYES_CLASSIFIER.__class__.__name__, 413 | "model": NAIVE_BAYES_CLASSIFIER, 414 | "tuning_param_grid": { 415 | "var_smoothing": [1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10] 416 | } 417 | }, 418 | { 419 | "name": KNN_CLASSIFIER.__class__.__name__, 420 | "model": KNN_CLASSIFIER, 421 | "tuning_param_grid": { 422 | "n_neighbors": 
[3, 5, 7, 9, 11], 423 | "weights": ["uniform", "distance"], 424 | "algorithm": ["auto", "ball_tree", "kd_tree", "brute"], 425 | "leaf_size": [10, 20, 30, 40, 50], 426 | "p": [1, 2] 427 | } 428 | }, 429 | ] 430 | 431 | # Wide Classification Model Configurations 432 | WIDE_CLASSIFICATION_MODELS = QUICK_CLASSIFICATION_MODELS + [ 433 | { 434 | "name": ADA_BOOST_CLASSIFIER.__class__.__name__, 435 | "model": ADA_BOOST_CLASSIFIER, 436 | "tuning_param_grid": { 437 | "n_estimators": [50, 100, 200, 300], 438 | "learning_rate": [0.01, 0.05, 0.1, 0.5, 1], 439 | "algorithm": ["SAMME", "SAMME.R"] 440 | } 441 | }, 442 | { 443 | "name": HIST_GRADIENT_BOOSTING_CLASSIFIER.__class__.__name__, 444 | "model": HIST_GRADIENT_BOOSTING_CLASSIFIER, 445 | "tuning_param_grid": { 446 | "max_iter": [100, 200, 300, 500], 447 | "max_depth": [3, 5, 7, 9, 10], 448 | "learning_rate": [0.01, 0.1, 0.3], 449 | "min_samples_leaf": [1, 5, 10], 450 | "l2_regularization": [0, 1.0, 10.0], 451 | "max_bins": [128, 255] 452 | } 453 | }, 454 | { 455 | "name": GRADIENT_BOOSTING_CLASSIFIER.__class__.__name__, 456 | "model": GRADIENT_BOOSTING_CLASSIFIER, 457 | "tuning_param_grid": { 458 | 'n_estimators': [100, 200, 300, 500], 459 | 'learning_rate': [0.01, 0.1, 0.3], 460 | 'max_depth': [3, 5, 7, 9, 10], 461 | 'min_samples_split': [2, 5, 10], 462 | 'min_samples_leaf': [1, 2, 4] 463 | } 464 | }, 465 | { 466 | "name": EXTRA_TREES_CLASSIFIER.__class__.__name__, 467 | "model": EXTRA_TREES_CLASSIFIER, 468 | "tuning_param_grid": { 469 | 'n_estimators': [100, 200, 300, 500], 470 | 'max_depth': [3, 5, 7, 9, 10], 471 | 'min_samples_split': [2, 5, 10], 472 | 'min_samples_leaf': [1, 2, 4], 473 | 'max_features': ["sqrt", "log2"], 474 | 'bootstrap': [True, False] 475 | } 476 | }, 477 | { 478 | "name": QDA_CLASSIFIER.__class__.__name__, 479 | "model": QDA_CLASSIFIER, 480 | "tuning_param_grid": { 481 | "reg_param": [0.0, 0.1, 0.5, 1.0], 482 | "tol": [1e-4, 1e-3, 1e-2, 1e-1] 483 | } 484 | }, 485 | { 486 | "name": LDA_CLASSIFIER.__class__.__name__, 487 | "model": LDA_CLASSIFIER, 488 | "tuning_param_grid": { 489 | "solver": ["svd", "lsqr", "eigen"], 490 | "shrinkage": [0.1, 0.5, 1.0] 491 | } 492 | }, 493 | { 494 | "name": MLP_CLASSIFIER.__class__.__name__, 495 | "model": MLP_CLASSIFIER, 496 | "tuning_param_grid": { 497 | "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)], 498 | "max_iter": [100, 200, 300, 400], 499 | "activation": ["relu", "tanh"], 500 | "alpha": [0.0001, 0.001, 0.01], 501 | "learning_rate": ["constant", "adaptive"], 502 | "learning_rate_init": [0.001, 0.01] 503 | } 504 | } 505 | ] 506 | 507 | return { 508 | "QUICK": QUICK_CLASSIFICATION_MODELS, 509 | "WIDE": WIDE_CLASSIFICATION_MODELS 510 | } -------------------------------------------------------------------------------- /flexml/helpers/validators.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import Optional, List 3 | from flexml.config import EVALUATION_METRICS, FEATURE_ENGINEERING_METHODS, CROSS_VALIDATION_METHODS 4 | from flexml.logger import get_logger 5 | import re 6 | 7 | def eval_metric_checker( 8 | ml_task_type: str, 9 | eval_metric: Optional[str] = None, 10 | all_evaluation_metrics: Optional[List[str]] = None, 11 | default_evaluation_metric: Optional[str] = None 12 | ) -> str: 13 | """ 14 | Since eval_metric setting and validation is a common process for both Regression and Classification tasks... 15 | this method is used to set and validate the evaluation metric. 
16 | 17 | Parameters 18 | ---------- 19 | ml_task_type : str 20 | The type of ML task ('Regression' or 'Classification') 21 | 22 | eval_metric : str, optional (default='R2' for Regression, 'Accuracy' for Classification) 23 | The evaluation metric to use for model evaluation 24 | 25 | - Available evaluation metrics for Regression: 26 | - R2, MAE, MSE, RMSE, MAPE 27 | 28 | - Available evaluation metrics for Classification: 29 | - Accuracy, Precision, Recall, F1 Score, ROC-AUC 30 | 31 | all_evaluation_metrics : List[str], (default=None) 32 | All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE', 'MAPE'] for Regression 33 | 34 | If passed as None, they will be fetched from the config file 35 | 36 | default_evaluation_metric : str, (default=None) 37 | The default evaluation metric to use for the current task (Regression or Classification) e.g. 'R2' for Regression, 'Accuracy' for Classification 38 | 39 | If passed as None, it will be fetched from the config file 40 | 41 | Returns 42 | ------- 43 | str 44 | The evaluation metric to use for model evaluation for the current task (Regression or Classification) 45 | """ 46 | logger = get_logger(__name__, "PROD", False) 47 | 48 | if default_evaluation_metric is None or all_evaluation_metrics is None: 49 | default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"] 50 | all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"] 51 | 52 | if eval_metric is None: 53 | return default_evaluation_metric 54 | 55 | if ml_task_type == "Regression": 56 | eval_metric = eval_metric.upper() 57 | else: 58 | # Normalize input for flexible matching 59 | original_metric = eval_metric 60 | normalized_input = re.sub(r'[^a-zA-Z0-9]', '', eval_metric).lower() 61 | normalized_config = {re.sub(r'[^a-zA-Z0-9]', '', m).lower(): m 62 | for m in all_evaluation_metrics} 63 | 64 | if normalized_input in normalized_config: 65 | eval_metric = normalized_config[normalized_input] 66 | else: 67 | error_msg = (f"'{original_metric}' is not a valid evaluation metric for {ml_task_type}, " 68 | f"expected one of: {all_evaluation_metrics}") 69 | logger.error(error_msg) 70 | raise ValueError(error_msg) 71 | 72 | if eval_metric not in all_evaluation_metrics: 73 | error_msg = f"Validation failed for {eval_metric} - not in configured metrics" 74 | logger.error(error_msg) 75 | raise ValueError(error_msg) 76 | 77 | return eval_metric 78 | 79 | def random_state_checker(random_state: Optional[int] = None) -> Optional[int]: 80 | """ 81 | Validates the random_state parameter 82 | 83 | Parameters 84 | ---------- 85 | random_state : int, optional (default=None) 86 | Random state value 87 | 88 | Returns 89 | ------- 90 | int or None 91 | The validated random state value 92 | """ 93 | logger = get_logger(__name__, "PROD", False) 94 | 95 | if random_state is not None and (not isinstance(random_state, int) or random_state < 0): 96 | error_msg = f"random_state should be either None or a non-negative integer, got {random_state}" 97 | logger.error(error_msg) 98 | raise ValueError(error_msg) 99 | 100 | return random_state 101 | 102 | def cross_validation_checker( 103 | df: pd.DataFrame, 104 | cv_method: Optional[str] = None, 105 | n_folds: Optional[int] = None, 106 | test_size: Optional[float] = None, 107 | groups_col: Optional[str] = None, 108 | available_cv_methods: Optional[dict] = None, 109 | ml_task_type: Optional[str] = None 110 | ) -> str: 111 | 112 | """ 113 | df : pd.DataFrame 114 | The DataFrame that cross-validation will be performed on 115 | 116 |
cv_method : str, (default='kfold' for Regression, 'stratified_kfold' for Classification if ml_task_type is not None) 117 | The cross-validation method to use 118 | 119 | If passed as None, the default cross-validation method for the corresponding ml_task_type will be used, as long as ml_task_type is not None 120 | 121 | n_folds : int, optional (default=None) 122 | Number of folds to use for cross-validation 123 | 124 | test_size : float, optional (default=None) 125 | The proportion of the dataset to include in the test split 126 | 127 | groups_col : str, optional (default=None) 128 | The column in the DataFrame that contains the groups for group-based cross-validation methods 129 | 130 | available_cv_methods : dict, optional (default=None) 131 | A dictionary containing the available cross-validation methods 132 | 133 | ml_task_type : str, optional (default=None) 134 | The type of ML task ('Regression' or 'Classification') 135 | 136 | Returns 137 | ------- 138 | str 139 | The cross-validation method to use for the current task (Regression or Classification) 140 | """ 141 | logger = get_logger(__name__, "PROD", False) 142 | 143 | if ml_task_type is not None and ml_task_type not in ['Regression', 'Classification']: 144 | error_msg = f"ml_task_type should be 'Regression' or 'Classification', got {ml_task_type}" 145 | logger.error(error_msg) 146 | raise ValueError(error_msg) 147 | 148 | if available_cv_methods is None: 149 | if ml_task_type is not None: 150 | available_cv_methods = CROSS_VALIDATION_METHODS[ml_task_type] 151 | else: 152 | available_cv_methods = CROSS_VALIDATION_METHODS['all'] 153 | 154 | if cv_method is None: 155 | if ml_task_type is not None: 156 | if ml_task_type == 'Regression': 157 | cv_method = 'kfold' 158 | elif ml_task_type == "Classification": 159 | cv_method = 'stratified_kfold' 160 | 161 | else: 162 | cv_method = cv_method.lower() 163 | if available_cv_methods.get(cv_method) is None: 164 | # If cv_method is not found in the available cv methods, check the version without '_' --> 165 | # e.g.
'stratified_kfold' and 'stratifiedkfold' 166 | if cv_method in available_cv_methods.values(): 167 | cv_method = list(available_cv_methods.keys())[list(available_cv_methods.values()).index(cv_method)] 168 | 169 | # Check if cv_method is still None 170 | if cv_method is None or cv_method not in list(available_cv_methods.keys()): 171 | error_msg = f"cv_method is not found in the available cross-validation methods, expected one of {list(available_cv_methods.keys())}, got {cv_method}" 172 | logger.error(error_msg) 173 | raise ValueError(error_msg) 174 | 175 | if n_folds is not None and (not isinstance(n_folds, int) or n_folds < 2): 176 | error_msg = "`n_folds` must be an integer >= 2 if provided" 177 | logger.error(error_msg) 178 | raise ValueError(error_msg) 179 | 180 | if test_size is not None and (not isinstance(test_size, float) or not 0 < test_size < 1): 181 | error_msg = f"test_size parameter expected to be a float between 0 and 1, got {test_size}" 182 | logger.error(error_msg) 183 | raise ValueError(error_msg) 184 | 185 | if groups_col is not None and groups_col not in df.columns: 186 | error_msg = f"groups_col should be a column in the DataFrame, got {groups_col}" 187 | logger.error(error_msg) 188 | raise ValueError(error_msg) 189 | 190 | if cv_method in ["group_kfold", "group_shuffle_split"] and groups_col is None: 191 | error_msg = "`groups_col` must be provided for group-based methods" 192 | logger.error(error_msg) 193 | raise ValueError(error_msg) 194 | 195 | return cv_method 196 | 197 | def validate_inputs( 198 | data: pd.DataFrame, 199 | target_col: str, 200 | drop_columns=None, 201 | categorical_imputation_method="mode", 202 | numerical_imputation_method="mean", 203 | column_imputation_map=None, 204 | numerical_imputation_constant=0.0, 205 | categorical_imputation_constant="Unknown", 206 | encoding_method="label_encoder", 207 | onehot_limit=25, 208 | encoding_method_map=None, 209 | ordinal_encode_map=None, 210 | normalize=None 211 | ): 212 | """ 213 | Validates the input parameters for the feature engineering process 214 | 215 | Parameters 216 | ---------- 217 | data : pd.DataFrame 218 | The input data for the model training process 219 | 220 | target_col : str 221 | The target column name in the data 222 | 223 | drop_columns : list, default=None 224 | Columns that will be dropped from the data. 225 | 226 | categorical_imputation_method : str, default='mode' 227 | Imputation method for categorical columns. Options: 228 | * 'mode': Replace missing values with the most frequent value. 229 | * 'constant': Replace missing values with a constant value. 230 | * 'drop': Drop rows with missing values. 231 | 232 | numerical_imputation_method : str, default='mean' 233 | Imputation method for numerical columns. Options: 234 | * 'mean': Replace missing values with the column mean. 235 | * 'median': Replace missing values with the column median. 236 | * 'mode': Replace missing values with the column mode. 237 | * 'constant': Replace missing values with a constant value. 238 | * 'drop': Drop rows with missing values. 239 | 240 | column_imputation_map : dict, default=None 241 | Custom mapping of columns to specific imputation methods. 242 | Example usage: {'column_name': 'mean', 'column_name2': 'mode'} 243 | 244 | numerical_imputation_constant : float, default=0.0 245 | The constant value for imputing numerical columns when 'constant' is selected. 
246 | 247 | categorical_imputation_constant : str, default='Unknown' 248 | The constant value for imputing categorical columns when 'constant' is selected. 249 | 250 | encoding_method : str, default='label_encoder' 251 | Encoding method for categorical columns. Options: 252 | * 'label_encoder': Use label encoding 253 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html 254 | * 'onehot_encoder': Use one-hot encoding 255 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html 256 | * 'ordinal_encoder': Use ordinal encoding 257 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html 258 | 259 | onehot_limit : int, default=25 260 | Maximum number of categories to use for one-hot encoding. 261 | 262 | encoding_method_map : dict, default=None 263 | Custom mapping of columns to encoding methods. 264 | Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'} 265 | 266 | ordinal_encode_map : dict, default=None 267 | Custom mapping of columns to category order for ordinal encoding. 268 | Example usage: {'column_name': ['low', 'medium', 'high']} 269 | 270 | normalize : str, default=None 271 | Scaling or normalization method for numerical columns. Options: 272 | * 'standard_scaler': Standardize the data using StandardScaler 273 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 274 | * 'minmax_scaler': Scale the data using MinMaxScaler 275 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html 276 | * 'robust_scaler': Scale the data using RobustScaler 277 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html 278 | * 'quantile_transformer': Transform the data using QuantileTransformer 279 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html 280 | * 'maxabs_scaler': Scale the data using MaxAbsScaler 281 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html 282 | * 'normalize_scaler': Normalize the data to unit length 283 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html 284 | """ 285 | # Check if any of the columns in drop_columns match the target_col 286 | if drop_columns is not None and target_col in drop_columns: 287 | error_msg = f"The target column '{target_col}' cannot be in the drop_columns list" 288 | raise ValueError(error_msg) 289 | 290 | if drop_columns is None: 291 | drop_columns = [] 292 | remaining_columns = set(data.columns) - set(drop_columns) 293 | 294 | # Ensure the target column is in the remaining columns and there's at least one feature column 295 | if target_col not in remaining_columns or len(remaining_columns) < 2: 296 | error_msg = ( 297 | f"After dropping columns, only {remaining_columns} remain. " 298 | f"There should be at least one feature column and the target column '{target_col}' remaining." 299 | ) 300 | raise ValueError(error_msg) 301 | 302 | # Check if categorical_imputation_method is valid 303 | if categorical_imputation_method not in FEATURE_ENGINEERING_METHODS["accepted_categorical_imputations_methods"]: 304 | error_msg = f"The categorical_imputation_method '{categorical_imputation_method}' is not valid.
Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_categorical_imputations_methods']}" 305 | raise ValueError(error_msg) 306 | 307 | # Check if numerical_imputation_method is valid 308 | if numerical_imputation_method not in FEATURE_ENGINEERING_METHODS["accepted_numeric_imputations_methods"]: 309 | error_msg = f"The numerical_imputation_method '{numerical_imputation_method}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_numeric_imputations_methods']}" 310 | raise ValueError(error_msg) 311 | 312 | # Check if encoding_method is valid 313 | if encoding_method not in FEATURE_ENGINEERING_METHODS["accepted_encoding_methods"]: 314 | error_msg = f"The encoding_method '{encoding_method}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_encoding_methods']}" 315 | raise ValueError(error_msg) 316 | 317 | # Check if onehot_limit is a non-negative integer 318 | if not isinstance(onehot_limit, int) or onehot_limit < 0: 319 | error_msg = f"onehot_limit should be a non-negative integer, got {onehot_limit}" 320 | raise ValueError(error_msg) 321 | 322 | # Check if drop_columns columns are in data 323 | if drop_columns is not None: 324 | for col in drop_columns: 325 | if col not in data.columns: 326 | error_msg = f"The column '{col}' in drop_columns is not in the data" 327 | raise ValueError(error_msg) 328 | 329 | # Check if columns in column_imputation_map are in data and methods are valid 330 | if column_imputation_map is not None: 331 | for col, method in column_imputation_map.items(): 332 | if col not in data.columns: 333 | error_msg = f"The column '{col}' in column_imputation_map is not in the data" 334 | raise ValueError(error_msg) 335 | 336 | if col in data.select_dtypes(include=['number']).columns: 337 | if method not in FEATURE_ENGINEERING_METHODS["accepted_numeric_imputations_methods"]: 338 | error_msg = f"The numeric imputation method '{method}' for column '{col}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_numeric_imputations_methods']}" 339 | raise ValueError(error_msg) 340 | else: 341 | if method not in FEATURE_ENGINEERING_METHODS["accepted_categorical_imputations_methods"]: 342 | error_msg = f"The categorical imputation method '{method}' for column '{col}' is not valid.
Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_categorical_imputations_methods']}" 343 | raise ValueError(error_msg) 344 | 345 | # Check if numerical_imputation_constant is a number 346 | if not isinstance(numerical_imputation_constant, (int, float)): 347 | error_msg = f"numerical_imputation_constant should be a number, got {type(numerical_imputation_constant)}" 348 | raise ValueError(error_msg) 349 | 350 | # Check if categorical_imputation_constant is a string 351 | if not isinstance(categorical_imputation_constant, str): 352 | error_msg = f"categorical_imputation_constant should be a string, got {type(categorical_imputation_constant)}" 353 | raise ValueError(error_msg) 354 | 355 | # Check if encoding_method is ordinal_encoder and ordinal_encode_map is provided for every categorical column 356 | if encoding_method == "ordinal_encoder": 357 | if ordinal_encode_map is None: 358 | error_msg = "Ordinal encoding is selected but no ordinal_encode_map is provided" 359 | raise ValueError(error_msg) 360 | # Check if ordinal_encode_map is provided for every categorical column 361 | for col in data.select_dtypes(include=['object', 'category']).columns: 362 | if col not in ordinal_encode_map: 363 | error_msg = f"Ordinal encoding is selected for column '{col}' but no ordinal_encode_map is provided" 364 | raise ValueError(error_msg) 365 | 366 | # Check if methods inside encoding_method_map are valid and columns are in data 367 | if encoding_method_map is not None: 368 | for col, method in encoding_method_map.items(): 369 | if col not in data.columns: 370 | error_msg = f"The column '{col}' in encoding_method_map is not in the data" 371 | raise ValueError(error_msg) 372 | 373 | if col in drop_columns: 374 | error_msg = f"The column '{col}' in encoding_method_map is in drop_columns" 375 | raise ValueError(error_msg) 376 | 377 | if method not in FEATURE_ENGINEERING_METHODS["accepted_encoding_methods"]: 378 | error_msg = f"The encoding method '{method}' for column '{col}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_encoding_methods']}" 379 | raise ValueError(error_msg) 380 | 381 | # Check if there is an ordinal_encoder among the methods and ordinal_encode_map is provided 382 | if method == "ordinal_encoder": 383 | if ordinal_encode_map is None: 384 | error_msg = f"Ordinal encoding is selected for column '{col}' but no ordinal_encode_map is provided" 385 | raise ValueError(error_msg) 386 | # Check if map for col is provided within ordinal_encode_map 387 | if col not in ordinal_encode_map: 388 | error_msg = f"Ordinal encoding is selected for column '{col}' but no ordinal_encode_map is provided" 389 | raise ValueError(error_msg) 390 | 391 | # Check if normalize is valid 392 | if normalize is not None and normalize not in FEATURE_ENGINEERING_METHODS["accepted_standardization_methods"]: 393 | error_msg = f"The normalize method '{normalize}' is not valid. Expected one of the following: {FEATURE_ENGINEERING_METHODS['accepted_standardization_methods']}" 394 | raise ValueError(error_msg) 395 | 396 | # Check if encoding_method is ordinal_encoder 397 | if encoding_method == "ordinal_encoder": 398 | if ordinal_encode_map is None: 399 | error_msg = "Ordinal encoding is selected, but no ordinal_encode_map is provided."
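# Illustrative note: assuming a hypothetical categorical column 'size' in data, a mapping that satisfies the checks below would be ordinal_encode_map={'size': ['low', 'medium', 'high']}, mirroring the 'Example usage' in the docstring above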
400 | raise ValueError(error_msg) 401 | 402 | # Get all categorical columns 403 | categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist() 404 | 405 | # Check that all categorical columns are in ordinal_encode_map 406 | for col in categorical_columns: 407 | if col not in ordinal_encode_map: 408 | error_msg = f"Ordinal encoding is selected, but column '{col}' is missing in ordinal_encode_map." 409 | raise ValueError(error_msg) 410 | 411 | # Get distinct values in the column 412 | distinct_values = set(data[col].dropna().unique()) 413 | map_values = set(ordinal_encode_map[col]) 414 | 415 | # Check if the values in ordinal_encode_map match exactly with the distinct values 416 | if distinct_values != map_values: 417 | error_msg = ( 418 | f"Distinct values in column '{col}' do not match the ones given in ordinal_encode_map. " 419 | f"Ensure they match exactly." 420 | ) 421 | raise ValueError(error_msg) 422 | 423 | # Check that ordinal_encode_map does not include extra columns 424 | extra_columns = set(ordinal_encode_map.keys()) - set(categorical_columns) 425 | if extra_columns: 426 | error_msg = ( 427 | f"Ordinal_encode_map includes extra columns not in the categorical columns: {extra_columns}. " 428 | f"Remove these columns from the mapping." 429 | ) 430 | raise ValueError(error_msg) 431 | 432 | # Check if encoding_method_map is provided and has ordinal_encoder 433 | if encoding_method_map: 434 | ordinal_columns = [ 435 | col for col, method in encoding_method_map.items() if method == "ordinal_encoder" 436 | ] 437 | else: 438 | ordinal_columns = [] 439 | 440 | if ordinal_columns: 441 | if not ordinal_encode_map: 442 | raise ValueError( 443 | "Ordinal encoding is specified in encoding_method_map, but no ordinal_encode_map is provided." 444 | ) 445 | 446 | # Validate only the columns specified for ordinal encoding 447 | for col in ordinal_columns: 448 | if col not in ordinal_encode_map: 449 | raise ValueError( 450 | f"Column '{col}' is marked for ordinal encoding but is missing in ordinal_encode_map." 451 | ) 452 | 453 | # Get distinct values in the column 454 | distinct_values = set(data[col].dropna().unique()) 455 | map_values = set(ordinal_encode_map[col]) 456 | 457 | # Check if the values in ordinal_encode_map match exactly with the distinct values 458 | if distinct_values != map_values: 459 | raise ValueError( 460 | f"Unique values in '{col}' do not match the ones given in ordinal_encode_map. " 461 | f"Ensure they match exactly." 462 | ) 463 | 464 | # Ensure ordinal_encode_map does not include extra columns 465 | extra_columns = set(ordinal_encode_map.keys()) - set(ordinal_columns) 466 | if extra_columns: 467 | raise ValueError( 468 | f"Ordinal_encode_map includes extra columns not specified for ordinal encoding: {extra_columns}. " 469 | f"Remove these columns from the mapping."
470 | ) 471 | 472 | return True -------------------------------------------------------------------------------- /flexml/_feature_engineer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, MaxAbsScaler, normalize 7 | from typing import List, Optional, Dict, Any 8 | from flexml.logger import get_logger 9 | 10 | class ColumnDropper(BaseEstimator, TransformerMixin): 11 | """ 12 | A transformer to drop specified columns from a dataset 13 | """ 14 | def __init__(self, drop_columns: Optional[List[str]] = None): 15 | self.drop_columns = drop_columns or [] 16 | 17 | def fit(self, X, y=None): 18 | return self 19 | 20 | def transform(self, X): 21 | """ 22 | Drops specified columns from the input DataFrame 23 | 24 | Returns 25 | ------- 26 | pd.DataFrame 27 | A DataFrame with the specified columns dropped 28 | """ 29 | return X.drop(columns=self.drop_columns, axis=1, errors='ignore') 30 | 31 | 32 | class ColumnImputer(BaseEstimator, TransformerMixin): 33 | """ 34 | A transformer to impute missing values in a dataset 35 | """ 36 | def __init__( 37 | self, 38 | column_imputation_mapper: Dict[str, str], 39 | numerical_imputation_constant: float = 0.0, 40 | categorical_imputation_constant: str = "Unknown" 41 | ): 42 | self.column_imputation_mapper = column_imputation_mapper 43 | self.numerical_imputation_constant = numerical_imputation_constant 44 | self.categorical_imputation_constant = categorical_imputation_constant 45 | 46 | def fit(self, X, y=None): 47 | return self 48 | 49 | def transform(self, X) -> pd.DataFrame: 50 | # Categorical columns are converted to string 51 | categorical_cols = X.select_dtypes(exclude=['number']).columns 52 | X[categorical_cols] = X[categorical_cols].astype(str) 53 | 54 | for column, method in self.column_imputation_mapper.items(): 55 | X[column] = X[column].replace("nan", pd.NA) 56 | if method == "mean": 57 | mean_value = X[column].mean() 58 | X[column] = X[column].fillna(mean_value) 59 | 60 | elif method == "median": 61 | median_value = X[column].median() 62 | X[column] = X[column].fillna(median_value) 63 | 64 | elif method == "mode": 65 | mode_values = X[column].mode() 66 | if len(mode_values) > 0: 67 | mode_value = mode_values[0] 68 | else: 69 | # TODO: Notify user that mode is not available 70 | mode_value = self.categorical_imputation_constant 71 | X[column] = X[column].replace("nan", np.nan).fillna(mode_value) 72 | 73 | elif method == "constant": 74 | if X[column].dtype != 'object': 75 | constant = self.numerical_imputation_constant 76 | else: 77 | constant = self.categorical_imputation_constant 78 | X[column] = X[column].replace("nan", np.nan).fillna(constant) 79 | 80 | elif method == "drop": 81 | X = X.dropna(subset=[column]) 82 | 83 | else: 84 | raise ValueError(f"Invalid imputation method: {method}") 85 | 86 | return X 87 | 88 | 89 | class CategoricalEncoder(BaseEstimator, TransformerMixin): 90 | """ 91 | A transformer to encode categorical columns in a dataset 92 | """ 93 | def __init__( 94 | self, 95 | encoding_method_mapper: Dict[str, str], 96 | ordinal_map: Dict[str, List[str]], 97 | onehot_limit: int = 25 98 | ): 99 | self.encoding_method_mapper = encoding_method_mapper 100 | self.ordinal_map 
= ordinal_map 101 | self.onehot_limit = onehot_limit 102 | self.label_encoders = {} 103 | self.onehot_encoders = {} 104 | self.ordinal_encoders = {} 105 | 106 | def fit(self, X, y=None): 107 | # Categorical columns are converted to string 108 | categorical_cols = X.select_dtypes(exclude=['number']).columns 109 | X[categorical_cols] = X[categorical_cols].astype(str) 110 | 111 | for col, method in self.encoding_method_mapper.items(): 112 | if method == "label_encoder": 113 | encoder = LabelEncoder() 114 | encoder.fit(X[col].fillna("Unknown")) 115 | self.label_encoders[col] = encoder 116 | 117 | elif method == "onehot_encoder": 118 | encoder = OneHotEncoder( 119 | sparse_output=False, 120 | handle_unknown="ignore", 121 | max_categories=self.onehot_limit 122 | ) 123 | encoder.fit(X[[col]]) 124 | self.onehot_encoders[col] = encoder 125 | 126 | elif method == "ordinal_encoder": 127 | if col in self.ordinal_map: 128 | categories = [self.ordinal_map[col]] 129 | encoder = OrdinalEncoder(categories=categories) 130 | encoder.fit(X[[col]]) 131 | self.ordinal_encoders[col] = encoder 132 | 133 | return self 134 | 135 | def transform(self, X) -> pd.DataFrame: 136 | # Categorical columns are converted to string 137 | categorical_cols = X.select_dtypes(exclude=['number']).columns 138 | X[categorical_cols] = X[categorical_cols].astype(str) 139 | 140 | for col, method in self.encoding_method_mapper.items(): 141 | if method == "label_encoder": 142 | if col in self.label_encoders: 143 | encoder = self.label_encoders[col] 144 | # Identify known and unknown labels 145 | known_mask = X[col].isin(encoder.classes_) 146 | # Transform known labels 147 | if known_mask.any(): 148 | X.loc[known_mask, col] = encoder.transform(X.loc[known_mask, col]) 149 | # Handle unknown labels 150 | X.loc[~known_mask, col] = -1 151 | X[col] = X[col].astype(int) 152 | 153 | elif method == "onehot_encoder": 154 | if col in self.onehot_encoders: 155 | encoder = self.onehot_encoders[col] 156 | one_hot_encoded = encoder.transform(X[[col]]) 157 | one_hot_df = pd.DataFrame( 158 | one_hot_encoded, 159 | columns=encoder.get_feature_names_out([col]), 160 | index=X.index 161 | ) 162 | X = pd.concat([X.drop(columns=[col]), one_hot_df], axis=1) 163 | 164 | elif method == "ordinal_encoder": 165 | if col in self.ordinal_encoders: 166 | encoder = self.ordinal_encoders[col] 167 | # Identify known and unknown categories 168 | known_categories = encoder.categories_[0] 169 | known_mask = X[col].isin(known_categories) 170 | # Transform known categories 171 | if known_mask.any(): 172 | X.loc[known_mask, col] = encoder.transform(X.loc[known_mask, [col]])[:, 0] 173 | # Handle unknown categories 174 | X.loc[~known_mask, col] = -1 175 | X[col] = X[col].astype(int) 176 | 177 | return X 178 | 179 | 180 | class NumericalNormalizer(BaseEstimator, TransformerMixin): 181 | """ 182 | A transformer to normalize numerical columns in a dataset 183 | """ 184 | def __init__(self, normalization_method_map: Dict[str, str]): 185 | self.normalization_method_map = normalization_method_map or {} 186 | self.scalers = {} 187 | self.logger = get_logger(__name__, "PROD") 188 | 189 | def fit(self, X, y=None): 190 | for column, method in self.normalization_method_map.items(): 191 | if method == "standard_scaler": 192 | scaler = StandardScaler() 193 | 194 | elif method == "minmax_scaler": 195 | scaler = MinMaxScaler() 196 | 197 | elif method == "robust_scaler": 198 | scaler = RobustScaler() 199 | 200 | elif method == "quantile_transformer": 201 | scaler = QuantileTransformer() 202 | 
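# Note: QuantileTransformer maps each column to a uniform output distribution by default; like the other scalers built here, it is fit on one column at a time further below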
203 | elif method == "maxabs_scaler": 204 | scaler = MaxAbsScaler() 205 | 206 | elif method == "normalize_scaler": 207 | scaler = None 208 | 209 | else: 210 | self.logger.warning(f"Unknown method '{method}' for column '{column}'. Skipping.") 211 | continue 212 | 213 | if scaler is not None: 214 | scaler.fit(X[[column]]) 215 | self.scalers[column] = scaler 216 | else: 217 | self.scalers[column] = None 218 | 219 | return self 220 | 221 | def transform(self, X): 222 | for column, scaler in self.scalers.items(): 223 | if scaler is None: # Directly use sklearn's normalize method 224 | X[column] = normalize(X[[column]], axis=0).flatten() # Normalize to unit length 225 | else: 226 | X[column] = scaler.transform(X[[column]]) 227 | 228 | return X 229 | 230 | 231 | class FeatureEngineering: 232 | """ 233 | A class for performing feature engineering on a dataset 234 | 235 | Parameters 236 | ---------- 237 | data : pd.DataFrame 238 | The input data for the model training process 239 | 240 | target_col : str 241 | The target column name in the data 242 | 243 | drop_columns : list, default=None 244 | Columns that will be dropped from the data 245 | 246 | categorical_imputation_method : str, default='mode' 247 | Imputation method for categorical columns. Options: 248 | * 'mode': Replace missing values with the most frequent value 249 | * 'constant': Replace missing values with a constant value 250 | * 'drop': Drop rows with missing values 251 | 252 | numerical_imputation_method : str, default='mean' 253 | Imputation method for numerical columns. Options: 254 | * 'mean': Replace missing values with the column mean 255 | * 'median': Replace missing values with the column median 256 | * 'mode': Replace missing values with the column mode 257 | * 'constant': Replace missing values with a constant value 258 | * 'drop': Drop rows with missing values 259 | 260 | column_imputation_map : dict, default=None 261 | Custom mapping of columns to specific imputation methods 262 | Example usage: {'column_name1': 'mean', 'column_name2': 'mode'} 263 | 264 | categorical_imputation_constant : str, default='Unknown' 265 | The constant value for imputing categorical columns when 'constant' is selected 266 | 267 | numerical_imputation_constant : float, default=0.0 268 | The constant value for imputing numerical columns when 'constant' is selected 269 | 270 | encoding_method : str, default='onehot_encoder' 271 | Encoding method for categorical columns. Options: 272 | * 'label_encoder': Use label encoding 273 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html 274 | * 'onehot_encoder': Use one-hot encoding 275 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html 276 | * 'ordinal_encoder': Use ordinal encoding 277 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html 278 | 279 | onehot_limit : int, default=25 280 | Maximum number of categories to use for one-hot encoding 281 | 282 | encoding_method_map : dict, default=None 283 | Custom mapping of columns to encoding methods 284 | Example usage: {'column_name': 'onehot_encoder', 'column_name2': 'label_encoder'} 285 | 286 | ordinal_encode_map : dict, default=None 287 | Custom mapping of columns to category order for ordinal encoding 288 | Example usage: {'column_name': ['low', 'medium', 'high']} 289 | 290 | normalize : str, default=None 291 | Scaling or normalization method for numerical columns.
Options: 292 | * 'standard_scaler': Standardize the data using StandardScaler 293 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 294 | * 'minmax_scaler': Scale the data using MinMaxScaler 295 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html 296 | * 'robust_scaler': Scale the data using RobustScaler 297 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html 298 | * 'quantile_transformer': Transform the data using QuantileTransformer 299 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html 300 | * 'maxabs_scaler': Scale the data using MaxAbsScaler 301 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html 302 | * 'normalize_scaler': Normalize the data to unit length 303 | * https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html 304 | """ 305 | def __init__( 306 | self, 307 | data: pd.DataFrame, 308 | target_col: str, 309 | drop_columns: Optional[List[str]] = None, 310 | categorical_imputation_method: str = "mode", 311 | numerical_imputation_method: str = "mean", 312 | column_imputation_map: Optional[Dict[str, str]] = None, 313 | categorical_imputation_constant: str = "Unknown", 314 | numerical_imputation_constant: float = 0.0, 315 | encoding_method: str = "onehot_encoder", 316 | onehot_limit: int = 25, 317 | encoding_method_map: Optional[Dict[str, str]] = None, 318 | ordinal_encode_map: Optional[Dict[str, List[str]]] = None, 319 | normalize: Optional[str] = None 320 | ): 321 | self.logger = get_logger(__name__, "PROD") 322 | 323 | # Initialize attributes 324 | self.data = data 325 | self.target_col = target_col 326 | self.drop_columns = drop_columns or [] 327 | self.categorical_imputation_method = categorical_imputation_method 328 | self.numerical_imputation_method = numerical_imputation_method 329 | self.column_imputation_map = column_imputation_map or {} 330 | self.numerical_imputation_constant = numerical_imputation_constant 331 | self.categorical_imputation_constant = categorical_imputation_constant 332 | self.encoding_method = encoding_method 333 | self.onehot_limit = onehot_limit 334 | self.encoding_method_map = encoding_method_map or {} 335 | self.ordinal_encode_map = ordinal_encode_map or {} 336 | self.normalize = normalize 337 | self.y_class_mapping = None 338 | 339 | def setup(self, data: Optional[pd.DataFrame] = None): 340 | """ 341 | Setup the feature engineering pipeline 342 | 343 | Parameters 344 | ---------- 345 | data : pd.DataFrame, default=None 346 | The data to override the existing data attribute 347 | """ 348 | if data is not None: 349 | self.data = data 350 | 351 | # Initialize encoder for target column 352 | self.target_encoder = LabelEncoder() 353 | # Separate features and target column 354 | self.feature_data = self.data.drop(columns=[self.target_col, *self.drop_columns], errors='ignore') 355 | self.numerical_columns = self.feature_data.select_dtypes(include=['number']).columns.tolist() 356 | self.categorical_columns = self.feature_data.columns.difference(self.numerical_columns).tolist() 357 | 358 | # Separate imputation mapping for numerical and categorical columns 359 | self.numerical_column_imputation_mapper = { 360 | col: self.numerical_imputation_method for col in self.numerical_columns 361 | } 362 | 363 | # For categorical columns, handle imputation separately 364 | self.categorical_column_imputation_mapper = { 
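# every categorical column defaults to categorical_imputation_method here; per-column overrides from column_imputation_map are applied to the combined mapper below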
365 | col: self.categorical_imputation_method for col in self.categorical_columns 366 | } 367 | 368 | # Combine both mappers to have a comprehensive imputation mapping 369 | self.column_imputation_mapper = {**self.numerical_column_imputation_mapper, 370 | **self.categorical_column_imputation_mapper} 371 | 372 | # Update the mappers with any custom map provided 373 | if self.column_imputation_map: 374 | self.column_imputation_mapper.update(self.column_imputation_map) 375 | 376 | # Initialize encoding method mapper with default value and update with custom map 377 | self.encoding_method_mapper = {col: self.encoding_method for col in self.categorical_columns} 378 | if self.encoding_method_map: 379 | self.encoding_method_mapper.update(self.encoding_method_map) 380 | 381 | if self.ordinal_encode_map: 382 | for col in self.ordinal_encode_map.keys(): 383 | if col in self.encoding_method_mapper: 384 | self.encoding_method_mapper[col] = 'ordinal_encoder' 385 | 386 | # Initialize numerical normalization map 387 | if self.normalize: 388 | self.normalization_map = { 389 | col: self.normalize for col in self.numerical_columns 390 | } 391 | 392 | 393 | pipeline_steps = [] 394 | 395 | # Add drop_columns step if drop_columns is not empty 396 | if self.drop_columns: 397 | pipeline_steps.append(("drop_columns", ColumnDropper(drop_columns=self.drop_columns))) 398 | 399 | # Add imputer step 400 | pipeline_steps.append( 401 | ("imputer", ColumnImputer( 402 | self.column_imputation_mapper, 403 | self.numerical_imputation_constant, 404 | self.categorical_imputation_constant 405 | ) 406 | ) 407 | ) 408 | 409 | # Add normalization step if not None 410 | if self.normalize: 411 | pipeline_steps.append(("normalizer", NumericalNormalizer(self.normalization_map))) 412 | 413 | # Add encoding step 414 | pipeline_steps.append(("encoder", CategoricalEncoder( 415 | self.encoding_method_mapper, 416 | self.ordinal_encode_map, 417 | onehot_limit=self.onehot_limit 418 | ))) 419 | 420 | # Create the pipeline 421 | self.pipeline = Pipeline(pipeline_steps, memory=None) 422 | 423 | def check_column_anomalies(self, threshold: float = 0.5): 424 | """ 425 | Identifies columns that are likely to be ID columns or have too many unique values 426 | 427 | Parameters 428 | ---------- 429 | threshold : float 430 | Threshold for the ratio (default is 0.5, e.g., 50%) 431 | """ 432 | 433 | id_columns = self._id_finder() 434 | if id_columns: 435 | for column in id_columns: 436 | if column not in self.drop_columns: 437 | self.logger.warning(f"Column '{column}' seems like an ID column. Consider dropping it via 'drop_columns' parameter if it is not a feature") 438 | 439 | columns_to_consider = self._anomaly_unique_values_finder(threshold=threshold) 440 | if columns_to_consider: 441 | for column, ratio in columns_to_consider.items(): 442 | self.logger.warning( 443 | f"Column '{column}' has too many unique values ({ratio:.2%}). " 444 | "Recommended to either process or drop this column via 'drop_columns'" 445 | ) 446 | 447 | # Find the columns that exceeds one_hot_limit 448 | columns_exceeding_limit = self._anomaly_onehot_limit_finder() 449 | # remove columns_to_consider from columns_exceeding_limit to avoid duplicate warnings 450 | columns_exceeding_limit = {k: v for k, v in columns_exceeding_limit.items() if k not in columns_to_consider} 451 | if columns_exceeding_limit: 452 | for column, count in columns_exceeding_limit.items(): 453 | self.logger.warning( 454 | f"Column '{column}' has {count} unique values. 
" 455 | "Consider operations like increasing value of 'onehot_limit', " 456 | "changing the encoding method or processing the column" 457 | ) 458 | 459 | def _id_finder(self) -> list: 460 | """ 461 | Identifies potential ID columns by checking if values in the first 100 rows 462 | match their respective index values 463 | 464 | Returns 465 | ------- 466 | list 467 | List of column names that could be ID columns 468 | """ 469 | potential_ids = [] 470 | 471 | for column in self.data.columns: 472 | # Check if the first 100 rows match the index values 473 | if (self.data[column].iloc[:100] == self.data.index[:100]).all(): 474 | potential_ids.append(column) 475 | 476 | return potential_ids 477 | 478 | def _anomaly_unique_values_finder(self, threshold: float = 0.5) -> dict: 479 | """ 480 | Identifies categorical columns where the ratio of unique values to non-null rows 481 | exceeds the given threshold 482 | 483 | Parameters 484 | ---------- 485 | threshold : float 486 | Threshold for the ratio (default is 0.5, e.g., 50%) 487 | 488 | Returns 489 | ------- 490 | dict 491 | Dictionary of column names and their unique value ratios 492 | """ 493 | columns_above_threshold = {} 494 | 495 | for column in self.categorical_columns: 496 | # Calculate the ratio using non-null data 497 | non_null_count = self.data[column].notnull().sum() 498 | if non_null_count > 0: # Avoid division by zero 499 | unique_ratio = self.data[column].nunique() / non_null_count 500 | if unique_ratio > threshold: 501 | columns_above_threshold[column] = unique_ratio 502 | 503 | return columns_above_threshold 504 | 505 | def _anomaly_onehot_limit_finder(self) -> dict: 506 | """ 507 | Identifies categorical columns where the number of unique values exceeds the one_hot_limit 508 | 509 | Returns 510 | ------- 511 | dict 512 | Dictionary of column names and their unique value counts 513 | """ 514 | columns_above_threshold = {} 515 | 516 | for column in self.categorical_columns: 517 | if self.data[column].nunique() > self.onehot_limit: 518 | columns_above_threshold[column] = self.data[column].nunique() 519 | 520 | return columns_above_threshold 521 | 522 | def fit_transform(self) -> pd.DataFrame: 523 | """ 524 | Perform feature engineering on the training data 525 | 526 | Processes features and the target column by: 527 | - Dropping specified columns from the data 528 | - Imputing missing values for numerical and categorical columns 529 | - Encoding categorical features 530 | - Encoding the target column if it is categorical 531 | - Normalizing numerical columns if specified 532 | 533 | Returns 534 | ------- 535 | pd.DataFrame 536 | A DataFrame containing the processed features and target column 537 | """ 538 | # Process features 539 | processed_features = self.pipeline.fit_transform(self.feature_data) 540 | 541 | # Process if target column is categorical 542 | target_data = self.data[self.target_col] 543 | if target_data.dtype in ['object', 'category']: 544 | target_data = self.target_encoder.fit_transform(target_data) 545 | self.y_class_mapping = { # for example: {0: 'male', 1: 'female'} 546 | i: label for i, label in enumerate(self.target_encoder.classes_) 547 | } 548 | processed_features[self.target_col] = target_data 549 | 550 | return processed_features.drop(self.target_col, axis=1), processed_features[self.target_col] 551 | 552 | def transform(self, test_data: pd.DataFrame, y_included: bool = False) -> pd.DataFrame: 553 | """ 554 | Perform feature engineering on test data using the fitted pipeline 555 | 556 | Processes features 
551 | 
552 |     def transform(self, test_data: pd.DataFrame, y_included: bool = False) -> pd.DataFrame:
553 |         """
554 |         Perform feature engineering on test data using the fitted pipeline
555 | 
556 |         Processes features by:
557 |         - Imputing missing values for numerical and categorical columns
558 |         - Encoding categorical features
559 |         - Normalizing numerical columns if specified
560 | 
561 |         Parameters
562 |         ----------
563 |         test_data : pd.DataFrame
564 |             The test dataset to process
565 | 
566 |         y_included : bool, default=False
567 |             Whether the target column is included in the test data; if True, the target column is transformed and returned as well
568 | 
569 |         Returns
570 |         -------
571 |         pd.DataFrame
572 |             A DataFrame containing the processed test features, or a (features, target) tuple when y_included is True
573 |         """
574 |         if y_included:
575 |             test_features = test_data
576 |         else:
577 |             test_features = test_data.drop(columns=[self.target_col], errors='ignore')
578 | 
579 |         processed_test_features = self.pipeline.transform(test_features)
580 | 
581 |         # Add target column if it exists in test data
582 |         if self.target_col in test_data.columns:
583 |             target_data = test_data[self.target_col]
584 |             if target_data.dtype in ['object', 'category']:
585 |                 target_data = self.target_encoder.transform(target_data)
586 |             processed_test_features[self.target_col] = target_data
587 | 
588 |         if not y_included:
589 |             return processed_test_features
590 |         else:
591 |             return processed_test_features.drop(self.target_col, axis=1), processed_test_features[self.target_col]
--------------------------------------------------------------------------------
/flexml/_model_tuner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import optuna
4 | import joblib
5 | from joblib.parallel import BatchCompletionCallBack
6 | from contextlib import contextmanager
7 | from typing import Optional, Union
8 | from time import time
9 | from sklearn.model_selection import ParameterGrid, GridSearchCV, RandomizedSearchCV
10 | from sklearn.pipeline import Pipeline
11 | from sklearn.base import clone
12 | from flexml.config import TUNING_METRIC_TRANSFORMATIONS
13 | from flexml.logger import get_logger
14 | from flexml.helpers import evaluate_model_perf
15 | from copy import deepcopy
16 | from tqdm import tqdm
17 | 
18 | 
19 | class TqdmBatchCompletionCallback(BatchCompletionCallBack):  # unused here; superseded by the local callback defined in tqdm_joblib below
20 |     def __init__(self, *args, **kwargs):
21 |         super().__init__(*args, **kwargs)
22 | 
23 |     def __call__(self, *args, **kwargs):
24 |         self.tqdm_object.update(n=self.batch_size)  # NOTE: relies on a tqdm_object attribute that is never set on this class
25 |         return super().__call__(*args, **kwargs)
26 | 
27 | @contextmanager
28 | def tqdm_joblib(tqdm_object):
29 |     """Context manager to patch joblib to report into tqdm progress bar"""
30 |     class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
31 |         def __init__(self, *args, **kwargs):
32 |             super().__init__(*args, **kwargs)
33 | 
34 |         def __call__(self, *args, **kwargs):
35 |             tqdm_object.update(n=self.batch_size)
36 |             return super().__call__(*args, **kwargs)
37 | 
38 |     old_batch_callback = joblib.parallel.BatchCompletionCallBack
39 |     joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
40 |     try:
41 |         yield tqdm_object
42 |     finally:
43 |         joblib.parallel.BatchCompletionCallBack = old_batch_callback
44 |         tqdm_object.close()
45 | 
46 | 
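`tqdm_joblib` temporarily swaps joblib's batch-completion callback so that any joblib-backed work (such as the scikit-learn searches below) advances a tqdm bar. A minimal usage sketch, assuming `tqdm_joblib` is imported from this module:

```python
import joblib
from math import sqrt
from tqdm import tqdm

# The patch is active only inside the 'with' block; the original callback
# is restored (and the bar closed) on exit, even if an error is raised.
with tqdm_joblib(tqdm(total=100, desc="demo")):
    results = joblib.Parallel(n_jobs=2)(
        joblib.delayed(sqrt)(i ** 2) for i in range(100)
    )
```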
47 | class ModelTuner:
48 |     """
49 |     Implements hyperparameter tuning on machine learning models with one of the following tuning methods:
50 | 
51 |     * 'grid_search' for GridSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
52 |     Note that GridSearch optimization may take too long to finish since it tries all possible combinations of the parameters
53 | 
54 |     * 'randomized_search' for RandomizedSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
55 | 
56 |     * 'optuna' for Optuna (https://optuna.readthedocs.io/en/stable/)
57 | 
58 |     Parameters
59 |     ----------
60 |     ml_problem_type : str
61 |         The type of the machine learning problem. It can be one of the following:
62 | 
63 |         * 'Classification' for classification problems
64 | 
65 |         * 'Regression' for regression problems
66 | 
67 |     logging_to_file : bool, optional (default=False)
68 |         If True, the logs will be saved to a file in the current path, located at /logs/flexml_logs.log; otherwise, they will not be saved
69 |     """
70 |     def __init__(
71 |         self,
72 |         ml_problem_type: str,
73 |         X: Union[pd.DataFrame, np.ndarray],
74 |         y: Union[pd.DataFrame, np.ndarray],
75 |         logging_to_file: bool = False
76 |     ):
77 |         """
78 |         Parameters
79 |         ----------
80 |         ml_problem_type : str
81 |             Type of the ML problem ('Classification' or 'Regression')
82 | 
83 |         X : pd.DataFrame or np.ndarray
84 |             The feature values of the dataset
85 | 
86 |         y : pd.DataFrame or np.ndarray
87 |             The target values of the dataset
88 | 
89 |         logging_to_file : bool, optional (default=False)
90 |             Whether to log to a file
91 |         """
92 |         self.ml_problem_type = ml_problem_type.lower().capitalize()  # Normalize case
93 |         self.X = X
94 |         self.y = y
95 | 
96 |         self.logger = get_logger(__name__, "PROD", logging_to_file)
97 | 
98 |         self.eval_metrics_in_tuning_format = TUNING_METRIC_TRANSFORMATIONS.get(self.ml_problem_type)
99 |         self.reverse_signed_eval_metrics = TUNING_METRIC_TRANSFORMATIONS.get("reverse_signed_eval_metrics")
100 | 
101 |         # Revise classification metrics for multi-class classification
102 |         if self.ml_problem_type == "Classification" and self.y.nunique() > 2:
103 |             self.eval_metrics_in_tuning_format['ROC-AUC'] = 'roc_auc_ovr'
104 |             self.eval_metrics_in_tuning_format['Precision'] = 'precision_macro'
105 |             self.eval_metrics_in_tuning_format['Recall'] = 'recall_macro'
106 |             self.eval_metrics_in_tuning_format['F1 Score'] = 'f1_macro'
107 | 
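For reference, the tuning-format mapping resolved above pairs FlexML metric names with scikit-learn scorer strings. An illustrative (not exhaustive) sketch of what the multi-class override produces; the binary-case values here are assumptions, and the actual contents live in `TUNING_METRIC_TRANSFORMATIONS`:

```python
# Illustrative binary-classification scorers (assumed values)...
eval_metrics_in_tuning_format = {
    "Accuracy": "accuracy",
    "ROC-AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "F1 Score": "f1",
}
# ...which the multi-class branch above rewrites to one-vs-rest / macro scorers:
eval_metrics_in_tuning_format.update({
    "ROC-AUC": "roc_auc_ovr",
    "Precision": "precision_macro",
    "Recall": "recall_macro",
    "F1 Score": "f1_macro",
})
```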
108 |     def _param_grid_validator(
109 |         self,
110 |         model_available_params: dict,
111 |         param_grid: dict,
112 |         prefix_param_grid_flag: bool = True
113 |     ) -> dict:
114 |         """
115 |         Validates the param_grid dictionary for the model
116 | 
117 |         Parameters
118 |         ----------
119 |         model_available_params : dict
120 |             All params that the model has
121 | 
122 |         param_grid : dict
123 |             The dictionary that contains the hyperparameters and their possible values
124 | 
125 |         prefix_param_grid_flag : bool
126 |             Indicates whether param_grid keys will be modified to suit a Pipeline object by adding a 'model__' prefix to the beginning of each key
127 |         """
128 |         param_amount = len(param_grid)
129 |         if param_amount == 0:
130 |             error_msg = "Error while validating the param_grid for the model. The param_grid should not be empty"
131 |             self.logger.error(error_msg)
132 |             raise ValueError(error_msg)
133 | 
134 |         if prefix_param_grid_flag:
135 |             param_grid = {f"model__{key}": value for key, value in param_grid.items()}
136 | 
137 |         # Check that all params in param_grid are available in the model's params
138 |         for param_name in param_grid.keys():
139 |             if param_name not in model_available_params:
140 |                 error_msg = f"Error while validating the param_grid for the model. The '{param_name}' parameter is not available in the model's available params.\n \
141 |                     Available params: {list(model_available_params)}"
142 |                 self.logger.error(error_msg)
143 |                 raise ValueError(error_msg)
144 | 
145 |         return param_grid
146 | 
147 |     def _setup_tuning(
148 |         self,
149 |         tuning_method: str,
150 |         model: Union[object, Pipeline],
151 |         param_grid: dict,
152 |         n_iter: Optional[int] = None,
153 |         n_jobs: int = -1,
154 |         prefix_param_grid_flag: bool = True
155 |     ):
156 |         """
157 |         Sets up the tuning process by creating the model_stats dictionary
158 | 
159 |         Parameters
160 |         ----------
161 |         tuning_method : str
162 |             The tuning method that will be used for the optimization. It can be one of the following:
163 | 
164 |             * 'grid_search' for GridSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
165 | 
166 |             * 'randomized_search' for RandomizedSearchCV (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
167 | 
168 |             * 'optuna' for Optuna (https://optuna.readthedocs.io/en/stable/)
169 | 
170 |         model : object or Pipeline
171 |             The model or Pipeline object that will be used for tuning
172 | 
173 |         n_iter : int, optional (default=None)
174 |             The number of iterations, if the chosen tuning method uses one
175 | 
176 |         n_jobs : int (default=-1)
177 |             The number of parallel jobs to run. The default is -1.
178 | 
179 |         prefix_param_grid_flag : bool
180 |             Indicates whether param_grid keys will be modified to suit a Pipeline object by adding a 'model__' prefix to the beginning of each key
181 | 
182 |         Returns
183 |         -------
184 |         model_stats: dict
185 |             Dictionary including tuning information and model:
186 | 
187 |             * 'tuning_method': The tuning method that is used for the optimization
188 | 
189 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
190 | 
191 |             * 'n_iter': The number of iterations
192 | 
193 |             * 'n_jobs': The number of parallel jobs to run
194 | 
195 |             * 'tuned_model': The tuned model object
196 | 
197 |             * 'tuned_model_score': The evaluation metric score of the tuned model
198 | 
199 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
200 |         """
201 |         model_params = None
202 | 
203 |         if isinstance(model, Pipeline):
204 |             model = model.named_steps['model']
205 | 
206 |         if "CatBoost" in model.__class__.__name__:
207 |             model_params = model.get_all_params()
208 |         else:
209 |             model_params = model.get_params()
210 | 
211 |         if prefix_param_grid_flag:
212 |             model_params = {f"model__{key}": value for key, value in model_params.items()}
213 | 
214 |         param_grid = self._param_grid_validator(
215 |             model_available_params=model_params,
216 |             param_grid=param_grid,
217 |             prefix_param_grid_flag=prefix_param_grid_flag
218 |         )
219 | 
220 |         model_stats = {
221 |             "tuning_method": tuning_method,
222 |             "tuning_param_grid": param_grid,
223 |             "n_iter": n_iter,
224 |             "n_jobs": n_jobs,
225 |             "tuned_model": None,
226 |             "tuned_model_score": None,
227 |             "tuned_model_evaluation_metric": None
228 |         }
229 | 
230 |         return model_stats
231 | 
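The `model__` prefixing above follows scikit-learn's Pipeline convention: parameters of a named step are addressed as `<step_name>__<param>`. A small sketch of the transformation applied to a user-supplied grid (the grid values are illustrative):

```python
raw_grid = {"max_depth": [3, 5, 7], "n_estimators": [100, 200]}

# With prefix_param_grid_flag=True the keys are rewritten so that
# GridSearchCV / RandomizedSearchCV can route them to the 'model' step:
prefixed = {f"model__{key}": value for key, value in raw_grid.items()}
# {'model__max_depth': [3, 5, 7], 'model__n_estimators': [100, 200]}
```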
232 |     def grid_search(
233 |         self,
234 |         pipeline: Pipeline,
235 |         param_grid: dict,
236 |         eval_metric: str,
237 |         cv: list,
238 |         n_jobs: int = -1,
239 |         verbose: int = 0
240 |     ) -> Optional[dict]:
241 |         """
242 |         Implements grid search hyperparameter optimization on the given machine learning model
243 | 
244 |         Parameters
245 |         ----------
246 |         pipeline : Pipeline
247 |             The pipeline object that includes the feature engineering steps and the model to be tuned
248 | 
249 |         param_grid : dict
250 |             The dictionary that contains the hyperparameters and their possible values
251 | 
252 |         eval_metric : str
253 |             The evaluation metric that will be used to evaluate the model. It can be one of the following:
254 | 
255 |             * 'R2' for R^2 score
256 | 
257 |             * 'MAE' for Mean Absolute Error
258 | 
259 |             * 'MSE' for Mean Squared Error
260 | 
261 |             * 'RMSE' for Root Mean Squared Error
262 | 
263 |             * 'MAPE' for Mean Absolute Percentage Error
264 | 
265 |             * 'Accuracy' for Accuracy
266 | 
267 |             * 'Precision' for Precision
268 | 
269 |             * 'Recall' for Recall
270 | 
271 |             * 'F1 Score' for F1 score
272 | 
273 |         cv : list of tuples
274 |             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
275 |             for the training and test sets for that fold. For example:
276 |             [(array([1,2,4,...]), array([0,3,6,...])), ...]
277 | 
278 |         n_jobs : int (default=-1)
279 |             The number of parallel jobs to run. The default is -1.
280 | 
281 |         verbose: int (default = 0)
282 |             The verbosity level of the tuning process. If set to 0, no logs will be shown during the tuning process; otherwise, logs are shown according to the value of the verbose parameter:
283 | 
284 |             * 1 : the computation time for each fold and parameter candidate is displayed
285 | 
286 |             * 2 : the score is also displayed
287 | 
288 |             * 3 : the fold and candidate parameter indexes are also displayed together with the starting time of the computation
289 | 
290 |         Returns
291 |         -------
292 |         model_stats: dict
293 |             Dictionary including tuning information and model:
294 | 
295 |             * 'tuning_method': The tuning method that is used for the optimization
296 | 
297 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
298 | 
299 |             * 'n_iter': The number of iterations (None for grid search, since every combination is tried)
300 | 
301 |             * 'n_jobs': The number of parallel jobs to run
302 | 
303 |             * 'tuned_model': The tuned model object
304 | 
305 |             * 'tuned_model_score': The evaluation metric score of the tuned model
306 | 
307 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
308 |         """
309 |         model_stats = self._setup_tuning("GridSearchCV", pipeline, param_grid, n_iter=None, n_jobs=n_jobs)
310 |         param_grid = model_stats['tuning_param_grid']
311 | 
312 |         try:
313 |             t_start = time()
314 | 
315 |             # Calculate total fits
316 |             total_params = len(ParameterGrid(param_grid))
317 |             n_splits = len(cv)
318 |             total_fits = total_params * n_splits
319 | 
320 |             # Create GridSearchCV object
321 |             search = GridSearchCV(
322 |                 pipeline,
323 |                 param_grid,
324 |                 scoring=self.eval_metrics_in_tuning_format,
325 |                 refit=eval_metric,
326 |                 cv=cv,
327 |                 n_jobs=n_jobs,
328 |                 verbose=verbose
329 |             )
330 | 
331 |             # Fit with progress bar
332 |             with tqdm_joblib(tqdm(
333 |                 total=total_fits,
334 |                 desc="INFO | Grid Search Progress",
335 |                 bar_format="{desc}: |{bar}| {percentage:.0f}%"
336 |             )):
337 |                 search_result = search.fit(self.X, self.y)
338 | 
339 |             t_end = time()
340 |             time_taken = round(t_end - t_start, 2)
341 | 
342 |             scores = {
343 |                 metric: (
344 |                     -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
345 |                     if metric in self.reverse_signed_eval_metrics else
346 |                     search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
347 |                 )
348 |                 for metric in list(self.eval_metrics_in_tuning_format.keys())
349 |             }
350 | 
351 |             model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model']
352 |             mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_]
353 |             model_stats['tuned_model_score'] = round(mean_score, 6)
354 |             model_stats['model_perf'] = scores
355 |             model_stats['time_taken_sec'] = time_taken
356 |             model_stats['tuned_model_evaluation_metric'] = eval_metric
357 |             return model_stats
358 |         except Exception as e:
359 |             self.logger.error(f"Error while tuning the model with GridSearchCV, Error: {e}")
360 |             return None
361 | 
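A hedged end-to-end sketch of calling this method directly; the pipeline construction, data loading, and CV splitting shown here are assumptions (FlexML builds these internally via its Regression/Classification experiments):

```python
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

# Assumed inputs: X (pd.DataFrame) and y (pd.Series) already loaded
tuner = ModelTuner("Regression", X, y)
pipeline = Pipeline([("model", DecisionTreeRegressor())])
cv = list(KFold(n_splits=5, shuffle=True, random_state=42).split(X, y))

# Keys are given unprefixed; _setup_tuning adds the 'model__' prefix
stats = tuner.grid_search(
    pipeline,
    param_grid={"max_depth": [3, 5, 7]},
    eval_metric="R2",
    cv=cv,
)
if stats is not None:
    print(stats["tuned_model"], stats["tuned_model_score"])
```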
362 |     def randomized_search(
363 |         self,
364 |         pipeline: Pipeline,
365 |         param_grid: dict,
366 |         eval_metric: str,
367 |         cv: list,
368 |         n_iter: int = 10,
369 |         n_jobs: int = -1,
370 |         verbose: int = 0
371 |     ) -> Optional[dict]:
372 |         """
373 |         Implements randomized search hyperparameter optimization on the given machine learning model
374 | 
375 |         Parameters
376 |         ----------
377 |         pipeline : Pipeline
378 |             The pipeline object that includes the feature engineering steps and the model to be tuned
379 | 
380 |         param_grid : dict
381 |             The dictionary that contains the hyperparameters and their possible values
382 | 
383 |         eval_metric : str
384 |             The evaluation metric that will be used to evaluate the model. It can be one of the following:
385 | 
386 |             * 'R2' for R^2 score
387 | 
388 |             * 'MAE' for Mean Absolute Error
389 | 
390 |             * 'MSE' for Mean Squared Error
391 | 
392 |             * 'RMSE' for Root Mean Squared Error
393 | 
394 |             * 'MAPE' for Mean Absolute Percentage Error
395 | 
396 |             * 'Accuracy' for Accuracy
397 | 
398 |             * 'Precision' for Precision
399 | 
400 |             * 'Recall' for Recall
401 | 
402 |             * 'F1 Score' for F1 score
403 | 
404 |         cv : list of tuples
405 |             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
406 |             for the training and test sets for that fold. For example:
407 |             [(array([1,2,4,...]), array([0,3,6,...])), ...]
408 | 
409 |         n_iter : int, optional (default=10)
410 |             The number of trials. The default is 10
411 | 
412 |         n_jobs : int (default=-1)
413 |             The number of parallel jobs to run. The default is -1
414 | 
415 |         Returns
416 |         -------
417 |         model_stats: dict
418 |             Dictionary including tuning information and model:
419 | 
420 |             * 'tuning_method': The tuning method that is used for the optimization
421 | 
422 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
423 | 
424 |             * 'n_jobs': The number of parallel jobs to run
425 | 
426 |             * 'tuned_model': The tuned model object
427 | 
428 |             * 'tuned_model_score': The evaluation metric score of the tuned model
429 | 
430 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
431 |         """
432 |         model_stats = self._setup_tuning("randomized_search", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs)
433 |         param_grid = model_stats['tuning_param_grid']
434 | 
435 |         t_start = time()
436 | 
437 |         # Calculate total fits
438 |         n_splits = len(cv)
439 |         total_fits = n_iter * n_splits
440 | 
441 |         # Create RandomizedSearchCV object
442 |         search = RandomizedSearchCV(
443 |             estimator=pipeline,
444 |             param_distributions=param_grid,
445 |             n_iter=n_iter,
446 |             scoring=self.eval_metrics_in_tuning_format,
447 |             refit=eval_metric,
448 |             cv=cv,
449 |             n_jobs=n_jobs,
450 |             verbose=verbose
451 |         )
452 | 
453 |         # Fit with progress bar
454 |         with tqdm_joblib(tqdm(
455 |             total=total_fits,
456 |             desc="INFO | Randomized Search Progress",
457 |             bar_format="{desc}: |{bar}| {percentage:.0f}%"
458 |         )):
459 |             search_result = search.fit(self.X, self.y)
460 | 
461 |         t_end = time()
462 |         time_taken = round(t_end - t_start, 2)
463 | 
464 |         scores = {
465 |             metric: (
466 |                 -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
467 |                 if metric in self.reverse_signed_eval_metrics else
468 |                 search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
469 |             )
470 |             for metric in list(self.eval_metrics_in_tuning_format.keys())
471 |         }
472 | 
473 |         model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model']
474 |         mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_]
475 |         model_stats['tuned_model_score'] = round(mean_score, 6)
476 |         model_stats['model_perf'] = scores
477 |         model_stats['time_taken_sec'] = time_taken
478 |         model_stats['tuned_model_evaluation_metric'] = eval_metric
479 |         return model_stats
480 | 
481 | 
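The negation in the `scores` dict above exists because scikit-learn always maximizes, so error metrics are exposed as negated scorers (e.g. `'neg_mean_squared_error'`); flipping the sign for the metrics listed in `reverse_signed_eval_metrics` recovers the raw error values. A sketch of the convention (the metric names and values here are illustrative):

```python
# cv_results_-style means as sklearn reports them (errors come back negated):
mean_test = {"R2": 0.87, "MSE": -14.2, "RMSE": -3.77}
reverse_signed = {"MAE", "MSE", "RMSE", "MAPE"}   # illustrative set

raw_scores = {
    metric: -value if metric in reverse_signed else value
    for metric, value in mean_test.items()
}
# {'R2': 0.87, 'MSE': 14.2, 'RMSE': 3.77}
```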
482 |     def optuna_search(
483 |         self,
484 |         pipeline: Pipeline,
485 |         param_grid: dict,
486 |         eval_metric: str,
487 |         cv: list,
488 |         n_iter: int = 10,
489 |         timeout: Optional[int] = None,
490 |         n_jobs: int = -1,
491 |         verbose: int = 0
492 |     ) -> Optional[dict]:
493 |         """
494 |         Implements Optuna hyperparameter optimization on the given machine learning model
495 | 
496 |         Parameters
497 |         ----------
498 |         pipeline : Pipeline
499 |             The pipeline object that includes the feature engineering steps and the model to be tuned
500 | 
501 |         param_grid : dict
502 |             The dictionary that contains the hyperparameters and their possible values
503 | 
504 |         eval_metric : str
505 |             The evaluation metric that will be used to evaluate the model. It can be one of the following:
506 | 
507 |             * 'R2' for R^2 score
508 | 
509 |             * 'MAE' for Mean Absolute Error
510 | 
511 |             * 'MSE' for Mean Squared Error
512 | 
513 |             * 'RMSE' for Root Mean Squared Error
514 | 
515 |             * 'MAPE' for Mean Absolute Percentage Error
516 | 
517 |             * 'Accuracy' for Accuracy
518 | 
519 |             * 'Precision' for Precision
520 | 
521 |             * 'Recall' for Recall
522 | 
523 |             * 'F1 Score' for F1 score
524 | 
525 |         cv : list of tuples
526 |             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
527 |             for the training and test sets for that fold. For example:
528 |             [(array([1,2,4,...]), array([0,3,6,...])), ...]
529 | 
530 |         n_iter : int, optional (default=10)
531 |             The number of trials. The default is 10
532 | 
533 |         timeout : int, optional (default=None)
534 |             The timeout in seconds. The default is None
535 | 
536 |         n_jobs : int, optional (default=-1)
537 |             The number of parallel jobs to run. The default is -1
538 | 
539 |         verbose: int (default = 0)
540 |             The verbosity level of the tuning process. If set to 0, no logs will be shown during the tuning process; otherwise, logs are shown according to the value of the verbose parameter:
541 | 
542 |             * DEBUG (equal to 4): Most detailed logging (prints almost everything)
543 | 
544 |             * INFO (equal to 3): Standard informational output
545 | 
546 |             * WARNING (equal to 2): Only warnings and errors
547 | 
548 |             * ERROR (equal to 1): Only error messages
549 | 
550 |             * CRITICAL (equal to 0): Only critical errors
551 | 
552 |         Returns
553 |         -------
554 |         model_stats: dict
555 |             Dictionary including tuning information and model:
556 | 
557 |             * 'tuning_method': The tuning method that is used for the optimization
558 | 
559 |             * 'tuning_param_grid': The hyperparameter grid that is used for the optimization
560 | 
561 |             * 'n_iter': The number of trials
562 | 
563 |             * 'n_jobs': The number of parallel jobs to run
564 | 
565 |             * 'tuned_model': The tuned model object
566 | 
567 |             * 'tuned_model_score': The evaluation metric score of the tuned model
568 | 
569 |             * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model
570 |         """
571 |         model_stats = self._setup_tuning("optuna", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs, prefix_param_grid_flag=False)
572 |         param_grid = model_stats['tuning_param_grid']
573 | 
574 |         # Set verbosity levels
575 |         if verbose == 0:
576 |             optuna.logging.set_verbosity(optuna.logging.CRITICAL)
577 |         elif verbose == 1:
578 |             optuna.logging.set_verbosity(optuna.logging.ERROR)
579 |         elif verbose == 2:
580 |             optuna.logging.set_verbosity(optuna.logging.WARNING)
581 |         elif verbose == 3:
582 |             optuna.logging.set_verbosity(optuna.logging.INFO)
583 |         elif verbose == 4:
584 |             optuna.logging.set_verbosity(optuna.logging.DEBUG)
585 | 
586 |         study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'] else "minimize"
587 | 
588 |         def objective(trial):
589 |             # Generate parameters for the trial
590 |             params = pipeline.named_steps['model'].get_params()
591 |             for param_name, param_values in param_grid.items():
592 |                 first_element = param_values[0]  # the first value's type decides which Optuna suggest_* call to use
593 | 
594 |                 if isinstance(first_element, (str, bool)):
595 |                     params[param_name] = trial.suggest_categorical(param_name, param_values)
596 |                 elif isinstance(first_element, int):
597 |                     # Numeric grids are treated as [low, high] ranges; assumes param_values is sorted ascending
598 |                     params[param_name] = trial.suggest_int(param_name, param_values[0], param_values[-1])
599 |                 elif isinstance(first_element, float):
600 |                     params[param_name] = trial.suggest_float(param_name, param_values[0], param_values[-1])
601 |                 else:
602 |                     info_msg = f"{param_name} parameter is not added to tuning since its type is not supported by Optuna."
603 |                     self.logger.info(info_msg)
604 | 
605 |             # Clone the entire pipeline and its steps to avoid shared state between trials
606 |             preprocessing_steps = [(name, clone(step)) for name, step in pipeline.steps[:-1]]
607 |             new_pipeline = Pipeline(
608 |                 steps=preprocessing_steps + [
609 |                     ('model', clone(pipeline.named_steps['model']).set_params(**params))
610 |                 ]
611 |             )
612 | 
613 |             # Perform cross-validation and calculate the score
614 |             scores = []
615 |             for train_idx, test_idx in cv:
616 |                 X_train, X_test = self.X.iloc[train_idx], self.X.iloc[test_idx]
617 |                 y_train, y_test = self.y.iloc[train_idx], self.y.iloc[test_idx]
618 | 
619 |                 new_pipeline.fit(X_train, y_train)
620 | 
621 |                 if self.ml_problem_type == "Classification" and hasattr(new_pipeline, 'predict_proba'):
622 |                     y_pred = new_pipeline.predict_proba(X_test)
623 |                 else:
624 |                     y_pred = new_pipeline.predict(X_test)
625 | 
626 |                 # Evaluate performance
627 |                 scores.append(evaluate_model_perf(self.ml_problem_type, y_test, y_pred))
628 | 
629 |             # Calculate the mean score across all folds
630 |             avg_metrics = {k: np.mean([m[k] if m[k] is not None else -1 for m in scores]) for k in scores[0]}
631 |             mean_score = avg_metrics.get(eval_metric, float('inf'))
632 | 
633 |             # Update the best score and model
634 |             if model_stats['tuned_model_score'] is None or (study_direction == "maximize" and mean_score > model_stats['tuned_model_score']) or (study_direction == "minimize" and mean_score < model_stats['tuned_model_score']):
635 |                 model_stats['tuned_model_score'] = round(mean_score, 6)
636 |                 model_stats['tuned_model'] = new_pipeline.named_steps['model']
637 |                 model_stats['model_perf'] = avg_metrics
638 | 
639 |             return mean_score
640 | 
641 |         try:
642 |             # Perform Optuna optimization
643 |             t_start = time()
644 |             study = optuna.create_study(direction=study_direction)
645 |             study.optimize(objective, n_trials=n_iter, timeout=timeout, n_jobs=n_jobs, show_progress_bar=True)
646 |             t_end = time()
647 | 
648 |             # Update model stats
649 |             model_stats['time_taken_sec'] = round(t_end - t_start, 2)
650 |             return model_stats
651 | 
652 |         except Exception as e:
653 |             self.logger.error(f"Error while tuning the model with Optuna, Error: {e}")
654 |             return None
--------------------------------------------------------------------------------
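Unlike the two scikit-learn searches above, optuna_search drives cross-validation manually inside `objective` and tracks the best pipeline itself, so it expects pandas inputs (note the `.iloc` indexing) and unprefixed parameter names (`prefix_param_grid_flag=False`). A hedged usage sketch, reusing the assumed `tuner`, `pipeline`, and `cv` from the earlier grid-search example:

```python
# Hypothetical usage; numeric two-element lists act as [low, high] bounds
stats = tuner.optuna_search(
    pipeline,
    param_grid={"max_depth": [3, 10]},   # int bounds -> trial.suggest_int
    eval_metric="R2",
    cv=cv,
    n_iter=25,       # number of Optuna trials
    timeout=120,     # stop after 120 seconds even if trials remain
)
if stats is not None:
    print(stats["tuned_model_score"], stats["time_taken_sec"])
```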