├── .github └── workflows │ ├── run-tests.yml │ └── test-installation-with-conda.yml ├── .gitignore ├── CITATION ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── notebooks │ ├── Titanic.ipynb │ └── basic_run.ipynb └── scripts │ ├── binary_classifier.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── multi_class_classifier.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_drug_fairness.py │ ├── regression.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ └── tabular_mar_2021.py ├── pytest.ini ├── requirements.txt ├── requirements_dev.txt ├── setup.py ├── supervised ├── __init__.py ├── algorithms │ ├── __init__.py │ ├── algorithm.py │ ├── baseline.py │ ├── catboost.py │ ├── decision_tree.py │ ├── extra_trees.py │ ├── factory.py │ ├── knn.py │ ├── lightgbm.py │ ├── linear.py │ ├── nn.py │ ├── random_forest.py │ ├── registry.py │ ├── sklearn.py │ └── xgboost.py ├── automl.py ├── base_automl.py ├── callbacks │ ├── __init__.py │ ├── callback.py │ ├── callback_list.py │ ├── early_stopping.py │ ├── learner_time_constraint.py │ ├── max_iters_constraint.py │ ├── metric_logger.py │ ├── terminate_on_nan.py │ └── total_time_constraint.py ├── ensemble.py ├── exceptions.py ├── fairness │ ├── __init__.py │ ├── metrics.py │ ├── optimization.py │ ├── plots.py │ ├── report.py │ └── utils.py ├── model_framework.py ├── preprocessing │ ├── __init__.py │ ├── datetime_transformer.py │ ├── eda.py │ ├── encoding_selector.py │ ├── exclude_missing_target.py │ ├── goldenfeatures_transformer.py │ ├── kmeans_transformer.py │ ├── label_binarizer.py │ ├── label_encoder.py │ ├── loo_encoder.py │ ├── preprocessing.py │ ├── preprocessing_categorical.py │ ├── preprocessing_missing.py │ ├── 
preprocessing_utils.py │ ├── scale.py │ └── text_transformer.py ├── tuner │ ├── __init__.py │ ├── data_info.py │ ├── hill_climbing.py │ ├── mljar_tuner.py │ ├── optuna │ │ ├── __init__.py │ │ ├── catboost.py │ │ ├── extra_trees.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── tuner.py │ │ └── xgboost.py │ ├── preprocessing_tuner.py │ ├── random_parameters.py │ └── time_controller.py ├── utils │ ├── __init__.py │ ├── additional_metrics.py │ ├── additional_plots.py │ ├── automl_plots.py │ ├── common.py │ ├── config.py │ ├── constants.py │ ├── data_validation.py │ ├── importance.py │ ├── jsonencoder.py │ ├── leaderboard_plots.py │ ├── learning_curves.py │ ├── metric.py │ ├── shap.py │ ├── subsample.py │ └── utils.py └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── README.md ├── __init__.py ├── checks ├── __init__.py ├── check_automl_with_regression.py ├── run_ml_tests.py └── run_performance_tests.py ├── conftest.py ├── data ├── 179.csv ├── 24.csv ├── 3.csv ├── 31.csv ├── 38.csv ├── 44.csv ├── 720.csv ├── 737.csv ├── CrimeData │ ├── README.md │ ├── cities.json │ └── crimedata.csv ├── Drug │ ├── Drug_Consumption.csv │ └── README.md ├── LawSchool │ ├── README.md │ └── bar_pass_prediction.csv ├── PortugeseBankMarketing │ └── Data_FinalProject.csv ├── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── acs_income_1k.csv ├── adult_missing_values_missing_target_500rows.csv ├── boston_housing.csv ├── housing_regression_missing_values_missing_target.csv ├── iris_classes_missing_values_missing_target.csv └── iris_missing_values_missing_target.csv ├── tests_algorithms ├── __init__.py ├── test_baseline.py ├── test_catboost.py ├── test_decision_tree.py ├── test_extra_trees.py ├── test_factory.py ├── test_knn.py ├── test_lightgbm.py ├── test_linear.py ├── test_nn.py ├── test_random_forest.py ├── test_registry.py └── 
test_xgboost.py ├── tests_automl ├── __init__.py ├── test_adjust_validation.py ├── test_automl.py ├── test_automl_init.py ├── test_automl_report.py ├── test_automl_sample_weight.py ├── test_automl_time_constraints.py ├── test_data_types.py ├── test_dir_change.py ├── test_explain_levels.py ├── test_golden_features.py ├── test_handle_imbalance.py ├── test_integration.py ├── test_joblib_version.py ├── test_models_needed_for_predict.py ├── test_prediction_after_load.py ├── test_repeated_validation.py ├── test_restore.py ├── test_stack_models_constraints.py ├── test_targets.py └── test_update_errors_report.py ├── tests_callbacks ├── __init__.py └── test_total_time_constraint.py ├── tests_ensemble ├── __init__.py └── test_save_load.py ├── tests_fairness ├── __init__.py ├── test_binary_classification.py ├── test_multi_class_classification.py └── test_regression.py ├── tests_preprocessing ├── __init__.py ├── disable_eda.py ├── test_categorical_integers.py ├── test_datetime_transformer.py ├── test_encoding_selector.py ├── test_exclude_missing.py ├── test_goldenfeatures_transformer.py ├── test_label_binarizer.py ├── test_label_encoder.py ├── test_loo_encoder.py ├── test_preprocessing.py ├── test_preprocessing_missing.py ├── test_preprocessing_utils.py ├── test_scale.py └── test_text_transformer.py ├── tests_tuner ├── __init__.py ├── test_hill_climbing.py ├── test_time_controller.py └── test_tuner.py ├── tests_utils ├── __init__.py ├── test_compute_additional_metrics.py ├── test_importance.py ├── test_learning_curves.py ├── test_metric.py ├── test_shap.py └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py /.github/workflows/run-tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [ push,pull_request ] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ ubuntu-latest ] 12 | python-version: [ '3.10'] 13 | 
#os: [ ubuntu-latest, macos-latest, windows-latest ] 14 | #python-version: [ '3.8', '3.9', '3.10', '3.11' ] 15 | 16 | steps: 17 | - name: Install OS Dependencies 18 | if: matrix.os == 'ubuntu-latest' 19 | run: | 20 | sudo apt-get update 21 | sudo apt-get -y install graphviz 22 | 23 | - name: Install OS Dependencies 24 | if: matrix.os == 'macos-latest' 25 | run: | 26 | brew install graphviz 27 | 28 | - name: Install OS Dependencies 29 | if: matrix.os == 'windows-latest' 30 | run: | 31 | choco install graphviz 32 | - uses: actions/checkout@v2 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v2 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Install Python Dependencies 38 | run: | 39 | python -m pip install --upgrade pip 40 | pip install --upgrade setuptools 41 | pip install -U "importlib-metadata>=1.7.0" # quoted so the shell does not treat >= as an output redirect 42 | pip install -U -r requirements.txt 43 | pip install -U -r requirements_dev.txt 44 | pip install ipython 45 | python setup.py install 46 | - name: Test with pytest 47 | run: | 48 | pytest tests --cov=supervised/ 49 | continue-on-error: true 50 | -------------------------------------------------------------------------------- /.github/workflows/test-installation-with-conda.yml: -------------------------------------------------------------------------------- 1 | name: Test installation with conda 2 | 3 | on: 4 | schedule: 5 | - cron: '0 8 * * 1' 6 | # run workflow manually 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Run (${{ matrix.python-version }}, ${{ matrix.os }}) 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: [windows-latest] 17 | python-version: ['3.9'] 18 | 19 | steps: 20 | - uses: conda-incubator/setup-miniconda@v2 21 | with: 22 | activate-environment: test 23 | auto-update-conda: false 24 | python-version: ${{ matrix.python-version }} 25 | - name: Activate conda and check versions 26 | run: | 27 | conda activate test 28 | conda 
--version 29 | python --version 30 | - name: Install MLJAR AutoML 31 | run: conda install -c conda-forge mljar-supervised 32 | - name: Try to import 33 | run: python -c "import supervised;print(supervised.__version__)" 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | AutoML_* 2 | .vscode 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # 
Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @misc{mljar, 2 | author = {Aleksandra P\l{}o\'{n}ska and Piotr P\l{}o\'{n}ski}, 3 | year = {2021}, 4 | publisher = {MLJAR Sp. z o.o.}, 5 | address = {\L{}apy, Poland}, 6 | title = {MLJAR: State-of-the-art Automated Machine Learning Framework for Tabular Data. Version 0.10.3}, 7 | url = {https://github.com/mljar/mljar-supervised} 8 | } 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 MLJAR Sp. z o.o. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include LICENSE 3 | include README.md -------------------------------------------------------------------------------- /examples/notebooks/basic_run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import openml\n", 11 | "from sklearn.ensemble import RandomForestClassifier\n", 12 | "from supervised.automl import AutoML\n", 13 | "\n", 14 | "import os\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "import sklearn.model_selection\n", 18 | "from sklearn.metrics import log_loss, f1_score\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "dataset_id = 3\n", 28 | "df = pd.read_csv('./tests/data/{0}.csv'.format(dataset_id))\n", 29 | "x_cols = [c for c in df.columns if c != 'target']\n", 30 | "X = df[x_cols]\n", 31 | "y = df['target']" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "a = AutoML(total_time_limit=10)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "a.fit(X, y)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | " " 59 | ] 60 | } 61 | ], 62 | "metadata": { 63 | "kernelspec": { 64 | "display_name": ".venv", 65 | "language": "python", 66 | "name": ".venv" 67 | }, 68 | "language_info": { 69 | "codemirror_mode": { 70 | "name": "ipython", 71 | "version": 3 72 | }, 73 | 
"file_extension": ".py", 74 | "mimetype": "text/x-python", 75 | "name": "python", 76 | "nbconvert_exporter": "python", 77 | "pygments_lexer": "ipython3", 78 | "version": "3.6.7" 79 | } 80 | }, 81 | "nbformat": 4, 82 | "nbformat_minor": 2 83 | } -------------------------------------------------------------------------------- /examples/scripts/binary_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | from sklearn.model_selection import train_test_split 5 | import os 6 | from sklearn.metrics import log_loss 7 | import warnings 8 | 9 | # warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning) 10 | 11 | df = pd.read_csv( 12 | "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv", 13 | skipinitialspace=True, 14 | ) 15 | 16 | X = df[df.columns[:-1]] 17 | y = df["income"] 18 | 19 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 20 | 21 | automl = AutoML( 22 | algorithms=["LightGBM"], 23 | mode="Compete", 24 | explain_level=0, 25 | train_ensemble=True, 26 | golden_features=False, 27 | features_selection=False, 28 | eval_metric="auc", 29 | ) 30 | automl.fit(X_train, y_train) 31 | 32 | predictions = automl.predict_all(X_test) 33 | 34 | print(predictions.head()) 35 | print(predictions.tail()) 36 | print(X_test.shape, predictions.shape) 37 | print("LogLoss", log_loss(y_test, predictions["prediction_>50K"])) 38 | -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_Titanic.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.metrics import accuracy_score 4 | from supervised import AutoML 5 | 6 | train = pd.read_csv( 7 | 
"https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv" 8 | ) 9 | print(train.head()) 10 | 11 | X = train[train.columns[2:]] 12 | y = train["Survived"] 13 | 14 | automl = AutoML() # default mode is Explain 15 | 16 | automl.fit(X, y) 17 | 18 | test = pd.read_csv( 19 | "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv" 20 | ) 21 | predictions = automl.predict(test) 22 | print(predictions) 23 | print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%") 24 | -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_adult_fairness.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.datasets import fetch_openml 4 | from supervised.automl import AutoML 5 | 6 | data = fetch_openml(data_id=1590, as_frame=True) 7 | X = data.data 8 | # data.target # 9 | y = data.target # (data.target == ">50K") * 1 10 | sensitive_features = X[["sex"]] 11 | 12 | X_train, X_test, y_train, y_test, S_train, S_test = train_test_split( 13 | X, y, sensitive_features, stratify=y, test_size=0.75, random_state=42 14 | ) 15 | 16 | automl = AutoML( 17 | algorithms=[ 18 | "Xgboost" 19 | ], 20 | train_ensemble=False, 21 | fairness_metric="demographic_parity_ratio", 22 | fairness_threshold=0.8, 23 | privileged_groups = [{"sex": "Male"}], 24 | underprivileged_groups = [{"sex": "Female"}], 25 | ) 26 | 27 | automl.fit(X_train, y_train, sensitive_features=S_train) 28 | -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from supervised.automl import AutoML 3 | from supervised.ensemble import Ensemble 4 | import os 5 | 6 | df = pd.read_csv( 7 | 
"https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv", 8 | skipinitialspace=True, 9 | ) 10 | 11 | X = df[df.columns[:-1]] 12 | y = df["income"] 13 | 14 | results_path = "AutoML_2" 15 | automl = AutoML( 16 | results_path=results_path, 17 | total_time_limit=400, 18 | start_random_models=10, 19 | hill_climbing_steps=0, 20 | top_models_to_improve=0, 21 | train_ensemble=False, 22 | ) 23 | 24 | 25 | models_map = {m.get_name(): m for m in automl._models} 26 | 27 | ensemble = Ensemble("logloss", "binary_classification") 28 | ensemble.models_map = models_map 29 | 30 | oofs = {} 31 | target = None 32 | for i in range(1, 30): 33 | oof = pd.read_csv( 34 | os.path.join(results_path, f"model_{i}", "predictions_out_of_folds.csv") 35 | ) 36 | prediction_cols = [c for c in oof.columns if "prediction" in c] 37 | oofs[f"model_{i}"] = oof[prediction_cols] 38 | if target is None: 39 | target_columns = [c for c in oof.columns if "target" in c] 40 | target = oof[target_columns] 41 | 42 | ensemble.target = target 43 | ensemble.target_columns = "target" 44 | ensemble.fit(oofs, target) 45 | ensemble.save(os.path.join(results_path, "ensemble")) 46 | 47 | 48 | predictions = ensemble.predict(X) 49 | print(predictions.head()) 50 | 51 | """ 52 | p_<=50K p_>50K 53 | 0 0.982940 0.017060 54 | 1 0.722781 0.277219 55 | 2 0.972687 0.027313 56 | 3 0.903021 0.096979 57 | 4 0.591373 0.408627 58 | """ 59 | 60 | 61 | ensemble2 = Ensemble.load(os.path.join(results_path, "ensemble"), models_map) 62 | predictions2 = ensemble2.predict(X) 63 | print(predictions2.head()) 64 | 65 | """ 66 | p_<=50K p_>50K 67 | 0 0.982940 0.017060 68 | 1 0.722781 0.277219 69 | 2 0.972687 0.027313 70 | 3 0.903021 0.096979 71 | 4 0.591373 0.408627 72 | """ 73 | -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_marketing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from 
supervised.automl import AutoML 3 | import os 4 | 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | df = pd.read_csv("tests/data/PortugeseBankMarketing/Data_FinalProject.csv") 9 | 10 | X = df[df.columns[:-1]] 11 | y = df["y"] 12 | 13 | 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25) 15 | 16 | 17 | automl = AutoML( 18 | # results_path="AutoML_22", 19 | total_time_limit=30 * 60, 20 | start_random_models=10, 21 | hill_climbing_steps=3, 22 | top_models_to_improve=3, 23 | train_ensemble=True, 24 | ) 25 | 26 | automl.fit(X_train, y_train) 27 | 28 | 29 | pred = automl.predict(X_test) 30 | print("Test accuracy", accuracy_score(y_test, pred)) 31 | -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_random.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | from sklearn.metrics import accuracy_score 5 | import os 6 | 7 | nrows = 100 8 | ncols = 3 9 | X = np.random.rand(nrows, ncols) 10 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(ncols)]) 11 | y = np.random.randint(0, 2, nrows) 12 | # y = np.random.permutation(["a", "B"] * 50) 13 | 14 | automl = AutoML(model_time_limit=10) # , algorithms=["Decision Tree"]) 15 | automl.fit(X, y) 16 | print("Train accuracy", accuracy_score(y, automl.predict_all(X)["label"])) 17 | 18 | # X = np.random.rand(1000, 10) 19 | # X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)]) 20 | # y = np.random.randint(0, 2, 1000) 21 | # print("Test accuracy", accuracy_score(y, automl.predict(X)["label"])) 22 | -------------------------------------------------------------------------------- /examples/scripts/multi_class_classifier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as 
np 3 | from supervised.automl import AutoML 4 | import supervised 5 | 6 | 7 | import warnings 8 | 9 | from sklearn import datasets 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.decomposition import PCA 12 | 13 | from supervised import AutoML 14 | from supervised.exceptions import AutoMLException 15 | 16 | df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv") 17 | X = df[["feature_1", "feature_2", "feature_3", "feature_4"]] 18 | y = df["class"] 19 | 20 | automl = AutoML() 21 | 22 | automl.fit(X, y) 23 | 24 | predictions = automl.predict_all(X) 25 | 26 | print(predictions.head()) 27 | print(predictions.tail()) 28 | 29 | print(X.shape) 30 | print(predictions.shape) 31 | -------------------------------------------------------------------------------- /examples/scripts/multi_class_classifier_MNIST.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from supervised.automl import AutoML 4 | 5 | 6 | from supervised.utils.config import mem 7 | 8 | 9 | df = pd.read_csv("tests/data/MNIST/train.csv") 10 | 11 | X = df[[f for f in df.columns if "pixel" in f]] 12 | y = df["label"] 13 | 14 | for _ in range(4): 15 | X = pd.concat([X, X], axis=0) 16 | y = pd.concat([y, y], axis=0) 17 | 18 | 19 | mem() 20 | 21 | 22 | automl = AutoML( 23 | # results_path="AutoML_12", 24 | total_time_limit=60 * 60, 25 | start_random_models=5, 26 | hill_climbing_steps=2, 27 | top_models_to_improve=3, 28 | train_ensemble=True, 29 | ) 30 | 31 | mem() 32 | print("Start fit") 33 | automl.fit(X, y) 34 | 35 | test = pd.read_csv("tests/data/MNIST/test.csv") 36 | predictions = automl.predict(test) 37 | 38 | print(predictions.head()) 39 | print(predictions.tail()) 40 | 41 | sub = pd.DataFrame({"ImageId": 0, "Label": predictions["label"]}) 42 | sub["ImageId"] = sub.index + 1 43 | sub.to_csv("sub1.csv", index=False) 44 | 
-------------------------------------------------------------------------------- /examples/scripts/multi_class_classifier_digits.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # scikit learn utilities 4 | from sklearn.datasets import load_digits 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | # mljar-supervised package 9 | from supervised.automl import AutoML 10 | 11 | # Load the data 12 | digits = load_digits() 13 | X_train, X_test, y_train, y_test = train_test_split( 14 | pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25 15 | ) 16 | 17 | # train models 18 | automl = AutoML(mode="Perform") 19 | automl.fit(X_train, y_train) 20 | 21 | # compute the accuracy on test data 22 | predictions = automl.predict(X_test) 23 | print(predictions.head()) 24 | print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int))) 25 | -------------------------------------------------------------------------------- /examples/scripts/multi_class_drug_fairness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from supervised import AutoML 5 | 6 | 7 | df = pd.read_csv("tests/data/Drug/Drug_Consumption.csv") 8 | 9 | 10 | X = df[df.columns[1:13]] 11 | 12 | # convert to 3 classes 13 | df = df.replace( 14 | { 15 | "Cannabis": { 16 | "CL0": "never_used", 17 | "CL1": "not_in_last_year", 18 | "CL2": "not_in_last_year", 19 | "CL3": "used_in_last_year", 20 | "CL4": "used_in_last_year", 21 | "CL5": "used_in_last_year", 22 | "CL6": "used_in_last_year", 23 | } 24 | } 25 | ) 26 | 27 | y = df["Cannabis"] 28 | 29 | # maybe should be 30 | # The binary sensitive feature is education level (college degree or not). 
31 | # like in 32 | # Fairness guarantee in multi-class classification 33 | sensitive_features = df["Gender"] 34 | 35 | 36 | automl = AutoML( 37 | algorithms=["Xgboost"], 38 | train_ensemble=True, 39 | start_random_models=3, 40 | hill_climbing_steps=3, 41 | top_models_to_improve=2, 42 | fairness_threshold=0.8, 43 | explain_level=1 44 | ) 45 | automl.fit(X, y, sensitive_features=sensitive_features) 46 | -------------------------------------------------------------------------------- /examples/scripts/regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | df = pd.read_csv("./tests/data/housing_regression_missing_values_missing_target.csv") 6 | x_cols = [c for c in df.columns if c != "MEDV"] 7 | X = df[x_cols] 8 | y = df["MEDV"] 9 | 10 | automl = AutoML() 11 | automl.fit(X, y) 12 | 13 | df["predictions"] = automl.predict(X) 14 | print("Predictions") 15 | print(df[["MEDV", "predictions"]].head()) 16 | -------------------------------------------------------------------------------- /examples/scripts/regression_acs_fairness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | # to get data 6 | # from fairlearn.datasets import fetch_acs_income 7 | # df = fetch_acs_income(as_frame=True) 8 | # df["frame"].to_csv("acs_income.csv", index=False) 9 | 10 | df = pd.read_csv("tests/data/acs_income_1k.csv") 11 | 12 | print(df) 13 | 14 | x_cols = [c for c in df.columns if c != "PINCP"] 15 | 16 | sensitive_features = df["SEX"].astype(str) 17 | 18 | X = df[x_cols] 19 | y = df["PINCP"] 20 | 21 | automl = AutoML( 22 | algorithms=["Xgboost", "LightGBM"], 23 | train_ensemble=True, 24 | fairness_threshold=0.91, 25 | # underprivileged_groups=[{"SEX": "1.0"}], 26 | # privileged_groups=[{"SEX": "2.0"}] 27 | ) 28 | automl.fit(X, y, 
sensitive_features=sensitive_features) 29 | -------------------------------------------------------------------------------- /examples/scripts/regression_crime_fairness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | # data source http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized 6 | 7 | df = pd.read_csv("tests/data/CrimeData/crimedata.csv", na_values=["?"]) 8 | 9 | X = df[df.columns[5:129]] 10 | y = df["ViolentCrimesPerPop"] 11 | 12 | sensitive_features = (df["racePctWhite"] > 84).astype(str) 13 | 14 | automl = AutoML( 15 | #algorithms=["Decision Tree", "Neural Network", "Xgboost", "Linear", "CatBoost"], 16 | algorithms=["Xgboost", "Linear", "CatBoost"], 17 | train_ensemble=True, 18 | fairness_threshold=0.5, 19 | ) 20 | automl.fit(X, y, sensitive_features=sensitive_features) 21 | -------------------------------------------------------------------------------- /examples/scripts/regression_housing_fairness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | df = pd.read_csv("./tests/data/boston_housing.csv") 6 | x_cols = [c for c in df.columns if c != "MEDV"] 7 | 8 | df["large_B"] = (df["B"] > 380) * 1 9 | df["large_B"] = df["large_B"].astype(str) 10 | 11 | 12 | print(df["large_B"].dtype.name) 13 | sensitive_features = df["large_B"] 14 | 15 | X = df[x_cols] 16 | y = df["MEDV"] 17 | 18 | automl = AutoML( 19 | algorithms=["Xgboost", "LightGBM"], 20 | train_ensemble=True, 21 | fairness_threshold=0.9, 22 | ) 23 | automl.fit(X, y, sensitive_features=sensitive_features) 24 | 25 | df["predictions"] = automl.predict(X) 26 | print("Predictions") 27 | print(df[["MEDV", "predictions"]].head()) 28 | -------------------------------------------------------------------------------- 
/examples/scripts/regression_law_school_fairness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | df = pd.read_csv("tests/data/LawSchool/bar_pass_prediction.csv") 6 | df["race1"][df["race1"] != "white"] = "non-white" # keep it as binary feature 7 | 8 | X = df[["gender", "lsat", "race1", "pass_bar"]] 9 | y = df["gpa"] 10 | 11 | sensitive_features = df["race1"] 12 | 13 | automl = AutoML( 14 | algorithms=["Xgboost", "LightGBM", "Extra Trees"], 15 | train_ensemble=True, 16 | fairness_threshold=0.9, 17 | ) 18 | automl.fit(X, y, sensitive_features=sensitive_features) 19 | -------------------------------------------------------------------------------- /examples/scripts/tabular_mar_2021.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from supervised import AutoML 3 | 4 | train = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/train.csv") 5 | test = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/test.csv") 6 | 7 | X_train = train.drop(["id", "target"], axis=1) 8 | y_train = train.target 9 | X_test = test.drop(["id"], axis=1) 10 | 11 | automl = AutoML( 12 | mode="Optuna", 13 | eval_metric="auc", 14 | algorithms=["CatBoost"], 15 | optuna_time_budget=1800, # tune each algorithm for 30 minutes 16 | total_time_limit=48 17 | * 3600, # total time limit, set large enough to have time to compute all steps 18 | features_selection=False, 19 | ) 20 | automl.fit(X_train, y_train) 21 | 22 | preds = automl.predict_proba(X_test) 23 | submission = pd.DataFrame({"id": test.id, "target": preds[:, 1]}) 24 | submission.to_csv("1_submission.csv", index=False) 25 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 
"""Packaging script for mljar-supervised."""
from os import path

from setuptools import find_packages, setup

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

# Read the runtime requirements next to this file (not relative to the current
# working directory), close the handle deterministically, and drop blank or
# comment lines so only requirement specifiers reach setuptools.
with open(path.join(here, "requirements.txt"), encoding="utf-8") as f:
    install_requires = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith("#")
    ]

setup(
    name="mljar-supervised",
    version="1.1.17",
    description="Automated Machine Learning for Humans",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/mljar/mljar-supervised",
    author="MLJAR, Sp. z o.o.",
    author_email="contact@mljar.com",
    license="MIT",
    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
    install_requires=install_requires,
    include_package_data=True,
    python_requires=">=3.8",
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    keywords=[
        "automated machine learning",
        "automl",
        "machine learning",
        "data science",
        "data mining",
        "mljar",
        "random forest",
        "decision tree",
        "xgboost",
        "lightgbm",
        "catboost",
        "neural network",
        "extra trees",
        "linear model",
        "features selection",
        "features engineering",
    ],
)
class BaselineClassifierAlgorithm(ClassifierMixin, SklearnAlgorithm):
    """Baseline classifier backed by ``DummyClassifier(strategy="prior")``."""

    algorithm_name = "Baseline Classifier"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        """Build the dummy classifier.

        Args:
            params: learner parameters; only ``seed`` is read here
                (defaults to 1) and used as the model's ``random_state``.
        """
        super().__init__(params)
        logger.debug("BaselineClassifierAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # `additional` is the module-level settings dict, looked up at call time.
        self.max_iters = additional.get("max_steps", 1)
        seed = params.get("seed", 1)
        self.model = DummyClassifier(strategy="prior", random_state=seed)

    def file_extension(self):
        """Return the file extension used for this learner."""
        return "baseline"

    def is_fitted(self):
        """Return True when the model exposes a positive ``n_outputs_``."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0


class BaselineRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
    """Baseline regressor backed by ``DummyRegressor(strategy="mean")``."""

    algorithm_name = "Baseline Regressor"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        """Build the dummy regressor; ``params`` is only forwarded to the base."""
        super().__init__(params)
        logger.debug("BaselineRegressorAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # `additional` is the module-level settings dict, looked up at call time.
        self.max_iters = additional.get("max_steps", 1)
        self.model = DummyRegressor(strategy="mean")

    def file_extension(self):
        """Return the file extension used for this learner."""
        return "baseline"

    def is_fitted(self):
        """Return True when the model exposes a positive ``n_outputs_``."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0
79 | 80 | AlgorithmsRegistry.add( 81 | MULTICLASS_CLASSIFICATION, 82 | BaselineClassifierAlgorithm, 83 | {}, 84 | required_preprocessing, 85 | additional, 86 | {}, 87 | ) 88 | 89 | 90 | AlgorithmsRegistry.add(REGRESSION, BaselineRegressorAlgorithm, {}, {}, additional, {}) 91 | -------------------------------------------------------------------------------- /supervised/algorithms/factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from supervised.algorithms.registry import BINARY_CLASSIFICATION, AlgorithmsRegistry 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | from supervised.exceptions import AutoMLException 8 | 9 | 10 | class AlgorithmFactory(object): 11 | @classmethod 12 | def get_algorithm(cls, params): 13 | alg_type = params.get("model_type", "Xgboost") 14 | ml_task = params.get("ml_task", BINARY_CLASSIFICATION) 15 | 16 | try: 17 | Algorithm = AlgorithmsRegistry.get_algorithm_class(ml_task, alg_type) 18 | return Algorithm(params) 19 | except Exception as e: 20 | raise AutoMLException(f"Cannot get algorithm class. 
KNN_ROWS_LIMIT = 1000


class KNNFit(SklearnAlgorithm):
    """Fit and state helpers shared by the k-nearest-neighbors learners."""

    def file_extension(self):
        """Return the file extension used for this learner."""
        return "k_neighbors"

    def is_fitted(self):
        """Return True when the model reports a positive ``n_samples_fit_``."""
        return (
            hasattr(self.model, "n_samples_fit_")
            and self.model.n_samples_fit_ is not None
            and self.model.n_samples_fit_ > 0
        )

    def fit(
        self,
        X,
        y,
        sample_weight=None,
        X_validation=None,
        y_validation=None,
        sample_weight_validation=None,
        log_to_file=None,
        max_time=None,
    ):
        """Fit the wrapped model, subsampling when there are too many rows.

        When ``X`` has more rows than ``params["rows_limit"]`` (default
        ``KNN_ROWS_LIMIT``), a random subset of that size is used for fitting.

        Args:
            X: training features; must expose ``shape``.
            y: training targets.
            sample_weight, X_validation, y_validation,
            sample_weight_validation, log_to_file, max_time: accepted for
                signature compatibility; not used by this method.
        """
        rows_limit = self.params.get("rows_limit", KNN_ROWS_LIMIT)
        if X.shape[0] > rows_limit:
            # Stratify only for classifiers: with a continuous regression
            # target nearly every value is its own "class", so
            # train_test_split(stratify=y) raises ValueError.
            stratify = y if isinstance(self, ClassifierMixin) else None
            X1, _, y1, _ = train_test_split(
                X, y, train_size=rows_limit, stratify=stratify, random_state=1234
            )
            self.model.fit(X1, y1)
        else:
            self.model.fit(X, y)

    @property
    def _classes(self):
        """Classes seen by the fitted model, or None when unavailable."""
        if hasattr(self.model, "classes_"):
            return self.model.classes_
        else:
            return None


class KNeighborsAlgorithm(ClassifierMixin, KNNFit):
    """k-nearest-neighbors classifier (kd-tree search)."""

    algorithm_name = "k-Nearest Neighbors"
    algorithm_short_name = "Nearest Neighbors"

    def __init__(self, params):
        """Build the classifier from ``n_neighbors``, ``weights``, ``n_jobs``."""
        super(KNeighborsAlgorithm, self).__init__(params)
        logger.debug("KNeighborsAlgorithm.__init__")
        self.library_version = sklearn.__version__
        self.max_iters = 1
        self.model = KNeighborsClassifier(
            n_neighbors=params.get("n_neighbors", 3),
            weights=params.get("weights", "uniform"),
            algorithm="kd_tree",
            n_jobs=params.get("n_jobs", -1),
        )


class KNeighborsRegressorAlgorithm(RegressorMixin, KNNFit):
    """k-nearest-neighbors regressor (ball-tree search)."""

    algorithm_name = "k-Nearest Neighbors"
    algorithm_short_name = "Nearest Neighbors"

    def __init__(self, params):
        """Build the regressor from ``n_neighbors``, ``weights``, ``n_jobs``."""
        super(KNeighborsRegressorAlgorithm, self).__init__(params)
        logger.debug("KNeighborsRegressorAlgorithm.__init__")
        self.library_version = sklearn.__version__
        self.max_iters = 1
        self.model = KNeighborsRegressor(
            n_neighbors=params.get("n_neighbors", 3),
            weights=params.get("weights", "uniform"),
            algorithm="ball_tree",
            n_jobs=params.get("n_jobs", -1),
        )
# tasks that can be handled by the package
BINARY_CLASSIFICATION = "binary_classification"
MULTICLASS_CLASSIFICATION = "multiclass_classification"
REGRESSION = "regression"


class AlgorithmsRegistry:
    """Class-level catalogue of learners, keyed by ML task then short name."""

    # task name -> {algorithm_short_name -> model information dict}
    registry = {
        BINARY_CLASSIFICATION: {},
        MULTICLASS_CLASSIFICATION: {},
        REGRESSION: {},
    }

    @staticmethod
    def add(
        task_name,
        model_class,
        model_params,
        required_preprocessing,
        additional,
        default_params,
    ):
        """Register ``model_class`` for ``task_name``.

        The entry is stored under ``model_class.algorithm_short_name`` and
        replaces any earlier entry with the same short name for that task.

        Raises:
            KeyError: if ``task_name`` is not one of the supported tasks.
        """
        model_information = {
            "class": model_class,
            "params": model_params,
            "required_preprocessing": required_preprocessing,
            "additional": additional,
            "default_params": default_params,
        }
        AlgorithmsRegistry.registry[task_name][
            model_class.algorithm_short_name
        ] = model_information

    @staticmethod
    def get_supported_ml_tasks():
        """Return the task names known to the registry (a dict keys view)."""
        return AlgorithmsRegistry.registry.keys()

    @staticmethod
    def get_algorithm_class(ml_task, algorithm_name):
        """Return the class registered as ``algorithm_name`` for ``ml_task``."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name]["class"]

    @staticmethod
    def get_long_name(ml_task, algorithm_name):
        """Return the registered class's ``algorithm_name`` attribute."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name][
            "class"
        ].algorithm_name

    @staticmethod
    def get_max_rows_limit(ml_task, algorithm_name):
        """Return ``additional["max_rows_limit"]`` of the registered entry."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
            "max_rows_limit"
        ]

    @staticmethod
    def get_max_cols_limit(ml_task, algorithm_name):
        """Return ``additional["max_cols_limit"]`` of the registered entry."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
            "max_cols_limit"
        ]

    @staticmethod
    def get_eval_metric(algorithm_name, ml_task, automl_eval_metric):
        """Return the metric name to hand to the given algorithm.

        Every algorithm except ``"Xgboost"`` gets ``automl_eval_metric`` back
        unchanged.
        """
        if algorithm_name == "Xgboost":
            # The helper was referenced without ever being imported, so this
            # branch raised NameError. It is imported lazily because the
            # algorithm modules themselves import this registry.
            # NOTE(review): assumes xgboost_eval_metric is defined in
            # supervised.algorithms.xgboost - confirm.
            from supervised.algorithms.xgboost import xgboost_eval_metric

            return xgboost_eval_metric(ml_task, automl_eval_metric)

        return automl_eval_metric
class CallbackList(object):
    """Container that forwards every training hook to each held callback."""

    def __init__(self, callbacks=None):
        """Store the callbacks to dispatch to.

        Args:
            callbacks: list of callback objects. When omitted, a new empty
                list is created for this instance; the former ``callbacks=[]``
                default was a single list shared by every instance built
                without arguments.
        """
        self.callbacks = [] if callbacks is None else callbacks

    def add_and_set_learner(self, learner):
        """Pass ``learner`` to every callback."""
        for cb in self.callbacks:
            cb.add_and_set_learner(learner)

    def on_learner_train_start(self, logs=None):
        """Notify every callback that a learner starts training."""
        for cb in self.callbacks:
            cb.on_learner_train_start(logs)

    def on_learner_train_end(self, logs=None):
        """Notify every callback that a learner finished training."""
        for cb in self.callbacks:
            cb.on_learner_train_end(logs)

    def on_iteration_start(self, logs=None):
        """Notify every callback that an iteration starts."""
        for cb in self.callbacks:
            cb.on_iteration_start(logs)

    def on_iteration_end(self, logs=None, predictions=None):
        """Notify every callback that an iteration ended."""
        for cb in self.callbacks:
            cb.on_iteration_end(logs, predictions)

    def on_framework_train_end(self, logs=None):
        """Notify every callback that the framework finished training."""
        for cb in self.callbacks:
            cb.on_framework_train_end(logs)

    def get(self, callback_name):
        """Return the first callback whose ``name`` matches, else None."""
        for cb in self.callbacks:
            if cb.name == callback_name:
                return cb
        return None
class MaxItersConstraint(Callback):
    """Callback that stops the current learner after ``max_iters`` iterations."""

    def __init__(self, params):
        """Read ``name`` and ``max_iters`` (default 10) from ``params``."""
        super().__init__(params)
        self.name = params.get("name", "max_iters_constraint")
        self.max_iters = params.get("max_iters", 10)

    def add_and_set_learner(self, learner):
        """Remember ``learner`` as the current learner."""
        # Only the current learner is kept; nothing is appended to self.learners.
        self.learner = learner

    def on_iteration_end(self, logs, predictions):
        """Flag the learner to stop once the iteration budget is used up."""
        # iter_cnt starts from 0, so +1 is the number of finished iterations.
        finished_iters = logs.get("iter_cnt") + 1
        if finished_iters < self.max_iters:
            return
        self.learner.stop_training = True
class TerminateOnNan(Callback):
    """Callback that stops the current learner on a non-finite loss value."""

    def __init__(self, learner, params):
        """Create the callback.

        Args:
            learner: learner to watch; registered through
                ``add_and_set_learner`` when not None.
            params: dict read for ``metric_name``.
        """
        # Callback.__init__ takes only `params`; passing `learner` as well
        # raised TypeError, so this callback could never be constructed.
        super(TerminateOnNan, self).__init__(params)
        if learner is not None:
            self.add_and_set_learner(learner)

        # Metric was used without being imported in this module; import it
        # from the same path MetricLogger uses.
        from supervised.utils.metric import Metric

        # Built with a {"name": ...} dict, the same way MetricLogger builds
        # its metrics.
        self.metric = Metric({"name": params.get("metric_name")})

    def on_iteration_end(self, iter_cnt, data):
        """Compute the losses for this iteration and stop on NaN or infinity.

        Args:
            iter_cnt: first positional hook argument; not used here.
            data: dict with ``y_validation_true`` / ``y_validation_predicted``
                and, optionally, ``y_train_true`` / ``y_train_predicted``.
        """
        losses = []
        # The train loss is only checked when train predictions are provided.
        if data.get("y_train_predicted") is not None:
            losses.append(
                self.metric(data.get("y_train_true"), data.get("y_train_predicted"))
            )
        losses.append(
            self.metric(
                data.get("y_validation_true"), data.get("y_validation_predicted")
            )
        )

        # isfinite is False for NaN, +inf and -inf, which covers all three
        # original checks in one call.
        if not all(np.isfinite(loss) for loss in losses):
            self.learner.stop_training = True
            log.info("Terminating learning, invalid loss value")
33 | ): 34 | one_fold_time = time.time() - self.train_start_time 35 | estimate_all_folds = one_fold_time * self.expected_learners_cnt 36 | 37 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 38 | 39 | # we need to add time for the rest of learners (assuming that all folds training time is the same) 40 | estimate_elapsed_time = total_elapsed_time + one_fold_time * ( 41 | self.expected_learners_cnt - 1 42 | ) 43 | 44 | if estimate_elapsed_time >= self.total_time_limit: 45 | raise NotTrainedException( 46 | "Stop training after the first fold. " 47 | f"Time needed to train on the first fold {np.round(one_fold_time)} seconds. " 48 | "The time estimate for training on all folds is larger than total_time_limit." 49 | ) 50 | if ( 51 | self.total_time_limit is not None 52 | and len(self.learners) < self.expected_learners_cnt 53 | # dont stop for last learner, we are finishing anyway 54 | ): 55 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 56 | 57 | if total_elapsed_time > self.total_time_limit + 600: 58 | # add 10 minutes of margin 59 | # margin is added because of unexpected time changes 60 | # if training on each fold will be the same 61 | # then the training will be stopped after first fold (above condition) 62 | raise NotTrainedException( 63 | "Force to stop the training. " 64 | "Total time for AutoML training already exceeded." 65 | ) 66 | 67 | def on_iteration_end(self, logs, predictions): 68 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 69 | 70 | if self.total_time_limit is not None: 71 | log.debug( 72 | f"Total elapsed time {total_elapsed_time} seconds. " 73 | + f"Time left {np.round(self.total_time_limit - total_elapsed_time, 2)} seconds." 
def _rate(numerator, denominator):
    """Return ``numerator / denominator`` rounded to 4 decimals.

    Returns NaN when ``denominator`` is 0 (empty input or an absent class), so
    the undefined case is explicit and never triggers a NumPy division
    warning or error.
    """
    if denominator == 0:
        return np.nan
    return np.round(numerator / denominator, 4)


def accuracy(t, y):
    """Fraction of positions where target ``t`` equals prediction ``y``."""
    return _rate(np.sum(t == y), t.shape[0])


def selection_rate(y):
    """Fraction of predictions in ``y`` equal to 1."""
    return _rate(np.sum((y == 1)), y.shape[0])


def true_positive_rate(t, y):
    """Share of samples with ``t == 1`` that are predicted as 1."""
    return _rate(np.sum((y == 1) & (t == 1)), np.sum((t == 1)))


def false_positive_rate(t, y):
    """Share of samples with ``t == 0`` that are predicted as 1."""
    return _rate(np.sum((y == 1) & (t == 0)), np.sum((t == 0)))


def true_negative_rate(t, y):
    """Share of samples with ``t == 0`` that are predicted as 0."""
    return _rate(np.sum((y == 0) & (t == 0)), np.sum((t == 0)))


def false_negative_rate(t, y):
    """Share of samples with ``t == 1`` that are predicted as 0."""
    return _rate(np.sum((y == 0) & (t == 1)), np.sum((t == 1)))
self._transforms += ["dayofyear"] 43 | new_column = column + "_DayOfYear" 44 | self._new_columns += [new_column] 45 | 46 | values = X[column].dt.hour 47 | if len(np.unique(values)) > 1: 48 | self._transforms += ["hour"] 49 | new_column = column + "_Hour" 50 | self._new_columns += [new_column] 51 | 52 | values = (X[column] - self._min_datetime).dt.days 53 | if len(np.unique(values)) > 1: 54 | self._transforms += ["days_diff"] 55 | new_column = column + "_Days_Diff_To_Min" 56 | self._new_columns += [new_column] 57 | 58 | def transform(self, X): 59 | column = self._old_column 60 | 61 | if "year" in self._transforms: 62 | new_column = column + "_Year" 63 | X[new_column] = X[column].dt.year 64 | 65 | if "month" in self._transforms: 66 | new_column = column + "_Month" 67 | X[new_column] = X[column].dt.month 68 | 69 | if "day" in self._transforms: 70 | new_column = column + "_Day" 71 | X[new_column] = X[column].dt.day 72 | 73 | if "weekday" in self._transforms: 74 | new_column = column + "_WeekDay" 75 | X[new_column] = X[column].dt.weekday 76 | 77 | if "dayofyear" in self._transforms: 78 | new_column = column + "_DayOfYear" 79 | X[new_column] = X[column].dt.dayofyear 80 | 81 | if "hour" in self._transforms: 82 | new_column = column + "_Hour" 83 | X[new_column] = X[column].dt.hour 84 | 85 | if "days_diff" in self._transforms: 86 | new_column = column + "_Days_Diff_To_Min" 87 | X[new_column] = (X[column] - self._min_datetime).dt.days 88 | 89 | X.drop(column, axis=1, inplace=True) 90 | return X 91 | 92 | def to_json(self): 93 | data_json = { 94 | "new_columns": list(self._new_columns), 95 | "old_column": self._old_column, 96 | "min_datetime": str(self._min_datetime), 97 | "transforms": list(self._transforms), 98 | } 99 | return data_json 100 | 101 | def from_json(self, data_json): 102 | self._new_columns = data_json.get("new_columns", None) 103 | self._old_column = data_json.get("old_column", None) 104 | d = data_json.get("min_datetime", None) 105 | self._min_datetime = None 
class EncodingSelector:

    """
    EncodingSelector object decides which method should be used for categorical encoding.

    Please keep it fast and simple. Thank you.
    """

    @staticmethod
    def get(X, y, column):
        """Pick the categorical encoding strategy for ``column`` of ``X``.

        Returns ``PreprocessingCategorical.FEW_CATEGORIES`` when the column has
        at most 20 distinct non-null values, otherwise (or when the values
        cannot be counted) ``PreprocessingCategorical.MANY_CATEGORIES``.
        ``y`` is accepted but not used.
        """
        try:
            present = ~pd.isnull(X[column])
            distinct_cnt = len(np.unique(X.loc[present, column]))
        except Exception:
            # Best effort: a column that cannot be counted is treated as
            # having many categories.
            return PreprocessingCategorical.MANY_CATEGORIES

        if distinct_cnt <= 20:
            return PreprocessingCategorical.FEW_CATEGORIES
        return PreprocessingCategorical.MANY_CATEGORIES
23 | logger.debug("Exclude rows with missing target values") 24 | if warn: 25 | warnings.warn( 26 | "There are samples with missing target values in the data which will be excluded for further analysis", 27 | UserWarning 28 | ) 29 | y = y.drop(y.index[y_missing]) 30 | y.reset_index(drop=True, inplace=True) 31 | 32 | if X is not None: 33 | X = X.drop(X.index[y_missing]) 34 | X.reset_index(drop=True, inplace=True) 35 | 36 | if sample_weight is not None: 37 | sample_weight = sample_weight.drop(sample_weight.index[y_missing]) 38 | sample_weight.reset_index(drop=True, inplace=True) 39 | 40 | if sensitive_features is not None: 41 | sensitive_features = sensitive_features.drop( 42 | sensitive_features.index[y_missing] 43 | ) 44 | sensitive_features.reset_index(drop=True, inplace=True) 45 | 46 | return X, y, sample_weight, sensitive_features 47 | -------------------------------------------------------------------------------- /supervised/preprocessing/kmeans_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import joblib 5 | import numpy as np 6 | from sklearn.cluster import MiniBatchKMeans 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | from supervised.exceptions import AutoMLException 10 | 11 | 12 | class KMeansTransformer(object): 13 | def __init__(self, results_path=None, model_name=None, k_fold=None): 14 | self._new_features = [] 15 | self._input_columns = [] 16 | self._error = None 17 | self._kmeans = None 18 | self._scale = None 19 | self._model_name = model_name 20 | self._k_fold = k_fold 21 | 22 | if results_path is not None: 23 | self._result_file = os.path.join( 24 | self._model_name, f"kmeans_fold_{k_fold}.joblib" 25 | ) 26 | self._result_path = os.path.join(results_path, self._result_file) 27 | # self.try_load() 28 | 29 | def fit(self, X, y): 30 | if self._new_features: 31 | return 32 | if self._error is not None and self._error: 33 | raise AutoMLException( 34 | 
"KMeans Features not created due to error (please check errors.md). " 35 | + self._error 36 | ) 37 | return 38 | if X.shape[1] == 0: 39 | self._error = f"KMeans not created. No continous features. Input data shape: {X.shape}, {y.shape}" 40 | raise AutoMLException("KMeans Features not created. No continous features.") 41 | 42 | start_time = time.time() 43 | 44 | n_clusters = int(np.log10(X.shape[0]) * 8) 45 | n_clusters = max(8, n_clusters) 46 | n_clusters = min(n_clusters, X.shape[1]) 47 | 48 | self._input_columns = X.columns.tolist() 49 | # scale data 50 | self._scale = StandardScaler(copy=True, with_mean=True, with_std=True) 51 | X = self._scale.fit_transform(X) 52 | 53 | # Kmeans 54 | self._kmeans = kmeans = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++") 55 | self._kmeans.fit(X) 56 | self._create_new_features_names() 57 | 58 | # print( 59 | # f"Created {len(self._new_features)} KMeans Features in {np.round(time.time() - start_time,2)} seconds." 60 | # ) 61 | 62 | def _create_new_features_names(self): 63 | n_clusters = self._kmeans.cluster_centers_.shape[0] 64 | self._new_features = [f"Dist_Cluster_{i}" for i in range(n_clusters)] 65 | self._new_features += ["Cluster"] 66 | 67 | def transform(self, X): 68 | if self._kmeans is None: 69 | raise AutoMLException("KMeans not fitted") 70 | 71 | # scale 72 | X_scaled = self._scale.transform(X[self._input_columns]) 73 | 74 | # kmeans 75 | distances = self._kmeans.transform(X_scaled) 76 | clusters = self._kmeans.predict(X_scaled) 77 | 78 | X[self._new_features[:-1]] = distances 79 | X[self._new_features[-1]] = clusters 80 | 81 | return X 82 | 83 | def to_json(self): 84 | self.save() 85 | data_json = { 86 | "new_features": self._new_features, 87 | "result_file": self._result_file, 88 | "input_columns": self._input_columns, 89 | } 90 | if self._error is not None and self._error: 91 | data_json["error"] = self._error 92 | return data_json 93 | 94 | def from_json(self, data_json, results_path): 95 | 
class LabelBinarizer(object):
    """One-hot encode a single column into ``<column>_<value>`` indicator columns.

    A column with exactly two distinct values is encoded with one indicator
    column (for the second value in sorted order); otherwise one indicator
    column is created per distinct value.
    """

    def __init__(self):
        self._new_columns = []
        self._uniq_values = None
        self._old_column = None
        self._old_column_dtype = None

    @staticmethod
    def _indicator_name(column, value):
        """Name of the indicator column for ``value`` of ``column``."""
        return f"{column}_{value}"

    def _encoded_values(self):
        """Values that get their own indicator column (second value only when binary)."""
        if len(self._uniq_values) == 2:
            return self._uniq_values[1:]
        return self._uniq_values

    def fit(self, X, column):
        """Learn the distinct values of ``X[column]`` and the indicator column names."""
        self._old_column = column
        self._old_column_dtype = str(X[column].dtype)
        self._uniq_values = np.unique(X[column].values)
        # Rebuild rather than append, so fitting twice does not duplicate names.
        self._new_columns = [
            self._indicator_name(column, v) for v in self._encoded_values()
        ]

    def transform(self, X, column):
        """Add indicator columns to ``X`` in place, drop ``column``, and return ``X``."""
        for v in self._encoded_values():
            X[self._indicator_name(column, v)] = (X[column] == v).astype(int)

        X.drop(column, axis=1, inplace=True)
        return X

    def inverse_transform(self, X):
        """Rebuild the original column from the indicator columns (in place) and return ``X``."""
        if self._old_column is None:
            return X

        old_col = (X[self._new_columns[0]] * 0).astype(self._old_column_dtype)

        for unique_value in self._uniq_values:
            new_col = self._indicator_name(self._old_column, unique_value)
            if new_col not in self._new_columns:
                # Binary case: the value without an indicator column is the default.
                old_col[:] = unique_value
            else:
                old_col[X[new_col] == 1] = unique_value

        X[self._old_column] = old_col
        X.drop(self._new_columns, axis=1, inplace=True)
        return X

    def to_json(self):
        """Return a JSON-serializable description of the fitted state.

        Unique values are exported as strings. The instance itself is left
        untouched, so it can still transform data after being serialized.
        """
        uniq = [] if self._uniq_values is None else list(self._uniq_values)
        return {
            "new_columns": list(self._new_columns),
            "unique_values": [str(i) for i in uniq],
            "old_column": self._old_column,
            "old_column_dtype": self._old_column_dtype,
        }

    def from_json(self, data_json):
        """Restore state produced by ``to_json``.

        A serialized pair ``"False"``/``"True"`` is restored as real booleans;
        all other values stay as strings.
        """
        self._new_columns = data_json.get("new_columns", None)
        self._uniq_values = data_json.get("unique_values", None)
        self._old_column = data_json.get("old_column", None)
        self._old_column_dtype = data_json.get("old_column_dtype", None)

        if (
            self._uniq_values is not None
            and len(self._uniq_values) == 2
            and "True" in self._uniq_values
            and "False" in self._uniq_values
        ):
            self._uniq_values = [False, True]
self._try_to_fit_numeric = try_to_fit_numeric 17 | 18 | def fit(self, x): 19 | self.lbl.fit(x) # list(x.values)) 20 | if self._try_to_fit_numeric: 21 | logger.debug("Try to fit numeric in LabelEncoder") 22 | try: 23 | arr = {Decimal(c): c for c in self.lbl.classes_} 24 | sorted_arr = dict(sorted(arr.items())) 25 | self.lbl.classes_ = np.array( 26 | list(sorted_arr.values()), dtype=self.lbl.classes_.dtype 27 | ) 28 | except Exception as e: 29 | pass 30 | 31 | def transform(self, x): 32 | try: 33 | return self.lbl.transform(x) # list(x.values)) 34 | except ValueError as ve: 35 | # rescue 36 | classes = np.unique(x) # list(x.values)) 37 | diff = np.setdiff1d(classes, self.lbl.classes_) 38 | self.lbl.classes_ = np.concatenate((self.lbl.classes_, diff)) 39 | return self.lbl.transform(x) # list(x.values)) 40 | 41 | def inverse_transform(self, x): 42 | return self.lbl.inverse_transform(x) # (list(x.values)) 43 | 44 | def to_json(self): 45 | data_json = {} 46 | for i, cl in enumerate(self.lbl.classes_): 47 | data_json[str(cl)] = i 48 | return data_json 49 | 50 | def from_json(self, data_json): 51 | keys = np.array(list(data_json.keys())) 52 | if len(keys) == 2 and "False" in keys and "True" in keys: 53 | keys = np.array([False, True]) 54 | self.lbl.classes_ = keys 55 | -------------------------------------------------------------------------------- /supervised/preprocessing/loo_encoder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import warnings 4 | 5 | import pandas as pd 6 | from category_encoders.leave_one_out import LeaveOneOutEncoder 7 | 8 | from supervised.utils.config import LOG_LEVEL 9 | 10 | logger = logging.getLogger(__name__) 11 | logger.setLevel(LOG_LEVEL) 12 | 13 | 14 | class LooEncoder(object): 15 | def __init__(self, cols=None): 16 | self.enc = LeaveOneOutEncoder( 17 | cols=cols, 18 | verbose=1, 19 | drop_invariant=False, 20 | return_df=True, 21 | handle_unknown="value", 22 | 
class PreprocessingMissingValues(object):
    """Fill missing values column by column with values learned in ``fit``."""

    FILL_NA_MIN = "na_fill_min_1"
    FILL_NA_MEAN = "na_fill_mean"
    FILL_NA_MEDIAN = "na_fill_median"
    FILL_DATETIME = "na_fill_datetime"

    NA_EXCLUDE = "na_exclude"
    MISSING_VALUE = "_missing_value_"
    REMOVE_COLUMN = "remove_column"

    def __init__(self, columns=None, na_fill_method=FILL_NA_MEDIAN):
        """
        Args:
            columns: Names of the columns to inspect in ``fit``.
                ``None`` (the default) means no columns.
            na_fill_method: One of the ``FILL_NA_*`` constants.
        """
        # A fresh list per instance; a mutable default would be shared.
        self._columns = [] if columns is None else columns
        # fill method
        self._na_fill_method = na_fill_method
        # fill parameters stored as a dict, feature -> fill value
        self._na_fill_params = {}
        self._datetime_columns = []

    def fit(self, X):
        """Learn a fill value for every configured column that has missing values."""
        self._fit_na_fill(X)

    def _fit_na_fill(self, X):
        """Store fill values for columns containing at least one missing value."""
        for column in self._columns:
            if not pd.isnull(X[column]).any():
                continue
            self._na_fill_params[column] = self._get_fill_value(X[column])
            if (
                PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME
                and column not in self._datetime_columns
            ):
                # Tracked separately because datetime values need explicit
                # (de)serialization; guarded so refitting adds no duplicates.
                self._datetime_columns.append(column)

    def _get_fill_value(self, x):
        """Return the fill value for column ``x`` according to its type and the fill method."""
        column_type = PreprocessingUtils.get_type(x)
        # categorical type
        if column_type == PreprocessingUtils.CATEGORICAL:
            if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
                # add new categorical value
                return PreprocessingMissingValues.MISSING_VALUE
            return PreprocessingUtils.get_most_frequent(x)
        # datetime
        if column_type == PreprocessingUtils.DATETIME:
            return PreprocessingUtils.get_most_frequent(x)
        # text
        if column_type == PreprocessingUtils.TEXT:
            return PreprocessingMissingValues.MISSING_VALUE

        # numerical type
        if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
            return PreprocessingUtils.get_min(x) - 1.0
        if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
            return PreprocessingUtils.get_mean(x)
        return PreprocessingUtils.get_median(x)

    def transform(self, X):
        """Fill missing values of ``X`` in place with the learned values and return ``X``."""
        X = self._transform_na_fill(X)
        # An additional pass over the columns (for data that has missing
        # values in columns unseen during fit) is disabled for now:
        # X = self._make_sure_na_filled(X)
        return X

    def _transform_na_fill(self, X):
        """Write each stored fill value into the missing cells of its column."""
        for column, value in self._na_fill_params.items():
            ind = pd.isnull(X.loc[:, column])
            X.loc[ind, column] = value
        return X

    def _make_sure_na_filled(self, X):
        """Re-learn fill values on ``X`` itself and apply them."""
        self._fit_na_fill(X)
        return self._transform_na_fill(X)

    def to_json(self):
        """Return the learned parameters as a JSON-serializable dict (``{}`` if none).

        Datetime fill values are exported as strings. They are converted on a
        copy, so the live fill values keep their datetime type and the
        instance can still be used after serialization.
        """
        if len(self._na_fill_params) == 0:
            return {}
        fill_params = dict(self._na_fill_params)
        for col in self._datetime_columns:
            fill_params[col] = str(fill_params[col])
        return {
            "fill_method": self._na_fill_method,
            "fill_params": fill_params,
            "datetime_columns": list(self._datetime_columns),
        }

    def from_json(self, params):
        """Restore state from the dict produced by ``to_json``.

        ``params`` is not modified. Passing ``None`` resets the instance to an
        empty state in which ``transform`` is a no-op.
        """
        if params is not None:
            self._na_fill_method = params.get("fill_method", None)
            # Copy so that the datetime conversion below does not alter the input.
            self._na_fill_params = dict(params.get("fill_params", {}))
            self._datetime_columns = list(params.get("datetime_columns", []))
            for col in self._datetime_columns:
                self._na_fill_params[col] = pd.to_datetime(self._na_fill_params[col])
        else:
            self._na_fill_method, self._na_fill_params = None, {}
            self._datetime_columns = []
34 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 35 | X[self.columns] = np.log( 36 | np.clip( 37 | X[self.columns] - self.X_min_values + 1, a_min=1, a_max=None 38 | ) 39 | ) 40 | X.loc[:, self.columns] = self.scale.transform(X[self.columns]) 41 | return X 42 | 43 | def inverse_transform(self, X): 44 | if len(self.columns): 45 | if self.scale_method == self.SCALE_NORMAL: 46 | X.loc[:, self.columns] = self.scale.inverse_transform(X[self.columns]) 47 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 48 | X[self.columns] = X[self.columns].astype("float64") 49 | 50 | X[self.columns] = self.scale.inverse_transform(X[self.columns]) 51 | X[self.columns] = np.exp(X[self.columns]) 52 | 53 | X.loc[:, self.columns] += self.X_min_values - 1 54 | return X 55 | 56 | def to_json(self): 57 | if len(self.columns) == 0: 58 | return None 59 | data_json = { 60 | "scale": list(self.scale.scale_), 61 | "mean": list(self.scale.mean_), 62 | "var": list(self.scale.var_), 63 | "n_samples_seen": int(self.scale.n_samples_seen_), 64 | "n_features_in": int(self.scale.n_features_in_), 65 | "columns": self.columns, 66 | "scale_method": self.scale_method, 67 | } 68 | if self.X_min_values is not None: 69 | data_json["X_min_values"] = list(self.X_min_values) 70 | return data_json 71 | 72 | def from_json(self, data_json): 73 | self.scale = preprocessing.StandardScaler( 74 | copy=True, with_mean=True, with_std=True 75 | ) 76 | self.scale.scale_ = data_json.get("scale") 77 | if self.scale.scale_ is not None: 78 | self.scale.scale_ = np.array(self.scale.scale_) 79 | self.scale.mean_ = data_json.get("mean") 80 | if self.scale.mean_ is not None: 81 | self.scale.mean_ = np.array(self.scale.mean_) 82 | self.scale.var_ = data_json.get("var") 83 | if self.scale.var_ is not None: 84 | self.scale.var_ = np.array(self.scale.var_) 85 | self.scale.n_samples_seen_ = int(data_json.get("n_samples_seen")) 86 | self.scale.n_features_in_ = int(data_json.get("n_features_in")) 87 | self.columns = 
data_json.get("columns", []) 88 | self.scale.feature_names_in_ = data_json.get("columns") 89 | self.scale_method = data_json.get("scale_method") 90 | self.X_min_values = data_json.get("X_min_values") 91 | if self.X_min_values is not None: 92 | self.X_min_values = np.array(self.X_min_values) 93 | -------------------------------------------------------------------------------- /supervised/preprocessing/text_transformer.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | 6 | 7 | class TextTransformer(object): 8 | def __init__(self): 9 | self._new_columns = [] 10 | self._old_column = None 11 | self._max_features = 100 12 | self._vectorizer = None 13 | 14 | def fit(self, X, column): 15 | self._old_column = column 16 | self._vectorizer = TfidfVectorizer( 17 | analyzer="word", 18 | stop_words="english", 19 | lowercase=True, 20 | max_features=self._max_features, 21 | ) 22 | 23 | x = X[column][~pd.isnull(X[column])] 24 | self._vectorizer.fit(x) 25 | for f in list(self._vectorizer.get_feature_names_out()): 26 | new_col = self._old_column + "_" + f 27 | self._new_columns += [new_col] 28 | 29 | def transform(self, X): 30 | with warnings.catch_warnings(): 31 | warnings.simplefilter( 32 | action="ignore", category=pd.errors.PerformanceWarning 33 | ) 34 | ii = ~pd.isnull(X[self._old_column]) 35 | x = X[self._old_column][ii] 36 | vect = self._vectorizer.transform(x) 37 | 38 | for f in self._new_columns: 39 | X[f] = 0.0 40 | 41 | X.loc[ii, self._new_columns] = vect.toarray() 42 | X.drop(self._old_column, axis=1, inplace=True) 43 | return X 44 | 45 | def to_json(self): 46 | for k in self._vectorizer.vocabulary_.keys(): 47 | self._vectorizer.vocabulary_[k] = int(self._vectorizer.vocabulary_[k]) 48 | 49 | data_json = { 50 | "new_columns": list(self._new_columns), 51 | "old_column": self._old_column, 52 | "vocabulary": 
self._vectorizer.vocabulary_, 53 | "fixed_vocabulary": self._vectorizer.fixed_vocabulary_, 54 | "idf": list(self._vectorizer.idf_), 55 | } 56 | return data_json 57 | 58 | def from_json(self, data_json): 59 | self._new_columns = data_json.get("new_columns", None) 60 | self._old_column = data_json.get("old_column", None) 61 | vocabulary = data_json.get("vocabulary") 62 | fixed_vocabulary = data_json.get("fixed_vocabulary") 63 | idf = data_json.get("idf") 64 | if vocabulary is not None and fixed_vocabulary is not None and idf is not None: 65 | self._vectorizer = TfidfVectorizer( 66 | analyzer="word", 67 | stop_words="english", 68 | lowercase=True, 69 | max_features=self._max_features, 70 | ) 71 | self._vectorizer.vocabulary_ = vocabulary 72 | self._vectorizer.fixed_vocabulary_ = fixed_vocabulary 73 | self._vectorizer.idf_ = np.array(idf) 74 | -------------------------------------------------------------------------------- /supervised/tuner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/tuner/__init__.py -------------------------------------------------------------------------------- /supervised/tuner/data_info.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from supervised.algorithms.registry import ( 5 | BINARY_CLASSIFICATION, 6 | MULTICLASS_CLASSIFICATION, 7 | REGRESSION, 8 | ) 9 | from supervised.preprocessing.encoding_selector import EncodingSelector 10 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 11 | 12 | 13 | class DataInfo: 14 | @staticmethod 15 | def compute(X, y, machinelearning_task): 16 | columns_info = {} 17 | for col in X.columns: 18 | columns_info[col] = [] 19 | # 20 | empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0] 21 | if empty_column: 22 | columns_info[col] += 
["empty_column"] 23 | continue 24 | # 25 | constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1 26 | if constant_column: 27 | columns_info[col] += ["constant_column"] 28 | continue 29 | # 30 | if PreprocessingUtils.is_na(X[col]): 31 | columns_info[col] += ["missing_values"] 32 | # 33 | if PreprocessingUtils.is_categorical(X[col]): 34 | columns_info[col] += ["categorical"] 35 | columns_info[col] += [EncodingSelector.get(X, y, col)] 36 | elif PreprocessingUtils.is_datetime(X[col]): 37 | columns_info[col] += ["datetime_transform"] 38 | elif PreprocessingUtils.is_text(X[col]): 39 | columns_info[col] = ["text_transform"] # override other transforms 40 | else: 41 | # numeric type, check if scale needed 42 | if PreprocessingUtils.is_scale_needed(X[col]): 43 | columns_info[col] += ["scale"] 44 | 45 | target_info = [] 46 | if machinelearning_task == BINARY_CLASSIFICATION: 47 | if not PreprocessingUtils.is_0_1(y): 48 | target_info += ["convert_0_1"] 49 | 50 | if machinelearning_task == REGRESSION: 51 | if PreprocessingUtils.is_log_scale_needed(y): 52 | target_info += ["scale_log"] 53 | elif PreprocessingUtils.is_scale_needed(y): 54 | target_info += ["scale"] 55 | 56 | num_class = None 57 | if machinelearning_task == MULTICLASS_CLASSIFICATION: 58 | num_class = PreprocessingUtils.num_class(y) 59 | 60 | return { 61 | "columns_info": columns_info, 62 | "target_info": target_info, 63 | "num_class": num_class, 64 | } 65 | -------------------------------------------------------------------------------- /supervised/tuner/hill_climbing.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import numpy as np 4 | 5 | from supervised.algorithms.registry import AlgorithmsRegistry 6 | 7 | 8 | class HillClimbing: 9 | 10 | """ 11 | Example params are in JSON format: 12 | { 13 | "booster": ["gbtree", "gblinear"], 14 | "objective": ["binary:logistic"], 15 | "eval_metric": ["auc", "logloss"], 16 | "eta": [0.0025, 0.005, 
0.0075, 0.01, 0.025, 0.05, 0.075, 0.1] 17 | } 18 | """ 19 | 20 | @staticmethod 21 | def get(params, ml_task, seed=1): 22 | np.random.seed(seed) 23 | keys = list(params.keys()) 24 | for k in [ 25 | "num_class", 26 | "model_type", 27 | "seed", 28 | "ml_task", 29 | "explain_level", 30 | "model_architecture_json", 31 | "n_jobs", 32 | "metric", 33 | "eval_metric", 34 | "custom_eval_metric_name", 35 | "eval_metric_name", 36 | ]: 37 | if k in keys: 38 | keys.remove(k) 39 | 40 | model_type = params["model_type"] 41 | if model_type == "Baseline": 42 | return [None, None] 43 | model_info = AlgorithmsRegistry.registry[ml_task][model_type] 44 | model_params = model_info["params"] 45 | 46 | permuted_keys = np.random.permutation(keys) 47 | key_to_update = None 48 | values = None 49 | 50 | for key_to_update in permuted_keys: 51 | if key_to_update not in model_params: 52 | continue 53 | values = model_params[key_to_update] 54 | if len(values) > 1: 55 | break 56 | if values is None: 57 | return [None, None] 58 | 59 | left, right = None, None 60 | for i, v in enumerate(values): 61 | if v == params[key_to_update]: 62 | if i + 1 < len(values): 63 | right = values[i + 1] 64 | if i - 1 >= 0: 65 | left = values[i - 1] 66 | 67 | params_1, params_2 = None, None 68 | if left is not None: 69 | params_1 = copy.deepcopy(params) 70 | params_1[key_to_update] = left 71 | if right is not None: 72 | params_2 = copy.deepcopy(params) 73 | params_2[key_to_update] = right 74 | 75 | if params_1 is not None and "model_architecture_json" in params_1: 76 | del params_1["model_architecture_json"] 77 | if params_2 is not None and "model_architecture_json" in params_2: 78 | del params_2["model_architecture_json"] 79 | 80 | return [params_1, params_2] 81 | -------------------------------------------------------------------------------- /supervised/tuner/optuna/__init__.py: -------------------------------------------------------------------------------- 
class ExtraTreesObjective:
    """Optuna objective: train an Extra Trees model and score it on validation data."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        # NOTE: sample_weight_validation is part of the signature but is not stored.
        self.ml_task = ml_task
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.seed = random_state

        self.X_train, self.y_train = X_train, y_train
        self.sample_weight = sample_weight
        self.X_validation, self.y_validation = X_validation, y_validation

        self.objective = "squared_error" if ml_task == REGRESSION else "gini"
        self.max_steps = 10  # ET is trained in steps 100 trees each

    def __call__(self, trial):
        """Evaluate one trial.

        Returns the validation score, multiplied by -1.0 when
        ``Metric.optimize_negative`` is true for the metric name. Pruned
        trials are re-raised; any other exception is printed and ``None`` is
        returned.
        """
        try:
            is_regression = self.ml_task == REGRESSION
            Algorithm = (
                ExtraTreesRegressorAlgorithm if is_regression else ExtraTreesAlgorithm
            )
            if is_regression:
                self.objective = "squared_error"
            else:
                self.objective = trial.suggest_categorical(
                    "criterion", ["gini", "entropy"]
                )

            # Suggestions are requested in a fixed order.
            max_depth = trial.suggest_int("max_depth", 2, 32)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 100)
            min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 100)
            max_features = trial.suggest_float("max_features", 0.01, 1)

            model = Algorithm(
                {
                    "max_steps": self.max_steps,
                    "criterion": self.objective,
                    "max_depth": max_depth,
                    "min_samples_split": min_samples_split,
                    "min_samples_leaf": min_samples_leaf,
                    "max_features": max_features,
                    "n_jobs": self.n_jobs,
                    "seed": self.seed,
                    "ml_task": self.ml_task,
                }
            )
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            predictions = model.predict(self.X_validation)
            score = self.eval_metric(self.y_validation, predictions)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
        except optuna.exceptions.TrialPruned:
            raise
        except Exception as e:
            print("Exception in ExtraTreesObjective", str(e))
            return None
        else:
            return score
class NeuralNetworkObjective:
    """Optuna objective: train an MLP model and score it on validation data."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        # NOTE: sample_weight_validation and n_jobs are part of the signature
        # but are not stored on the instance.
        self.ml_task = ml_task
        self.eval_metric = eval_metric
        self.seed = random_state

        self.X_train, self.y_train = X_train, y_train
        self.sample_weight = sample_weight
        self.X_validation, self.y_validation = X_validation, y_validation

    def __call__(self, trial):
        """Evaluate one trial.

        Returns the validation score, multiplied by -1.0 when
        ``Metric.optimize_negative`` is true for the metric name. Pruned
        trials are re-raised; any other exception is printed and ``None`` is
        returned.
        """
        try:
            if self.ml_task == REGRESSION:
                Algorithm = MLPRegressorAlgorithm
            else:
                Algorithm = MLPAlgorithm

            # Suggestions are requested in a fixed order.
            dense_1_size = trial.suggest_int("dense_1_size", 4, 100)
            dense_2_size = trial.suggest_int("dense_2_size", 2, 100)
            learning_rate = trial.suggest_categorical(
                "learning_rate", [0.005, 0.01, 0.05, 0.1, 0.2]
            )
            learning_rate_type = trial.suggest_categorical(
                "learning_rate_type", ["constant", "adaptive"]
            )
            alpha = trial.suggest_float("alpha", 1e-8, 10.0, log=True)

            model = Algorithm(
                {
                    "dense_1_size": dense_1_size,
                    "dense_2_size": dense_2_size,
                    "learning_rate": learning_rate,
                    "learning_rate_type": learning_rate_type,
                    "alpha": alpha,
                    "seed": self.seed,
                    "ml_task": self.ml_task,
                }
            )
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            predictions = model.predict(self.X_validation)
            score = self.eval_metric(self.y_validation, predictions)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
        except optuna.exceptions.TrialPruned:
            raise
        except Exception as e:
            print("Exception in NeuralNetworkObjective", str(e))
            return None
        else:
            return score
class RandomParameters:
    """
    Draws one random value per hyperparameter.

    The expected input maps each parameter name to its list of allowed
    values, for example:
    {
        "booster": ["gbtree", "gblinear"],
        "objective": ["binary:logistic"],
        "eval_metric": ["auc", "logloss"],
        "eta": [0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
    }
    """

    @staticmethod
    def get(params, seed=1):
        """Return ``{"seed": seed, name: random choice, ...}`` for ``params``.

        The global NumPy RNG is seeded with ``seed`` first, so the same
        arguments always produce the same draw. Each choice is the first
        element of a random permutation of the allowed values, converted to a
        plain Python scalar.
        """
        np.random.seed(seed)
        return {
            "seed": seed,
            **{
                name: np.random.permutation(params[name])[0].item()
                for name in params
            },
        }
def construct_learner_name(fold, repeat, repeats):
    """Build a learner name from its fold and repeat indices.

    The ``_repeat_<n>`` suffix is added only when ``repeats`` is greater
    than one.
    """
    repeat_str = f"_repeat_{repeat}" if repeats > 1 else ""
    return f"learner_fold_{fold}{repeat_str}"


def learner_name_to_fold_repeat(name):
    """Parse the fold and repeat indices out of a learner (file) name.

    Args:
        name: A string shaped like ``learner_fold_<fold>`` optionally followed
            by ``_repeat_<repeat>`` and any ``_``-separated suffix.

    Returns:
        A ``(fold, repeat)`` tuple; ``repeat`` is ``None`` when the name has
        no repeat part.
    """
    parts = name.split("_")
    fold = int(parts[2])
    repeat = int(parts[4]) if "repeat" in name else None
    return fold, repeat


def get_fold_repeat_cnt(model_path):
    """Count folds and repeats from the training logs found in a directory.

    Args:
        model_path: Directory that holds the ``*_training.log`` files.

    Returns:
        A ``(fold_cnt, repeat_cnt)`` tuple. Both are at least one, even when
        no training log is present.
    """
    training_logs = [f for f in os.listdir(model_path) if "_training.log" in f]
    fold_cnt, repeat_cnt = 0, 0
    for fname in training_logs:
        fold, repeat = learner_name_to_fold_repeat(fname)
        if fold is not None:
            fold_cnt = max(fold_cnt, fold)
        if repeat is not None:
            repeat_cnt = max(repeat_cnt, repeat)

    # Indices are counted from zero, so the count is the highest index + 1.
    fold_cnt += 1
    repeat_cnt += 1

    return fold_cnt, repeat_cnt


def get_learners_names(model_path):
    """List learner names derived from the training logs in a directory.

    Args:
        model_path: Directory that holds the ``*_training.log`` files.

    Returns:
        File names containing ``_training.log`` with that postfix removed, in
        ``os.listdir`` order.
    """
    postfix = "_training.log"
    # str.replace was misspelled here, so every call raised AttributeError.
    return [f.replace(postfix, "") for f in os.listdir(model_path) if postfix in f]
def check_greater_than_zero_integer(value, original_var_name):
    """Raise ValueError unless ``value`` is an int strictly above zero."""
    check_integer(value, original_var_name)
    if value <= 0:
        raise ValueError(
            f"'{original_var_name}' must be greater than zero, got '{value}'."
        )


def check_positive_integer(value, original_var_name):
    """Raise ValueError unless ``value`` is an int that is zero or above."""
    check_integer(value, original_var_name)
    if value < 0:
        raise ValueError(
            f"'{original_var_name}' must be equal or greater than zero, got '{value}'."
        )


def check_integer(value, original_var_name):
    """Raise ValueError unless ``value`` is an instance of int."""
    if isinstance(value, int):
        return
    raise ValueError(
        f"'{original_var_name}' must be an integer, got '{type(value)}'."
    )


def check_bool(value, original_var_name):
    """Raise ValueError unless ``value`` is an instance of bool."""
    if isinstance(value, bool):
        return
    raise ValueError(
        f"'{original_var_name}' must be a boolean, got '{type(value)}'."
    )


def check_greater_than_zero_integer_or_float(value, original_var_name):
    """Raise ValueError unless ``value`` is an int or float above zero."""
    if not isinstance(value, (int, float)):
        raise ValueError(
            f"'{original_var_name}' must be an integer or float, got '{type(value)}'."
        )
    if value <= 0:
        raise ValueError(
            f"'{original_var_name}' must be greater than zero, got '{value}'."
        )
speed-up importance computation 54 | # in the case of large number of columns, it can take a lot of time 55 | rows, cols = X_validation.shape 56 | if cols > 5000 and rows > 100: 57 | X_vald, _, y_vald, _ = subsample( 58 | X_validation, y_validation, train_size=100, ml_task=ml_task 59 | ) 60 | elif cols > 50 and rows * cols > 200000 and rows > 1000: 61 | X_vald, _, y_vald, _ = subsample( 62 | X_validation, y_validation, train_size=1000, ml_task=ml_task 63 | ) 64 | else: 65 | X_vald = X_validation 66 | y_vald = y_validation 67 | 68 | importance = permutation_importance( 69 | model, 70 | X_vald, 71 | y_vald, 72 | scoring=scoring, 73 | n_jobs=n_jobs, 74 | random_state=12, 75 | n_repeats=5, # default 76 | ) 77 | 78 | sorted_idx = importance["importances_mean"].argsort() 79 | 80 | # save detailed importance 81 | df_imp = pd.DataFrame( 82 | { 83 | "feature": X_vald.columns[sorted_idx], 84 | "mean_importance": importance["importances_mean"][sorted_idx], 85 | } 86 | ) 87 | df_imp.to_csv( 88 | os.path.join(model_file_path, f"{learner_name}_importance.csv"), 89 | index=False, 90 | ) 91 | except Exception as e: 92 | print(str(e)) 93 | print("Problem during computing permutation importance. 
class MLJSONEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy values and dates.

    Handled types:
        * NumPy booleans, integers and floats become ``bool``/``int``/``float``.
        * ``numpy.ndarray`` becomes a (nested) list.
        * ``datetime.date`` becomes a ``YYYY-MM-DD`` string. ``datetime``
          objects are ``date`` subclasses, so they are written the same way
          and lose their time part.
    """

    def default(self, o):
        """Convert ``o`` to a JSON-serializable value.

        Raises:
            TypeError: From the base encoder, for any unsupported type.
        """
        # The abstract scalar classes cover every concrete width. This also
        # avoids naming np.float_, which was removed in NumPy 2.0.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        # This branch used the undefined name `obj`, so reaching it raised
        # NameError instead of encoding the date.
        if isinstance(o, date):
            return o.strftime("%Y-%m-%d")

        return super().default(o)
class ValidationStep:
    """Facade that selects a validator from ``params`` and delegates to it."""

    def __init__(self, params):
        """Create the validator named by ``params["validation_type"]``.

        Raises:
            AutoMLException: If the validation type is not one of
                ``kfold``, ``split`` or ``custom``.
        """
        # kfold is default validation technique
        self.validation_type = params.get("validation_type", "kfold")

        known_validators = (
            ("kfold", KFoldValidator),
            ("split", SplitValidator),
            ("custom", CustomValidator),
        )
        for type_name, validator_cls in known_validators:
            if self.validation_type == type_name:
                self.validator = validator_cls(params)
                break
        else:
            # No known type matched.
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

    def get_split(self, k, repeat=0):
        """Return the validator's split number ``k`` for the given repeat."""
        return self.validator.get_split(k, repeat)

    def split(self):
        """Delegate to the validator's ``split``."""
        return self.validator.split()

    def get_n_splits(self):
        """Delegate to the validator's ``get_n_splits``."""
        return self.validator.get_n_splits()

    def get_repeats(self):
        """Delegate to the validator's ``get_repeats``."""
        return self.validator.get_repeats()
class AutoMLWithRegressionTest(unittest.TestCase):
    """Smoke check: AutoML fits and predicts on a small regression dataset."""

    def test_fit_and_predict(self):
        """Fit a single Xgboost model and predict on the held-out rows."""
        seed = 1709

        # The path is relative, so this check is expected to run from the
        # repository root.
        df = pd.read_csv(
            "./tests/data/housing_regression_missing_values_missing_target.csv"
        )
        x_cols = [c for c in df.columns if c != "MEDV"]
        X = df[x_cols]
        y = df["MEDV"]

        X_train, X_test, y_train, _ = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3, random_state=seed
        )
        automl = AutoML(
            total_time_limit=10,
            algorithms=["Xgboost"],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            train_ensemble=True,
            verbose=True,
        )
        automl.fit(X_train, y_train)

        response = automl.predict(X_test)
        print("Response", response)

        # Previously nothing was asserted, so the check could only fail by
        # raising. Require one prediction per test row.
        self.assertEqual(len(response), len(X_test))
@pytest.fixture
def data_folder(request) -> Path:
    """Provide the path of the ``data`` directory next to this conftest.

    When the requesting test belongs to a class, the path is also stored on
    that class as ``data_folder`` so ``unittest``-style tests can read it via
    ``self.data_folder``.

    Returns:
        Path to the test data directory.
    """
    folder_path = Path(__file__).parent / 'data'
    assert folder_path.exists()
    # request.cls is None for plain test functions; assigning an attribute on
    # it would raise AttributeError, so only attach the path for class tests.
    if request.cls is not None:
        request.cls.data_folder = folder_path
    return folder_path
4.9,3.0,1.4,0.2,1 4 | 4.7,3.2,1.3,,1 5 | 4.6,3.1,1.5,,1 6 | 5.0,3.6,1.4,0.2,1 7 | ,3.9,1.7,0.4,1 8 | 4.6,3.4,1.4,0.3,1 9 | 5.0,3.4,1.5,0.2,1 10 | 4.4,,1.4,0.2,1 11 | 4.9,3.1,1.5,0.1,1 12 | 5.4,3.7,1.5,0.2,1 13 | 4.8,3.4,,0.2,1 14 | 4.8,3.0,1.4,0.1,1 15 | 4.3,3.0,1.1,0.1,1 16 | 5.8,4.0,1.2,0.2,1 17 | 5.7,4.4,1.5,0.4,1 18 | 5.4,3.9,1.3,0.4,1 19 | 5.1,3.5,1.4,0.3, 20 | 5.7,3.8,1.7,0.3,1 21 | 5.1,3.8,1.5,0.3,1 22 | 5.4,3.4,1.7,0.2,1 23 | 5.1,3.7,1.5,0.4,1 24 | 4.6,3.6,1.0,0.2,1 25 | 5.1,3.3,1.7,0.5,1 26 | 4.8,3.4,1.9,0.2,1 27 | 5.0,3.0,1.6,0.2,1 28 | 5.0,3.4,1.6,0.4,1 29 | 5.2,3.5,1.5,0.2,1 30 | 5.2,3.4,1.4,0.2,1 31 | 4.7,3.2,1.6,0.2,1 32 | 4.8,3.1,1.6,0.2,1 33 | 5.4,3.4,1.5,0.4,1 34 | 5.2,4.1,1.5,0.1,1 35 | 5.5,4.2,1.4,0.2,1 36 | 4.9,3.1,1.5,0.1,1 37 | 5.0,3.2,1.2,0.2,1 38 | 5.5,3.5,1.3,0.2,1 39 | 4.9,3.1,1.5,0.1,1 40 | 4.4,3.0,1.3,0.2,1 41 | 5.1,3.4,1.5,0.2,1 42 | 5.0,3.5,1.3,0.3,1 43 | 4.5,2.3,1.3,0.3,1 44 | 4.4,3.2,1.3,0.2,1 45 | 5.0,3.5,1.6,0.6,1 46 | 5.1,3.8,1.9,0.4,1 47 | 4.8,3.0,1.4,0.3,1 48 | 5.1,3.8,1.6,0.2,1 49 | 4.6,3.2,1.4,0.2,1 50 | 5.3,3.7,1.5,0.2,1 51 | 5.0,3.3,1.4,0.2,1 52 | 7.0,3.2,4.7,1.4,2 53 | 6.4,3.2,4.5,1.5,2 54 | 6.9,3.1,4.9,1.5, 55 | 5.5,2.3,4.0,1.3,2 56 | 6.5,2.8,4.6,1.5,2 57 | 5.7,2.8,4.5,1.3,2 58 | 6.3,3.3,4.7,1.6,2 59 | 4.9,2.4,3.3,1.0,2 60 | 6.6,2.9,4.6,1.3,2 61 | 5.2,2.7,3.9,1.4,2 62 | 5.0,2.0,3.5,1.0,2 63 | 5.9,3.0,4.2,1.5,2 64 | 6.0,2.2,4.0,1.0,2 65 | 6.1,2.9,4.7,1.4,2 66 | 5.6,2.9,3.6,1.3,2 67 | 6.7,3.1,4.4,1.4,2 68 | 5.6,3.0,4.5,1.5,2 69 | 5.8,2.7,4.1,1.0,2 70 | 6.2,2.2,4.5,1.5,2 71 | 5.6,2.5,3.9,1.1,2 72 | 5.9,3.2,4.8,1.8,2 73 | 6.1,2.8,4.0,1.3,2 74 | 6.3,2.5,4.9,1.5,2 75 | 6.1,2.8,4.7,1.2,2 76 | 6.4,2.9,4.3,1.3,2 77 | 6.6,3.0,4.4,1.4,2 78 | 6.8,2.8,4.8,1.4,2 79 | 6.7,3.0,5.0,1.7,2 80 | 6.0,2.9,4.5,1.5,2 81 | 5.7,2.6,3.5,1.0,2 82 | 5.5,2.4,3.8,1.1,2 83 | 5.5,2.4,3.7,1.0,2 84 | 5.8,2.7,3.9,1.2,2 85 | 6.0,2.7,5.1,1.6,2 86 | 5.4,3.0,4.5,1.5,2 87 | 6.0,3.4,4.5,1.6,2 88 | 6.7,3.1,4.7,1.5,2 89 | 6.3,2.3,4.4,1.3,2 90 | 5.6,3.0,4.1,1.3,2 91 
| 5.5,2.5,4.0,1.3,2 92 | 5.5,2.6,4.4,1.2,2 93 | 6.1,3.0,4.6,1.4,2 94 | 5.8,2.6,4.0,1.2,2 95 | 5.0,2.3,3.3,1.0,2 96 | 5.6,2.7,4.2,1.3,2 97 | 5.7,3.0,4.2,1.2,2 98 | 5.7,2.9,4.2,1.3,2 99 | 6.2,2.9,4.3,1.3,2 100 | 5.1,2.5,3.0,1.1,2 101 | 5.7,2.8,4.1,1.3,2 102 | 6.3,3.3,6.0,2.5,121 103 | 5.8,2.7,5.1,1.9,121 104 | 7.1,3.0,5.9,2.1,121 105 | 6.3,2.9,5.6,1.8,121 106 | 6.5,3.0,5.8,2.2,121 107 | 7.6,3.0,6.6,2.1,121 108 | 4.9,2.5,4.5,1.7,121 109 | 7.3,2.9,6.3,1.8,121 110 | 6.7,2.5,5.8,1.8,121 111 | 7.2,3.6,6.1,2.5,121 112 | 6.5,3.2,5.1,2.0,121 113 | 6.4,2.7,5.3,1.9,121 114 | 6.8,3.0,5.5,2.1,121 115 | 5.7,2.5,5.0,2.0,121 116 | 5.8,2.8,5.1,2.4,121 117 | 6.4,3.2,5.3,2.3,121 118 | 6.5,3.0,5.5,1.8,121 119 | 7.7,3.8,6.7,2.2,121 120 | 7.7,2.6,6.9,2.3,121 121 | 6.0,2.2,5.0,1.5,121 122 | 6.9,3.2,5.7,2.3,121 123 | 5.6,2.8,4.9,2.0,121 124 | 7.7,2.8,6.7,2.0,121 125 | 6.3,2.7,4.9,1.8,121 126 | 6.7,3.3,5.7,2.1,121 127 | 7.2,3.2,6.0,1.8,121 128 | 6.2,2.8,4.8,1.8,121 129 | 6.1,3.0,4.9,1.8,121 130 | 6.4,2.8,5.6,2.1,121 131 | 7.2,3.0,5.8,1.6,121 132 | 7.4,2.8,6.1,1.9,121 133 | 7.9,3.8,6.4,2.0,121 134 | 6.4,2.8,5.6,2.2,121 135 | 6.3,2.8,5.1,1.5,121 136 | 6.1,2.6,5.6,1.4,121 137 | 7.7,3.0,6.1,2.3,121 138 | 6.3,3.4,5.6,2.4,121 139 | 6.4,3.1,5.5,1.8,121 140 | 6.0,3.0,4.8,1.8,121 141 | 6.9,3.1,5.4,2.1,121 142 | 6.7,3.1,5.6,2.4,121 143 | 6.9,3.1,5.1,2.3,121 144 | 5.8,2.7,5.1,1.9,121 145 | 6.8,3.2,5.9,2.3,121 146 | 6.7,3.3,5.7,2.5,121 147 | 6.7,3.0,5.2,2.3,121 148 | 6.3,2.5,5.0,1.9,121 149 | 6.5,3.0,5.2,2.0,121 150 | 6.2,3.4,5.4,2.3,121 151 | 5.9,3.0,5.1,1.8,121 152 | 153 | -------------------------------------------------------------------------------- /tests/tests_algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_algorithms/__init__.py -------------------------------------------------------------------------------- 
class BaselineTest(unittest.TestCase):
    """Tests for the baseline regressor and classifier algorithms."""

    @classmethod
    def setUpClass(cls):
        """Create one deterministic regression dataset shared by the tests."""
        cls.X, cls.y = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )

    def test_reproduce_fit_regression(self):
        """Refitting the regressor on the same data gives the same loss."""
        metric = Metric({"name": "rmse"})
        prev_loss = None
        for _ in range(3):
            model = BaselineRegressorAlgorithm({"ml_task": "regression"})
            model.fit(self.X, self.y)
            y_predicted = model.predict(self.X)
            loss = metric(self.y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_reproduce_fit_bin_class(self):
        """Refitting the classifier on the same data gives the same loss."""
        X, y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        metric = Metric({"name": "logloss"})
        prev_loss = None
        for _ in range(3):
            model = BaselineClassifierAlgorithm({"ml_task": "binary_classification"})
            model.fit(X, y)
            y_predicted = model.predict(X)
            loss = metric(y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_save_and_load(self):
        """A saved and reloaded model predicts the same as the original."""
        metric = Metric({"name": "rmse"})
        dt = BaselineRegressorAlgorithm({"ml_task": "regression"})
        dt.fit(self.X, self.y)
        y_predicted = dt.predict(self.X)
        loss = metric(self.y, y_predicted)

        # A temporary directory removes the model file even when save() or
        # load() raises; the previous os.remove call was skipped then.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, os.urandom(12).hex())
            dt.save(filename)
            dt2 = BaselineRegressorAlgorithm({"ml_task": "regression"})
            dt2.load(filename)

        y_predicted = dt2.predict(self.X)
        loss2 = metric(self.y, y_predicted)
        assert_almost_equal(loss, loss2)

    def test_is_fitted(self):
        """is_fitted() is False before fit() and True after it."""
        model = BaselineRegressorAlgorithm({"ml_task": "regression"})
        self.assertFalse(model.is_fitted())
        model.fit(self.X, self.y)
        self.assertTrue(model.is_fitted())
"regression"}) 42 | dt.fit(self.X, self.y) 43 | y_predicted = dt.predict(self.X) 44 | loss = metric(self.y, y_predicted) 45 | 46 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 47 | 48 | dt.save(filename) 49 | dt2 = DecisionTreeRegressorAlgorithm({"ml_task": "regression"}) 50 | dt2.load(filename) 51 | 52 | y_predicted = dt2.predict(self.X) 53 | loss2 = metric(self.y, y_predicted) 54 | assert_almost_equal(loss, loss2) 55 | 56 | # Finished with temp file, delete it 57 | os.remove(filename) 58 | 59 | def test_is_fitted(self): 60 | params = {"max_depth": 1, "seed": 1, "ml_task": "regression"} 61 | model = DecisionTreeRegressorAlgorithm(params) 62 | self.assertFalse(model.is_fitted()) 63 | model.fit(self.X, self.y) 64 | self.assertTrue(model.is_fitted()) 65 | -------------------------------------------------------------------------------- /tests/tests_algorithms/test_extra_trees.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.extra_trees import ( 9 | ExtraTreesAlgorithm, 10 | ExtraTreesRegressorAlgorithm, 11 | additional, 12 | regression_additional, 13 | ) 14 | from supervised.utils.metric import Metric 15 | 16 | additional["trees_in_step"] = 1 17 | regression_additional["trees_in_step"] = 1 18 | additional["max_steps"] = 1 19 | regression_additional["max_steps"] = 1 20 | 21 | 22 | class ExtraTreesRegressorAlgorithmTest(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.X, cls.y = datasets.make_regression( 26 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 27 | ) 28 | 29 | def test_reproduce_fit(self): 30 | metric = Metric({"name": "mse"}) 31 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"} 32 | prev_loss = None 33 | for _ in range(3): 34 | model = 
class ExtraTreesAlgorithmTest(unittest.TestCase):
    """Tests for the Extra Trees binary classifier."""

    @classmethod
    def setUpClass(cls):
        """Create one deterministic classification dataset for all tests."""
        cls.X, cls.y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )

    def test_reproduce_fit(self):
        """Refitting with the same seed gives the same loss."""
        metric = Metric({"name": "logloss"})
        params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"}
        prev_loss = None
        for _ in range(3):
            model = ExtraTreesAlgorithm(params)
            model.fit(self.X, self.y)
            y_predicted = model.predict(self.X)
            loss = metric(self.y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_fit_predict(self):
        """A fitted model reaches a logloss below 0.6 on its training data."""
        metric = Metric({"name": "logloss"})
        params = {"trees_in_step": 50, "ml_task": "binary_classification"}
        rf = ExtraTreesAlgorithm(params)

        rf.fit(self.X, self.y)
        y_predicted = rf.predict(self.X)
        self.assertTrue(metric(self.y, y_predicted) < 0.6)

    def test_copy(self):
        """copy() returns a model of the same type with equal predictions."""
        metric = Metric({"name": "logloss"})
        rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
        rf.fit(self.X, self.y)
        y_predicted = rf.predict(self.X)
        loss = metric(self.y, y_predicted)

        # A throwaway model used to be constructed here and then overwritten
        # straight away; only the copy is needed.
        rf2 = rf.copy()
        self.assertEqual(type(rf), type(rf2))
        y_predicted = rf2.predict(self.X)
        loss2 = metric(self.y, y_predicted)
        assert_almost_equal(loss, loss2)

    def test_save_and_load(self):
        """A saved and reloaded model predicts the same as the original."""
        metric = Metric({"name": "logloss"})
        rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
        rf.fit(self.X, self.y)
        y_predicted = rf.predict(self.X)
        loss = metric(self.y, y_predicted)

        # A temporary directory removes the model file even when save() or
        # load() raises; the previous os.remove call was skipped then.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, os.urandom(12).hex())
            rf.save(filename)
            rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
            rf2.load(filename)

        y_predicted = rf2.predict(self.X)
        loss2 = metric(self.y, y_predicted)
        assert_almost_equal(loss, loss2)

    def test_is_fitted(self):
        """is_fitted() is False before fit() and True after it."""
        params = {"trees_in_step": 50, "ml_task": "binary_classification"}
        model = ExtraTreesAlgorithm(params)
        self.assertFalse(model.is_fitted())
        model.fit(self.X, self.y)
        self.assertTrue(model.is_fitted())
class KNeighborsRegressorAlgorithmTest(unittest.TestCase):
    """Reproducibility test for the k-nearest-neighbours regressor."""

    @classmethod
    def setUpClass(cls):
        """Create one deterministic regression dataset for the test."""
        dataset_options = dict(
            n_samples=100,
            n_features=5,
            n_informative=4,
            shuffle=False,
            random_state=0,
        )
        cls.X, cls.y = datasets.make_regression(**dataset_options)

    def test_reproduce_fit(self):
        """Two fits with the same parameters give the same loss."""
        mse = Metric({"name": "mse"})
        params = {"seed": 1, "ml_task": "regression"}
        losses = []
        for _ in range(2):
            regressor = KNeighborsRegressorAlgorithm(params)
            regressor.fit(self.X, self.y)
            losses.append(mse(self.y, regressor.predict(self.X)))
            # Compare each loss with the one from the previous round.
            if len(losses) > 1:
                assert_almost_equal(losses[-2], losses[-1])


class KNeighborsAlgorithmTest(unittest.TestCase):
    """Tests for the k-nearest-neighbours binary classifier."""

    @classmethod
    def setUpClass(cls):
        """Create one deterministic classification dataset for all tests."""
        dataset_options = dict(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        cls.X, cls.y = datasets.make_classification(**dataset_options)

    def test_reproduce_fit(self):
        """Two fits with the same parameters give the same loss."""
        logloss = Metric({"name": "logloss"})
        params = {"seed": 1, "ml_task": "binary_classification"}
        losses = []
        for _ in range(2):
            classifier = KNeighborsAlgorithm(params)
            classifier.fit(self.X, self.y)
            losses.append(logloss(self.y, classifier.predict(self.X)))
            # Compare each loss with the one from the previous round.
            if len(losses) > 1:
                assert_almost_equal(losses[-2], losses[-1])

    def test_fit_predict(self):
        """A fitted model reaches a logloss below 0.6 on its training data."""
        logloss = Metric({"name": "logloss"})
        classifier = KNeighborsAlgorithm({"ml_task": "binary_classification"})

        classifier.fit(self.X, self.y)
        training_loss = logloss(self.y, classifier.predict(self.X))
        self.assertTrue(training_loss < 0.6)

    def test_is_fitted(self):
        """is_fitted() is False before fit() and True after it."""
        classifier = KNeighborsAlgorithm({"ml_task": "binary_classification"})
        self.assertFalse(classifier.is_fitted())
        classifier.fit(self.X, self.y)
        self.assertTrue(classifier.is_fitted())

    def test_classes_attribute(self):
        """After fit(), ``_classes`` equals the unique target labels."""
        classifier = KNeighborsAlgorithm({"ml_task": "binary_classification"})
        classifier.fit(self.X, self.y)

        # A missing attribute is treated as None, which fails the comparison.
        classes = getattr(classifier, "_classes", None)

        self.assertTrue(np.array_equal(np.unique(self.y), classes))
= Metric({"name": "logloss"}) 50 | params = {"seed": 1, "ml_task": "binary_classification"} 51 | prev_loss = None 52 | for _ in range(3): 53 | model = LinearAlgorithm(params) 54 | model.fit(self.X, self.y) 55 | y_predicted = model.predict(self.X) 56 | loss = metric(self.y, y_predicted) 57 | if prev_loss is not None: 58 | assert_almost_equal(prev_loss, loss) 59 | prev_loss = loss 60 | 61 | def test_fit_predict(self): 62 | metric = Metric({"name": "logloss"}) 63 | params = {"ml_task": "binary_classification"} 64 | la = LinearAlgorithm(params) 65 | 66 | la.fit(self.X, self.y) 67 | y_predicted = la.predict(self.X) 68 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 69 | 70 | def test_copy(self): 71 | metric = Metric({"name": "logloss"}) 72 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 73 | model.fit(self.X, self.y) 74 | y_predicted = model.predict(self.X) 75 | loss = metric(self.y, y_predicted) 76 | 77 | model2 = LinearAlgorithm({}) 78 | model2 = model.copy() 79 | self.assertEqual(type(model), type(model2)) 80 | y_predicted = model2.predict(self.X) 81 | loss2 = metric(self.y, y_predicted) 82 | assert_almost_equal(loss, loss2) 83 | 84 | def test_save_and_load(self): 85 | metric = Metric({"name": "logloss"}) 86 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 87 | model.fit(self.X, self.y) 88 | y_predicted = model.predict(self.X) 89 | loss = metric(self.y, y_predicted) 90 | 91 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 92 | 93 | model.save(filename) 94 | model2 = LinearAlgorithm({"ml_task": "binary_classification"}) 95 | model2.load(filename) 96 | # Finished with the file, delete it 97 | os.remove(filename) 98 | 99 | y_predicted = model2.predict(self.X) 100 | loss2 = metric(self.y, y_predicted) 101 | assert_almost_equal(loss, loss2) 102 | 103 | def test_is_fitted(self): 104 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 105 | self.assertFalse(model.is_fitted()) 106 | model.fit(self.X, 
self.y) 107 | self.assertTrue(model.is_fitted()) 108 | -------------------------------------------------------------------------------- /tests/tests_algorithms/test_random_forest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.random_forest import ( 9 | RandomForestAlgorithm, 10 | RandomForestRegressorAlgorithm, 11 | additional, 12 | regression_additional, 13 | ) 14 | from supervised.utils.metric import Metric 15 | 16 | additional["trees_in_step"] = 1 17 | regression_additional["trees_in_step"] = 1 18 | additional["max_steps"] = 1 19 | regression_additional["max_steps"] = 1 20 | 21 | 22 | class RandomForestRegressorAlgorithmTest(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.X, cls.y = datasets.make_regression( 26 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 27 | ) 28 | 29 | def test_reproduce_fit(self): 30 | metric = Metric({"name": "mse"}) 31 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"} 32 | prev_loss = None 33 | for _ in range(3): 34 | model = RandomForestRegressorAlgorithm(params) 35 | model.fit(self.X, self.y) 36 | y_predicted = model.predict(self.X) 37 | loss = metric(self.y, y_predicted) 38 | if prev_loss is not None: 39 | assert_almost_equal(prev_loss, loss) 40 | prev_loss = loss 41 | 42 | 43 | class RandomForestAlgorithmTest(unittest.TestCase): 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.X, cls.y = datasets.make_classification( 47 | n_samples=100, 48 | n_features=5, 49 | n_informative=4, 50 | n_redundant=1, 51 | n_classes=2, 52 | n_clusters_per_class=3, 53 | n_repeated=0, 54 | shuffle=False, 55 | random_state=0, 56 | ) 57 | 58 | def test_reproduce_fit(self): 59 | metric = Metric({"name": "logloss"}) 60 | params = {"trees_in_step": 1, "seed": 1, "ml_task": 
"binary_classification"} 61 | prev_loss = None 62 | for _ in range(3): 63 | model = RandomForestAlgorithm(params) 64 | model.fit(self.X, self.y) 65 | y_predicted = model.predict(self.X) 66 | loss = metric(self.y, y_predicted) 67 | if prev_loss is not None: 68 | assert_almost_equal(prev_loss, loss) 69 | prev_loss = loss 70 | 71 | def test_fit_predict(self): 72 | metric = Metric({"name": "logloss"}) 73 | params = {"ml_task": "binary_classification"} 74 | rf = RandomForestAlgorithm(params) 75 | 76 | rf.fit(self.X, self.y) 77 | y_predicted = rf.predict(self.X) 78 | self.assertTrue(metric(self.y, y_predicted) < 1.5) 79 | 80 | def test_copy(self): 81 | metric = Metric({"name": "logloss"}) 82 | rf = RandomForestAlgorithm({"ml_task": "binary_classification"}) 83 | rf.fit(self.X, self.y) 84 | y_predicted = rf.predict(self.X) 85 | loss = metric(self.y, y_predicted) 86 | 87 | rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"}) 88 | rf2 = rf.copy() 89 | self.assertEqual(type(rf), type(rf2)) 90 | y_predicted = rf2.predict(self.X) 91 | loss2 = metric(self.y, y_predicted) 92 | assert_almost_equal(loss, loss2) 93 | 94 | def test_save_and_load(self): 95 | metric = Metric({"name": "logloss"}) 96 | rf = RandomForestAlgorithm({"ml_task": "binary_classification"}) 97 | rf.fit(self.X, self.y) 98 | y_predicted = rf.predict(self.X) 99 | loss = metric(self.y, y_predicted) 100 | 101 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 102 | 103 | rf.save(filename) 104 | rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"}) 105 | rf2.load(filename) 106 | # Finished with the file, delete it 107 | os.remove(filename) 108 | 109 | y_predicted = rf2.predict(self.X) 110 | loss2 = metric(self.y, y_predicted) 111 | assert_almost_equal(loss, loss2) 112 | 113 | def test_is_fitted(self): 114 | model = RandomForestAlgorithm({"ml_task": "binary_classification"}) 115 | self.assertFalse(model.is_fitted()) 116 | model.fit(self.X, self.y) 117 | 
self.assertTrue(model.is_fitted()) 118 | -------------------------------------------------------------------------------- /tests/tests_algorithms/test_registry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from supervised.algorithms.registry import AlgorithmsRegistry 4 | 5 | 6 | class AlgorithmsRegistryTest(unittest.TestCase): 7 | def test_add_to_registry(self): 8 | class Model1: 9 | algorithm_short_name = "" 10 | 11 | model1 = { 12 | "task_name": "binary_classification", 13 | "model_class": Model1, 14 | "model_params": {}, 15 | "required_preprocessing": {}, 16 | "additional": {}, 17 | "default_params": {}, 18 | } 19 | AlgorithmsRegistry.add(**model1) 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /tests/tests_automl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_automl/__init__.py -------------------------------------------------------------------------------- /tests/tests_automl/test_adjust_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import numpy as np 6 | 7 | from supervised import AutoML 8 | 9 | 10 | class AutoMLAdjustValidationTest(unittest.TestCase): 11 | automl_dir = "automl_testing" 12 | 13 | def tearDown(self): 14 | shutil.rmtree(self.automl_dir, ignore_errors=True) 15 | 16 | def test_custom_init(self): 17 | X = np.random.uniform(size=(60, 2)) 18 | y = np.random.randint(0, 2, size=(60,)) 19 | 20 | automl = AutoML( 21 | results_path=self.automl_dir, 22 | model_time_limit=10, 23 | algorithms=["Xgboost"], 24 | mode="Compete", 25 | explain_level=0, 26 | start_random_models=1, 27 | hill_climbing_steps=0, 28 | 
top_models_to_improve=0, 29 | kmeans_features=False, 30 | golden_features=False, 31 | features_selection=False, 32 | boost_on_errors=False, 33 | ) 34 | automl.fit(X, y) 35 | 36 | self.assertFalse( 37 | os.path.exists(os.path.join(self.automl_dir, "1_DecisionTree")) 38 | ) 39 | -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_init.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | 6 | from supervised import AutoML 7 | 8 | 9 | class AutoMLInitTest(unittest.TestCase): 10 | automl_dir = "AutoMLInitTest" 11 | 12 | def tearDown(self): 13 | shutil.rmtree(self.automl_dir, ignore_errors=True) 14 | 15 | def test_custom_init(self): 16 | X = np.random.uniform(size=(30, 2)) 17 | y = np.random.randint(0, 2, size=(30,)) 18 | 19 | automl = AutoML( 20 | results_path=self.automl_dir, 21 | model_time_limit=1, 22 | algorithms=["Xgboost"], 23 | explain_level=0, 24 | train_ensemble=False, 25 | stack_models=False, 26 | validation_strategy={"validation_type": "split"}, 27 | start_random_models=3, 28 | hill_climbing_steps=1, 29 | top_models_to_improve=1, 30 | ) 31 | 32 | automl.fit(X, y) 33 | self.assertGreater(len(automl._models), 3) 34 | 35 | def test_get_results_path(self): 36 | automl = AutoML(algorithms=["Baseline"], total_time_limit=1) 37 | first_path = automl._get_results_path() 38 | self.assertEqual(first_path, automl._get_results_path()) 39 | shutil.rmtree(first_path, ignore_errors=True) 40 | 41 | automl = AutoML( 42 | algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir 43 | ) 44 | self.assertEqual(self.automl_dir, automl._get_results_path()) 45 | shutil.rmtree(self.automl_dir, ignore_errors=True) 46 | 47 | # get results path after save 48 | automl = AutoML( 49 | algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir 50 | ) 51 | X = np.random.uniform(size=(30, 2)) 52 | y = 
np.random.randint(0, 2, size=(30,)) 53 | automl.fit(X, y) 54 | self.assertEqual(self.automl_dir, automl._get_results_path()) 55 | 56 | automl2 = AutoML( 57 | algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir 58 | ) 59 | self.assertEqual(self.automl_dir, automl2._get_results_path()) 60 | -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_report.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from sklearn import datasets 10 | from sklearn.decomposition import PCA 11 | from sklearn.pipeline import make_pipeline 12 | 13 | from supervised import AutoML 14 | from supervised.exceptions import AutoMLException 15 | 16 | iris = datasets.load_iris() 17 | 18 | class AutoMLReportTest(unittest.TestCase): 19 | automl_dir = "AutoMLTest" 20 | 21 | def tearDown(self): 22 | shutil.rmtree(self.automl_dir, ignore_errors=True) 23 | 24 | def setUp(self): 25 | shutil.rmtree(self.automl_dir, ignore_errors=True) 26 | 27 | def test_report(self): 28 | """Tests AutoML in the iris dataset (Multiclass classification)""" 29 | model = AutoML( 30 | algorithms=["Baseline"], 31 | explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir 32 | ) 33 | model.fit(iris.data, iris.target) 34 | model.report() 35 | 36 | report_path = os.path.join(self.automl_dir, "README.html") 37 | self.assertTrue(os.path.exists(report_path)) 38 | 39 | content = None 40 | with open(report_path, "r") as fin: 41 | content = fin.read() 42 | 43 | 44 | #print(content) 45 | link = '' 46 | self.assertFalse(link in content) 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_sample_weight.py: 
-------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | 10 | iris = datasets.load_iris() 11 | housing = datasets.fetch_california_housing() 12 | # limit data size for faster tests 13 | housing.data = housing.data[:500] 14 | housing.target = housing.target[:500] 15 | breast_cancer = datasets.load_breast_cancer() 16 | 17 | 18 | class AutoMLSampleWeightTest(unittest.TestCase): 19 | automl_dir = "AutoMLSampleWeightTest" 20 | 21 | def tearDown(self): 22 | shutil.rmtree(self.automl_dir, ignore_errors=True) 23 | 24 | def test_iris_dataset_sample_weight(self): 25 | """Tests AutoML in the iris dataset (Multiclass classification) 26 | without and with sample weight""" 27 | model = AutoML( 28 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 29 | ) 30 | score_1 = model.fit(iris.data, iris.target).score(iris.data, iris.target) 31 | self.assertGreater(score_1, 0.5) 32 | 33 | shutil.rmtree(self.automl_dir, ignore_errors=True) 34 | model = AutoML( 35 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 36 | ) 37 | sample_weight = np.ones(iris.data.shape[0]) 38 | score_2 = model.fit(iris.data, iris.target, sample_weight=sample_weight).score( 39 | iris.data, iris.target, sample_weight=sample_weight 40 | ) 41 | assert_almost_equal(score_1, score_2) 42 | 43 | def test_housing_dataset(self): 44 | """Tests AutoML in the housing dataset (Regression) 45 | without and with sample weight""" 46 | model = AutoML( 47 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 48 | ) 49 | score_1 = model.fit(housing.data, housing.target).score( 50 | housing.data, housing.target 51 | ) 52 | self.assertGreater(score_1, 0.5) 53 | 54 | shutil.rmtree(self.automl_dir, ignore_errors=True) 55 | model = AutoML( 56 | 
explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 57 | ) 58 | sample_weight = np.ones(housing.data.shape[0]) 59 | score_2 = model.fit( 60 | housing.data, housing.target, sample_weight=sample_weight 61 | ).score(housing.data, housing.target, sample_weight=sample_weight) 62 | assert_almost_equal(score_1, score_2) 63 | 64 | def test_breast_cancer_dataset(self): 65 | """Tests AutoML in the breast cancer (binary classification) 66 | without and with sample weight""" 67 | model = AutoML( 68 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 69 | ) 70 | score_1 = model.fit(breast_cancer.data, breast_cancer.target).score( 71 | breast_cancer.data, breast_cancer.target 72 | ) 73 | self.assertGreater(score_1, 0.5) 74 | 75 | shutil.rmtree(self.automl_dir, ignore_errors=True) 76 | model = AutoML( 77 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 78 | ) 79 | sample_weight = np.ones(breast_cancer.data.shape[0]) 80 | score_2 = model.fit( 81 | breast_cancer.data, breast_cancer.target, sample_weight=sample_weight 82 | ).score(breast_cancer.data, breast_cancer.target, sample_weight=sample_weight) 83 | assert_almost_equal(score_1, score_2) 84 | -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_time_constraints.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import time 3 | import unittest 4 | 5 | from supervised import AutoML 6 | from supervised.tuner.time_controller import TimeController 7 | 8 | 9 | class AutoMLTimeConstraintsTest(unittest.TestCase): 10 | automl_dir = "automl_tests" 11 | 12 | def tearDown(self): 13 | shutil.rmtree(self.automl_dir, ignore_errors=True) 14 | 15 | def test_set_total_time_limit(self): 16 | model_type = "Xgboost" 17 | automl = AutoML( 18 | results_path=self.automl_dir, total_time_limit=100, algorithms=[model_type] 19 | ) 20 | 21 | automl._time_ctrl = 
TimeController( 22 | time.time(), 100, None, ["simple_algorithms", "not_so_random"], "Xgboost" 23 | ) 24 | 25 | time_spend = 0 26 | for i in range(12): 27 | automl._start_time -= 10 28 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 29 | if automl._time_ctrl.enough_time(model_type, "not_so_random"): 30 | time_spend += 10 31 | 32 | self.assertTrue(time_spend < 100) 33 | 34 | def test_set_model_time_limit(self): 35 | model_type = "Xgboost" 36 | automl = AutoML( 37 | results_path=self.automl_dir, model_time_limit=10, algorithms=[model_type] 38 | ) 39 | automl._time_ctrl = TimeController( 40 | time.time(), None, 10, ["simple_algorithms", "not_so_random"], "Xgboost" 41 | ) 42 | 43 | for i in range(12): 44 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 45 | # should be always true 46 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 47 | 48 | def test_set_model_time_limit_omit_total_time(self): 49 | model_type = "Xgboost" 50 | automl = AutoML( 51 | results_path=self.automl_dir, 52 | total_time_limit=10, 53 | model_time_limit=10, 54 | algorithms=[model_type], 55 | ) 56 | automl._time_ctrl = TimeController( 57 | time.time(), 10, 10, ["simple_algorithms", "not_so_random"], "Xgboost" 58 | ) 59 | 60 | for i in range(12): 61 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 62 | # should be always true 63 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 64 | 65 | def test_enough_time_to_train(self): 66 | model_type = "Xgboost" 67 | model_type_2 = "LightGBM" 68 | 69 | model_type = "Xgboost" 70 | automl = AutoML( 71 | results_path=self.automl_dir, 72 | total_time_limit=10, 73 | model_time_limit=10, 74 | algorithms=[model_type, model_type_2], 75 | ) 76 | automl._time_ctrl = TimeController( 77 | time.time(), 78 | 10, 79 | 10, 80 | ["simple_algorithms", "not_so_random"], 81 | [model_type, model_type_2], 82 | ) 83 | 84 | for i in range(5): 
85 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 1) 86 | # should be always true 87 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 88 | 89 | for i in range(5): 90 | automl._time_ctrl.log_time( 91 | f"LightGBM_{i}", model_type_2, "not_so_random", 1 92 | ) 93 | # should be always true 94 | self.assertTrue( 95 | automl._time_ctrl.enough_time(model_type_2, "not_so_random") 96 | ) 97 | 98 | def test_expected_learners_cnt(self): 99 | automl = AutoML(results_path=self.automl_dir) 100 | automl._validation_strategy = {"k_folds": 7, "repeats": 6} 101 | self.assertEqual(automl._expected_learners_cnt(), 42) 102 | 103 | automl._validation_strategy = {"k_folds": 7} 104 | self.assertEqual(automl._expected_learners_cnt(), 7) 105 | automl._validation_strategy = {} 106 | self.assertEqual(automl._expected_learners_cnt(), 1) 107 | -------------------------------------------------------------------------------- /tests/tests_automl/test_data_types.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | 9 | 10 | class AutoMLDataTypesTest(unittest.TestCase): 11 | automl_dir = "automl_tests" 12 | rows = 250 13 | 14 | def tearDown(self): 15 | shutil.rmtree(self.automl_dir, ignore_errors=True) 16 | 17 | def test_category_data_type(self): 18 | X = np.random.rand(self.rows, 3) 19 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 20 | y = np.random.randint(0, 2, self.rows) 21 | 22 | X["f1"] = X["f1"].astype("category") 23 | 24 | automl = AutoML( 25 | results_path=self.automl_dir, 26 | total_time_limit=1, 27 | algorithms=["CatBoost"], 28 | train_ensemble=False, 29 | explain_level=0, 30 | start_random_models=1, 31 | ) 32 | automl.fit(X, y) 33 | 34 | def test_encoding_strange_characters(self): 35 | X = np.random.rand(self.rows, 3) 36 | X = pd.DataFrame(X, columns=[f"f{i}" 
for i in range(3)]) 37 | y = np.random.permutation(["ɛ", "🂲"] * int(self.rows / 2)) 38 | 39 | automl = AutoML( 40 | results_path=self.automl_dir, 41 | total_time_limit=1, 42 | algorithms=["Baseline"], 43 | train_ensemble=False, 44 | explain_level=0, 45 | start_random_models=1, 46 | ) 47 | automl.fit(X, y) 48 | -------------------------------------------------------------------------------- /tests/tests_automl/test_dir_change.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | from sklearn import datasets 8 | 9 | from supervised import AutoML 10 | 11 | 12 | class AutoMLDirChangeTest(unittest.TestCase): 13 | automl_dir_a = "automl_testing_A" 14 | automl_dir_b = "automl_testing_B" 15 | automl_dir = "automl_testing" 16 | 17 | def tearDown(self): 18 | shutil.rmtree(self.automl_dir_a, ignore_errors=True) 19 | shutil.rmtree(self.automl_dir_b, ignore_errors=True) 20 | 21 | def create_dir(self, dir_path): 22 | if not os.path.exists(dir_path): 23 | try: 24 | os.mkdir(dir_path) 25 | except Exception as e: 26 | pass 27 | 28 | def test_create_report_after_dir_change(self): 29 | # 30 | # test for https://github.com/mljar/mljar-supervised/issues/384 31 | # 32 | self.create_dir(self.automl_dir_a) 33 | self.create_dir(self.automl_dir_b) 34 | 35 | path_a = os.path.join(self.automl_dir_a, self.automl_dir) 36 | path_b = os.path.join(self.automl_dir_b, self.automl_dir) 37 | 38 | X = np.random.uniform(size=(30, 2)) 39 | y = np.random.randint(0, 2, size=(30,)) 40 | 41 | automl = AutoML(results_path=path_a, algorithms=["Baseline"], explain_level=0) 42 | automl.fit(X, y) 43 | 44 | shutil.move(path_a, path_b) 45 | 46 | automl2 = AutoML( 47 | results_path=path_b, 48 | ) 49 | automl2.report() 50 | 51 | def test_compute_predictions_after_dir_change(self): 52 | # 53 | # test for https://github.com/mljar/mljar-supervised/issues/384 
54 | # 55 | self.create_dir(self.automl_dir_a) 56 | self.create_dir(self.automl_dir_b) 57 | 58 | path_a = os.path.join(self.automl_dir_a, self.automl_dir) 59 | path_b = os.path.join(self.automl_dir_b, self.automl_dir) 60 | 61 | X, y = datasets.make_regression( 62 | n_samples=100, 63 | n_features=5, 64 | n_informative=4, 65 | n_targets=1, 66 | shuffle=False, 67 | random_state=0, 68 | ) 69 | 70 | automl = AutoML( 71 | results_path=path_a, 72 | explain_level=0, 73 | ml_task="regression", 74 | total_time_limit=10, 75 | ) 76 | automl.fit(X, y) 77 | p = automl.predict(X[:3]) 78 | 79 | shutil.move(path_a, path_b) 80 | 81 | automl2 = AutoML( 82 | results_path=path_b, 83 | ) 84 | p2 = automl2.predict(X[:3]) 85 | 86 | for i in range(3): 87 | assert_almost_equal(p[i], p2[i]) 88 | -------------------------------------------------------------------------------- /tests/tests_automl/test_golden_features.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import unittest 5 | 6 | import pandas as pd 7 | from sklearn import datasets 8 | 9 | from supervised import AutoML 10 | 11 | 12 | class AutoMLGoldenFeaturesTest(unittest.TestCase): 13 | automl_dir = "automl_tests" 14 | rows = 50 15 | 16 | def tearDown(self): 17 | shutil.rmtree(self.automl_dir, ignore_errors=True) 18 | 19 | def test_no_golden_features(self): 20 | N_COLS = 10 21 | X, y = datasets.make_classification( 22 | n_samples=100, 23 | n_features=N_COLS, 24 | n_informative=6, 25 | n_redundant=1, 26 | n_classes=2, 27 | n_clusters_per_class=3, 28 | n_repeated=0, 29 | shuffle=False, 30 | random_state=0, 31 | ) 32 | 33 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 34 | 35 | automl = AutoML( 36 | results_path=self.automl_dir, 37 | total_time_limit=50, 38 | algorithms=["Xgboost"], 39 | train_ensemble=False, 40 | golden_features=False, 41 | explain_level=0, 42 | start_random_models=1, 43 | ) 44 | automl.fit(X, y) 45 | 46 | 
self.assertEqual(len(automl._models), 1) 47 | 48 | def test_golden_features(self): 49 | N_COLS = 10 50 | X, y = datasets.make_classification( 51 | n_samples=100, 52 | n_features=N_COLS, 53 | n_informative=6, 54 | n_redundant=1, 55 | n_classes=2, 56 | n_clusters_per_class=3, 57 | n_repeated=0, 58 | shuffle=False, 59 | random_state=0, 60 | ) 61 | 62 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 63 | 64 | automl = AutoML( 65 | results_path=self.automl_dir, 66 | total_time_limit=50, 67 | algorithms=["Xgboost"], 68 | train_ensemble=False, 69 | golden_features=True, 70 | explain_level=0, 71 | start_random_models=1, 72 | ) 73 | automl.fit(X, y) 74 | 75 | self.assertEqual(len(automl._models), 2) 76 | 77 | # there should be 10 golden features 78 | with open(os.path.join(self.automl_dir, "golden_features.json")) as fin: 79 | d = json.loads(fin.read()) 80 | self.assertEqual(len(d["new_features"]), 10) 81 | 82 | def test_golden_features_count(self): 83 | N_COLS = 10 84 | X, y = datasets.make_classification( 85 | n_samples=100, 86 | n_features=N_COLS, 87 | n_informative=6, 88 | n_redundant=1, 89 | n_classes=2, 90 | n_clusters_per_class=3, 91 | n_repeated=0, 92 | shuffle=False, 93 | random_state=0, 94 | ) 95 | 96 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 97 | 98 | automl = AutoML( 99 | results_path=self.automl_dir, 100 | total_time_limit=50, 101 | algorithms=["Xgboost"], 102 | train_ensemble=False, 103 | golden_features=50, 104 | explain_level=0, 105 | start_random_models=1, 106 | ) 107 | automl.fit(X, y) 108 | 109 | self.assertEqual(len(automl._models), 2) 110 | 111 | # there should be 50 golden features 112 | with open(os.path.join(self.automl_dir, "golden_features.json")) as fin: 113 | d = json.loads(fin.read()) 114 | self.assertEqual(len(d["new_features"]), 50) 115 | -------------------------------------------------------------------------------- /tests/tests_automl/test_handle_imbalance.py: 
-------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | from supervised.algorithms.random_forest import additional 9 | from supervised.algorithms.registry import MULTICLASS_CLASSIFICATION 10 | 11 | additional["max_steps"] = 1 12 | additional["trees_in_step"] = 1 13 | 14 | from supervised.algorithms.xgboost import additional 15 | 16 | additional["max_rounds"] = 1 17 | 18 | 19 | class AutoMLHandleImbalanceTest(unittest.TestCase): 20 | automl_dir = "AutoMLHandleImbalanceTest" 21 | 22 | def tearDown(self): 23 | shutil.rmtree(self.automl_dir, ignore_errors=True) 24 | 25 | def test_handle_drastic_imbalance(self): 26 | a = AutoML( 27 | results_path=self.automl_dir, 28 | total_time_limit=10, 29 | algorithms=["Random Forest"], 30 | train_ensemble=False, 31 | validation_strategy={ 32 | "validation_type": "kfold", 33 | "k_folds": 10, 34 | "shuffle": True, 35 | "stratify": True, 36 | }, 37 | start_random_models=1, 38 | ) 39 | 40 | rows = 100 41 | X = pd.DataFrame( 42 | { 43 | "f1": np.random.rand(rows), 44 | "f2": np.random.rand(rows), 45 | "f3": np.random.rand(rows), 46 | } 47 | ) 48 | y = np.ones(rows) 49 | 50 | y[:8] = 0 51 | y[10:12] = 2 52 | y = pd.Series(np.array(y), name="target") 53 | a._ml_task = MULTICLASS_CLASSIFICATION 54 | a._handle_drastic_imbalance(X, y) 55 | 56 | self.assertEqual(X.shape[0], 130) 57 | self.assertEqual(X.shape[1], 3) 58 | self.assertEqual(y.shape[0], 130) 59 | 60 | def test_handle_drastic_imbalance_sample_weight(self): 61 | a = AutoML( 62 | results_path=self.automl_dir, 63 | total_time_limit=10, 64 | algorithms=["Random Forest"], 65 | train_ensemble=False, 66 | validation_strategy={ 67 | "validation_type": "kfold", 68 | "k_folds": 10, 69 | "shuffle": True, 70 | "stratify": True, 71 | }, 72 | start_random_models=1, 73 | ) 74 | 75 | rows = 100 76 | X = pd.DataFrame( 77 | { 78 | "f1": 
np.random.rand(rows), 79 | "f2": np.random.rand(rows), 80 | "f3": np.random.rand(rows), 81 | } 82 | ) 83 | y = np.ones(rows) 84 | sample_weight = pd.Series(np.array(range(rows)), name="sample_weight") 85 | 86 | y[:1] = 0 87 | y[10:11] = 2 88 | 89 | y = pd.Series(np.array(y), name="target") 90 | a._ml_task = MULTICLASS_CLASSIFICATION 91 | a._handle_drastic_imbalance(X, y, sample_weight) 92 | 93 | self.assertEqual(X.shape[0], 138) 94 | self.assertEqual(X.shape[1], 3) 95 | self.assertEqual(y.shape[0], 138) 96 | 97 | self.assertEqual(np.sum(sample_weight[100:119]), 0) 98 | self.assertEqual(np.sum(sample_weight[119:138]), 19 * 10) 99 | 100 | def test_imbalance_dont_change_data_after_fit(self): 101 | a = AutoML( 102 | results_path=self.automl_dir, 103 | total_time_limit=5, 104 | train_ensemble=False, 105 | validation_strategy={ 106 | "validation_type": "kfold", 107 | "k_folds": 10, 108 | "shuffle": True, 109 | "stratify": True, 110 | }, 111 | start_random_models=1, 112 | explain_level=0, 113 | ) 114 | 115 | rows = 100 116 | X = pd.DataFrame( 117 | { 118 | "f1": np.random.rand(rows), 119 | "f2": np.random.rand(rows), 120 | "f3": np.random.rand(rows), 121 | } 122 | ) 123 | y = np.ones(rows) 124 | 125 | y[:8] = 0 126 | y[10:12] = 2 127 | sample_weight = np.ones(rows) 128 | 129 | a.fit(X, y, sample_weight=sample_weight) 130 | 131 | # original data **without** inserted samples to handle imbalance 132 | self.assertEqual(X.shape[0], rows) 133 | self.assertEqual(y.shape[0], rows) 134 | self.assertEqual(sample_weight.shape[0], rows) 135 | -------------------------------------------------------------------------------- /tests/tests_automl/test_joblib_version.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import unittest 5 | 6 | import joblib 7 | import numpy as np 8 | 9 | from supervised import AutoML 10 | from supervised.exceptions import AutoMLException 11 | 12 | 13 | class 
TestJoblibVersion(unittest.TestCase): 14 | automl_dir = "TestJoblibVersion" 15 | 16 | def tearDown(self): 17 | shutil.rmtree(self.automl_dir, ignore_errors=True) 18 | 19 | def test_joblib_good_version(self): 20 | X = np.random.uniform(size=(60, 2)) 21 | y = np.random.randint(0, 2, size=(60,)) 22 | 23 | automl = AutoML( 24 | results_path=self.automl_dir, 25 | model_time_limit=10, 26 | algorithms=["Xgboost"], 27 | mode="Explain", 28 | explain_level=0, 29 | start_random_models=1, 30 | hill_climbing_steps=0, 31 | top_models_to_improve=0, 32 | kmeans_features=False, 33 | golden_features=False, 34 | features_selection=False, 35 | boost_on_errors=False, 36 | ) 37 | automl.fit(X, y) 38 | 39 | # Test if joblib is in json 40 | json_path = os.path.join(self.automl_dir, "1_Default_Xgboost", "framework.json") 41 | 42 | with open(json_path) as file: 43 | frame = json.load(file) 44 | 45 | json_version = frame["joblib_version"] 46 | expected_result = joblib.__version__ 47 | 48 | self.assertEqual(expected_result, json_version) 49 | 50 | def test_joblib_wrong_version(self): 51 | X = np.random.uniform(size=(60, 2)) 52 | y = np.random.randint(0, 2, size=(60,)) 53 | 54 | automl = AutoML( 55 | results_path=self.automl_dir, 56 | model_time_limit=10, 57 | algorithms=["Xgboost"], 58 | mode="Explain", 59 | explain_level=0, 60 | start_random_models=1, 61 | hill_climbing_steps=0, 62 | top_models_to_improve=0, 63 | kmeans_features=False, 64 | golden_features=False, 65 | features_selection=False, 66 | boost_on_errors=False, 67 | ) 68 | automl.fit(X, y) 69 | 70 | json_path = os.path.join(self.automl_dir, "1_Default_Xgboost", "framework.json") 71 | 72 | with open(json_path) as file: 73 | frame = json.load(file) 74 | 75 | # Injection of wrong joblib version 76 | frame["joblib_version"] = "0.2.0" 77 | 78 | with open(json_path, "w") as file: 79 | json.dump(frame, file) 80 | 81 | with self.assertRaises(AutoMLException): 82 | automl_2 = AutoML(results_path=self.automl_dir) 83 | automl_2.predict(X) 84 
class AutoMLModelsNeededForPredictTest(unittest.TestCase):
    """Tests for ``AutoML.models_needed_on_predict``."""

    def test_models_needed_on_predict(self):
        """Check the models reported for plain, ensemble and stacked models.

        A results directory is faked in a temporary folder: ``params.json``
        lists the saved and stacked models, and each ensemble folder holds an
        ``ensemble.json`` naming its selected models.
        """
        automl_params = {
            "saved": [
                "model_1",
                "model_2",
                "model_3",
                "unused_model",
                "Ensemble",
                "model_4_Stacked",
                "Stacked_Ensemble",
            ],
            "stacked": ["Ensemble", "model_1", "model_2"],
        }
        ensemble_members = {
            "Ensemble": ["model_2", "model_3"],
            "Stacked_Ensemble": ["Ensemble", "model_4_Stacked"],
        }
        # Exact set of models expected for every queried model name.
        expected_models = {
            "model_1": ["model_1"],
            "model_3": ["model_3"],
            "Ensemble": ["model_2", "model_3", "Ensemble"],
            "model_4_Stacked": [
                "model_1",
                "model_2",
                "model_3",
                "Ensemble",
                "model_4_Stacked",
            ],
            "Stacked_Ensemble": [
                "model_1",
                "model_2",
                "model_3",
                "Ensemble",
                "model_4_Stacked",
                "Stacked_Ensemble",
            ],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, "params.json"), "w") as fout:
                json.dump(automl_params, fout)
            for ensemble_name, members in ensemble_members.items():
                os.mkdir(os.path.join(tmpdir, ensemble_name))
                ensemble_file = os.path.join(tmpdir, ensemble_name, "ensemble.json")
                with open(ensemble_file, "w") as fout:
                    json.dump(
                        {"selected_models": [{"model": name} for name in members]},
                        fout,
                    )

            automl = AutoML(results_path=tmpdir)

            # A model that is not listed as saved must be rejected.
            with self.assertRaises(AutoMLException):
                automl.models_needed_on_predict("missing_model")

            for model_name, needed in expected_models.items():
                with self.subTest(model=model_name):
                    # Same check as "every name is present and the length
                    # matches", but a failure prints both collections.
                    self.assertCountEqual(
                        automl.models_needed_on_predict(model_name), needed
                    )
class AutoMLRepeatedValidationTest(unittest.TestCase):
    """Check the learner files written when validation is repeated."""

    automl_dir = "AutoMLRepeatedValidationTest"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _fit_and_count_learner_files(self, validation_strategy, repeats, folds):
        """Fit a Random Forest and check files of every (repeat, fold) learner.

        Returns the number of (repeat, fold) pairs whose model file and
        training log were both found in the ``1_Default_RandomForest`` folder.
        """
        a = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            algorithms=["Random Forest"],
            train_ensemble=False,
            validation_strategy=validation_strategy,
            start_random_models=1,
        )

        X, y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

        a.fit(X, y)

        result_files = os.listdir(
            os.path.join(self.automl_dir, "1_Default_RandomForest")
        )

        cnt = 0
        for repeat in range(repeats):
            for fold in range(folds):
                learner_name = construct_learner_name(fold, repeat, repeats)
                self.assertIn(f"{learner_name}.random_forest", result_files)
                self.assertIn(f"{learner_name}_training.log", result_files)
                cnt += 1
        return cnt

    def test_repeated_kfold(self):
        REPEATS = 3
        FOLDS = 2

        cnt = self._fit_and_count_learner_files(
            {
                "validation_type": "kfold",
                "k_folds": FOLDS,
                "repeats": REPEATS,
                "shuffle": True,
                "stratify": True,
            },
            REPEATS,
            FOLDS,
        )
        # assertTrue(cnt, 6) treated 6 as the failure message and always
        # passed; compare the values instead.
        self.assertEqual(cnt, 6)

    def test_repeated_split(self):
        REPEATS = 3
        FOLDS = 1

        cnt = self._fit_and_count_learner_files(
            {
                "validation_type": "split",
                "repeats": REPEATS,
                "shuffle": True,
                "stratify": True,
            },
            REPEATS,
            FOLDS,
        )
        # Same fix as above: compare, do not pass 3 as a message.
        self.assertEqual(cnt, 3)
class AutoMLRestoreTest(unittest.TestCase):
    """A second fit on an existing results directory must not add models."""

    automl_dir = "automl_tests"
    rows = 50

    def tearDown(self):
        # Best-effort cleanup; a missing directory is not an error.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_tune_only_default(self):
        def count_model_dirs():
            # Entries whose name starts with a digit are counted as models.
            return len(
                [entry for entry in os.listdir(self.automl_dir) if entry[0].isdigit()]
            )

        features = pd.DataFrame(
            np.random.rand(self.rows, 3), columns=[f"f{i}" for i in range(3)]
        )
        target = np.random.randint(0, 2, self.rows)

        shared_kwargs = dict(
            results_path=self.automl_dir,
            total_time_limit=3,
            explain_level=0,
            train_ensemble=False,
        )

        AutoML(algorithms=["Decision Tree"], **shared_kwargs).fit(features, target)
        models_after_first_fit = count_model_dirs()

        # Rewind the saved progress to the "default_algorithms" fit level.
        progress_path = os.path.join(self.automl_dir, "progress.json")
        with open(progress_path, "r") as fin:
            progress = json.load(fin)
        progress["fit_level"] = "default_algorithms"
        with open(progress_path, "w") as fout:
            json.dump(progress, fout, indent=4)

        # Second run asks for one more algorithm on the same directory.
        AutoML(algorithms=["Decision Tree", "Xgboost"], **shared_kwargs).fit(
            features, target
        )

        # number of models should be equal
        # user cannot overwrite parameters
        self.assertEqual(count_model_dirs(), models_after_first_fit)
class AutoMLStackModelsConstraintsTest(unittest.TestCase):
    """Check when model stacking ends up enabled in "Compete" mode."""

    automl_dir = "AutoMLStackModelsConstraintsTest"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _fit_compete(self, **extra_kwargs):
        """Fit AutoML in Compete mode (5 s limit) on data equal to the target."""
        features = np.random.uniform(size=(100, 2))
        target = np.random.randint(0, 2, size=(100,))
        # Both columns are overwritten with the target and its negation.
        features[:, 0] = target
        features[:, 1] = -target

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=5,
            mode="Compete",
            **extra_kwargs,
        )
        automl.fit(features, target)
        return automl

    def test_allow_stack_models(self):
        fitted = self._fit_compete(
            validation_strategy={"validation_type": "kfold", "k_folds": 5}
        )
        self.assertTrue(fitted._stack_models)
        self.assertTrue(fitted.tuner._stack_models)
        self.assertTrue(fitted._time_ctrl._is_stacking)

    def test_disable_stack_models(self):
        fitted = self._fit_compete(validation_strategy={"validation_type": "split"})
        self.assertFalse(fitted._stack_models)
        self.assertFalse(fitted.tuner._stack_models)
        self.assertFalse(fitted._time_ctrl._is_stacking)

    def test_disable_stack_models_adjusted_validation(self):
        fitted = self._fit_compete()
        # the stacking should be disabled
        # because of small time limit
        self.assertFalse(fitted._stack_models)
        self.assertFalse(fitted.tuner._stack_models)
        self.assertFalse(fitted._time_ctrl._is_stacking)
class TotalTimeConstraintTest(unittest.TestCase):
    """Tests for the ``TotalTimeConstraint`` callback."""

    @staticmethod
    def _make_callback(expected_learners_cnt):
        """Create a callback with a 100 s total limit that starts counting now."""
        return TotalTimeConstraint(
            {
                "total_time_limit": 100,
                "total_time_start": time.time(),
                "expected_learners_cnt": expected_learners_cnt,
            }
        )

    def test_stop_on_first_learner(self):
        callback = self._make_callback(expected_learners_cnt=1001)
        callback.add_and_set_learner(learner={})
        callback.on_learner_train_start(logs=None)
        time.sleep(0.1)
        with self.assertRaises(NotTrainedException) as context:
            callback.on_learner_train_end(logs=None)
        self.assertIn("Stop training after the first fold", str(context.exception))

    def test_stop_on_not_first_learner(self):
        callback = self._make_callback(expected_learners_cnt=10)
        # The first learner finishes without an exception.
        callback.add_and_set_learner(learner={})
        callback.on_learner_train_start(logs=None)
        callback.on_learner_train_end(logs=None)
        with self.assertRaises(NotTrainedException) as context:
            #
            # hardcoded change just for tests!
            callback.total_time_start = time.time() - 600 - 100 - 1
            #
            callback.add_and_set_learner(learner={})
            callback.on_learner_train_start(logs=None)
            callback.on_learner_train_end(logs=None)
        self.assertIn("Force to stop", str(context.exception))

    def test_dont_stop(self):
        callback = self._make_callback(expected_learners_cnt=10)

        # Ten learners in a row must pass without any exception.
        for _ in range(10):
            callback.add_and_set_learner(learner={})
            callback.on_learner_train_start(logs=None)
            callback.on_learner_train_end(logs=None)
class FairnessInMultiClassClassificationTest(unittest.TestCase):
    """Fairness outputs of AutoML for a three-class target."""

    automl_dir = "automl_fairness_testing"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_init(self):
        """Fit with one sensitive column and check the first model's fairness data."""
        X = np.random.uniform(size=(30, 2))
        y = np.array(["A", "B", "C"] * 10)
        S = pd.DataFrame({"sensitive": ["D", "E"] * 15})

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            explain_level=0,
            train_ensemble=False,
            stack_models=False,
            validation_strategy={"validation_type": "split"},
            start_random_models=1,
        )

        automl.fit(X, y, sensitive_features=S)

        self.assertGreater(len(automl._models), 0)
        model = automl._models[0]

        expected_names = ("sensitive__A", "sensitive__B", "sensitive__C")

        sensitive_features_names = model.get_sensitive_features_names()
        self.assertEqual(len(sensitive_features_names), 3)
        for name in expected_names:
            self.assertIn(name, sensitive_features_names)

        for name in expected_names:
            self.assertIsNotNone(model.get_fairness_metric(name))

        self.assertGreater(len(model.get_fairness_optimization()), 1)
        self.assertIsNotNone(model.get_worst_fairness())
        self.assertIsNotNone(model.get_best_fairness())
"split"}, 29 | start_random_models=1, 30 | ) 31 | 32 | automl.fit(X, y, sensitive_features=S) 33 | 34 | self.assertGreater(len(automl._models), 0) 35 | 36 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 37 | self.assertEqual(len(sensitive_features_names), 1) 38 | self.assertTrue("sensitive" in sensitive_features_names) 39 | 40 | self.assertTrue(automl._models[0].get_fairness_metric("sensitive") is not None) 41 | self.assertTrue(len(automl._models[0].get_fairness_optimization()) > 1) 42 | self.assertTrue(automl._models[0].get_worst_fairness() is not None) 43 | self.assertTrue(automl._models[0].get_best_fairness() is not None) 44 | 45 | def test_two_sensitive_features(self): 46 | X = np.random.uniform(size=(30, 2)) 47 | y = np.random.randint(0, 100, size=(30,)) 48 | S = pd.DataFrame( 49 | { 50 | "sensitive_1": ["White", "Black"] * 15, 51 | "sensitive_2": ["Male", "Female"] * 15, 52 | } 53 | ) 54 | 55 | automl = AutoML( 56 | results_path=self.automl_dir, 57 | model_time_limit=10, 58 | algorithms=["Xgboost"], 59 | explain_level=0, 60 | train_ensemble=False, 61 | stack_models=False, 62 | start_random_models=1, 63 | ) 64 | 65 | automl.fit(X, y, sensitive_features=S) 66 | 67 | self.assertGreater(len(automl._models), 0) 68 | 69 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 70 | self.assertEqual(len(sensitive_features_names), 2) 71 | -------------------------------------------------------------------------------- /tests/tests_preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_preprocessing/__init__.py -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_datetime_transformer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 
class DateTimeTransformerTest(unittest.TestCase):
    """Tests for ``DateTimeTransformer`` on a single datetime column."""

    def test_transformer(self):
        """Transform ``col1`` and repeat with a transformer restored from JSON."""
        d = {
            "col1": [
                "2020/06/01",
                "2020/06/02",
                "2020/06/03",
                "2021/06/01",
                "2022/06/01",
            ]
        }
        df = pd.DataFrame(data=d)
        df["col1"] = pd.to_datetime(df["col1"])
        # Untouched copy, used later with the restored transformer.
        df_org = df.copy()

        transf = DateTimeTransformer()
        transf.fit(df, "col1")
        df = transf.transform(df)

        # The row count is kept; the source column is replaced by new ones.
        self.assertEqual(df.shape[0], 5)
        self.assertNotIn("col1", df.columns)
        self.assertIn("col1_Year", df.columns)

        transf2 = DateTimeTransformer()
        transf2.from_json(transf.to_json())
        df2 = transf2.transform(df_org)
        self.assertNotIn("col1", df2.columns)
        self.assertIn("col1_Year", df2.columns)
class ExcludeRowsMissingTargetTest(unittest.TestCase):
    """Rows whose target is NaN must be dropped by ``ExcludeRowsMissingTarget``."""

    FEATURE_COLUMNS = ["col1", "col2", "col3", "col4"]

    def test_transform(self):
        frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "y": [np.nan, 1, np.nan, 2],
            }
        )
        X = frame.loc[:, self.FEATURE_COLUMNS]
        y = frame.loc[:, "y"]

        # Four rows go in ...
        self.assertEqual(X.shape[0], 4)
        self.assertEqual(y.shape[0], 4)

        X, y, _, _ = ExcludeRowsMissingTarget.transform(X, y)

        # ... and only the two rows with a target come out.
        self.assertEqual(X.shape[0], 2)
        self.assertEqual(y.shape[0], 2)

        self.assertEqual(y[0], 1)
        self.assertEqual(y[1], 2)

    def test_transform_with_sample_weight(self):
        frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "sample_weight": [1, 2, 3, 4],
                "y": [np.nan, 1, np.nan, 2],
            }
        )
        X = frame.loc[:, self.FEATURE_COLUMNS]
        y = frame.loc[:, "y"]
        sample_weight = frame.loc[:, "sample_weight"]

        self.assertEqual(X.shape[0], 4)
        self.assertEqual(y.shape[0], 4)

        X, y, sw, _ = ExcludeRowsMissingTarget.transform(X, y, sample_weight)

        self.assertEqual(X.shape[0], 2)
        self.assertEqual(y.shape[0], 2)
        self.assertEqual(sw.shape[0], 2)

        # Targets and weights of the kept rows, read by labels 0 and 1.
        self.assertEqual(y[0], 1)
        self.assertEqual(y[1], 2)
        self.assertEqual(sw[0], 2)
        self.assertEqual(sw[1], 4)
numpy.testing import assert_almost_equal 6 | 7 | from supervised.preprocessing.loo_encoder import LooEncoder 8 | 9 | # disable tests 10 | # class LabelEncoderTest(unittest.TestCase): 11 | # def test_fit(self): 12 | # # training data 13 | # d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"], "y": [1, 2, 0]} 14 | # df = pd.DataFrame(data=d) 15 | # le = LooEncoder(cols=["col1"]) 16 | # le.fit(df[["col1", "col2"]], df["y"]) 17 | 18 | # self.assertTrue(le.enc is not None) 19 | # self.assertTrue(le.enc._dim == 2) 20 | # assert_almost_equal(le.enc._mean, 1.0) 21 | # self.assertTrue("col1" in le.enc.mapping) 22 | # self.assertTrue("col2" not in le.enc.mapping) 23 | 24 | # def test_transform(self): 25 | # # training data 26 | # d = {"col1": ["a", "a", "c"]} 27 | # y = [1, 1, 0] 28 | # df = pd.DataFrame(data=d) 29 | # # fit encoder 30 | # le = LooEncoder(cols=["col1"]) 31 | # le.fit(df, y) 32 | # t1 = le.transform(df) 33 | 34 | # # test data 35 | # d_test = {"col1": ["c", "c", "a"]} 36 | # df_test = pd.DataFrame(data=d_test) 37 | # # transform 38 | # t2 = le.transform(df_test) 39 | # assert_almost_equal(t1["col1"][0], t2["col1"][2]) 40 | # assert_almost_equal(t1["col1"][2], t2["col1"][1]) 41 | 42 | # def test_transform_with_new_and_missing_values(self): 43 | # # training data 44 | # d = {"col1": ["a", "a", "c"]} 45 | # y = [1, 1, 1] 46 | # df = pd.DataFrame(data=d) 47 | # # fit encoder 48 | # le = LooEncoder(cols=["col1"]) 49 | # le.fit(df, y) 50 | # # test data 51 | # d_test = {"col1": ["c", "a", "d", "f", np.nan]} 52 | # df_test = pd.DataFrame(data=d_test) 53 | # # transform 54 | # t = le.transform(df_test) 55 | # assert_almost_equal(t["col1"][2], 1) 56 | # assert_almost_equal(t["col1"][3], 1) 57 | # assert_almost_equal(t["col1"][4], 1) 58 | 59 | # def test_to_and_from_json(self): 60 | # # training data 61 | # d = {"col1": ["a", "a", "c"]} 62 | # y = [1, 1, 1] 63 | # df = pd.DataFrame(data=d) 64 | # # fit encoder 65 | # le = LooEncoder() 66 | # le.fit(df, y) 67 | 68 | # 
class PreprocessingUtilsTest(unittest.TestCase):
    """Tests for type detection and basic statistics in ``PreprocessingUtils``."""

    def test_get_type_numpy_number(self):
        tmp = np.array([1, 2, 3])
        tmp_type = PreprocessingUtils.get_type(tmp)
        self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)

    def test_get_type_numpy_categorical(self):
        tmp = np.array(["a", "b", "c"])
        tmp_type = PreprocessingUtils.get_type(tmp)
        self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)

    def test_get_type_pandas_bug(self):
        d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
        df = pd.DataFrame(data=d)
        # The string column is selected with .loc here.
        col2_type = PreprocessingUtils.get_type(df.loc[:, "col2"])
        self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)

    def test_get_type_pandas(self):
        d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
        df = pd.DataFrame(data=d)
        col1_type = PreprocessingUtils.get_type(df["col1"])
        self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
        col2_type = PreprocessingUtils.get_type(df["col2"])
        # Previously col1_type was asserted a second time, so the string
        # column was never checked.
        self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)

    def test_get_stats(self):
        # NaN entries must be ignored by the statistics.
        tmp = np.array([1, np.nan, 2, 3, np.nan, np.nan])
        self.assertEqual(1, PreprocessingUtils.get_min(tmp))
        self.assertEqual(2, PreprocessingUtils.get_mean(tmp))
        self.assertEqual(2, PreprocessingUtils.get_median(tmp))
        d = {"col1": [1, 2, 1, 3, 1, np.nan], "col2": ["a", np.nan, "b", "a", "c", "a"]}
        df = pd.DataFrame(data=d)
        self.assertEqual(1, PreprocessingUtils.get_min(df["col1"]))
        # 8 / 5 is not exactly representable; compare with a tolerance.
        self.assertAlmostEqual(8.0 / 5.0, PreprocessingUtils.get_mean(df["col1"]))
        self.assertEqual(1, PreprocessingUtils.get_median(df["col1"]))

        self.assertEqual(1, PreprocessingUtils.get_most_frequent(df["col1"]))
        self.assertEqual("a", PreprocessingUtils.get_most_frequent(df["col2"]))
# /tests/tests_preprocessing/test_scale.py (continued)
# The two functions below are the remaining methods of ``ScaleTest``; the class
# header and its first test are on the preceding line of this dump.
def test_fit(self):
    """Scale ``col1`` only, then undo it.

    Checks that the scaled column has mean 0, that the unlisted ``col2`` keeps
    its mean of 25.5, and that ``inverse_transform`` restores the raw values.
    """
    # training data
    d = {
        "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0],
        "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
    }
    df = pd.DataFrame(data=d)

    scale = Scale(["col1"])
    scale.fit(df)
    df = scale.transform(df)

    assert_almost_equal(np.mean(df["col1"]), 0)
    assert_almost_equal(np.mean(df["col2"]), 25.5)

    df = scale.inverse_transform(df)
    assert_almost_equal(df["col1"][0], 1)
    assert_almost_equal(df["col1"][1], 2)


def test_to_and_from_json(self):
    """Fit one scaler, copy its state through JSON, transform with the copy."""
    # training data
    d = {
        "col1": [1, 2, 3, 4, 5, 6, 7, 8.0, 9, 10],
        "col2": [21, 22.0, 23, 24, 25, 26, 27, 28, 29, 30],
    }
    df = pd.DataFrame(data=d)

    scale = Scale(["col1"])
    scale.fit(df)
    # do not transform: fitting alone must leave the frame untouched
    assert_almost_equal(np.mean(df["col1"]), 5.5)
    assert_almost_equal(np.mean(df["col2"]), 25.5)

    # to and from json
    json_data = scale.to_json()
    scale2 = Scale()
    scale2.from_json(json_data)

    # transform with loaded scaler
    df = scale2.transform(df)
    assert_almost_equal(np.mean(df["col1"]), 0)
    assert_almost_equal(np.mean(df["col2"]), 25.5)


# --------------------------------------------------------------------------------
# /tests/tests_preprocessing/test_text_transformer.py:
# --------------------------------------------------------------------------------
import unittest

import pandas as pd
from numpy.testing import assert_almost_equal

from supervised.preprocessing.text_transformer import TextTransformer


class TextTransformerTest(unittest.TestCase):
    """Tests for ``TextTransformer`` on a single text column."""

    def test_transformer(self):
        """Transform a text column and repeat it with a JSON-restored copy.

        The row count must stay at 5, the source column must be gone from the
        output, and the restored transformer must give the same first value.
        """
        d = {
            "col1": [
                "This is the first document.",
                "This document is the second document.",
                "And this is the third one.",
                None,
                "Is this the first document?",
            ]
        }
        df = pd.DataFrame(data=d)
        df_org = df.copy()

        transf = TextTransformer()
        transf.fit(df, "col1")
        df = transf.transform(df)

        # assertEqual / assertNotIn report the offending values on failure
        self.assertEqual(df.shape[0], 5)
        self.assertNotIn("col1", df.columns)

        transf2 = TextTransformer()
        transf2.from_json(transf.to_json())
        df2 = transf2.transform(df_org)
        self.assertNotIn("col1", df2.columns)

        assert_almost_equal(df.iloc[0, 0], df2.iloc[0, 0])


# --------------------------------------------------------------------------------
# /tests/tests_tuner/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_tuner/__init__.py
# --------------------------------------------------------------------------------
# /tests/tests_tuner/test_hill_climbing.py:
# --------------------------------------------------------------------------------
import unittest

from supervised.tuner.mljar_tuner import MljarTuner


class ModelMock:
    """Minimal stand-in for a trained model, exposing four getter methods."""

    def __init__(self, name, model_type, final_loss, params):
        self.name = name
        self.model_type = model_type
        self.final_loss = final_loss
        self.params = params

    def get_name(self):
        return self.name

    def get_type(self):
        return self.model_type

    def get_final_loss(self):
        return self.final_loss

    def get_train_time(self):
        # constant training time for every mock
        return 0.1


class TunerHillClimbingTest(unittest.TestCase):
    """Tests for ``MljarTuner.get_hill_climbing_params``."""

    def test_hill_climbing(self):
        """Names of hill-climbing candidates must carry ever larger numbers.

        Starts from two Random Forest mocks (prefixes 121 and 1), asks for
        hill-climbing parameters five times, and checks that the numeric
        prefix of every proposed name exceeds a counter that starts at 121
        and grows by one per proposal.
        """
        models = []
        models += [
            ModelMock(
                "121_RandomForest",
                "Random Forest",
                0.1,
                {
                    "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                    "preprocessing": {},
                    "validation_strategy": {},
                },
            )
        ]
        models += [
            ModelMock(
                "1_RandomForest",
                "Random Forest",
                0.1,
                {
                    "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                    "preprocessing": {},
                    "validation_strategy": {},
                },
            )
        ]
        tuner = MljarTuner(
            {
                "start_random_models": 0,
                "hill_climbing_steps": 1,
                "top_models_to_improve": 2,
            },
            # NOTE(review): this was spelled "Random Foresrt"; corrected to
            # match the model type used by the mocks above. Presumably the
            # hill-climbing step takes its model types from `models`, not
            # from this list -- confirm by running the test.
            algorithms=["Random Forest"],
            ml_task="binary_classification",
            eval_metric="logloss",
            validation_strategy={},
            explain_level=2,
            data_info={"columns_info": [], "target_info": []},
            golden_features=False,
            features_selection=False,
            train_ensemble=False,
            stack_models=False,
            adjust_validation=False,
            boost_on_errors=False,
            kmeans_features=False,
            mix_encoding=False,
            optuna_time_budget=None,
            optuna_init_params={},
            optuna_verbose=True,
            n_jobs=1,
            seed=12,
        )
        ind = 121
        score = 0.1
        for _ in range(5):
            for p in tuner.get_hill_climbing_params(models):
                # every proposal joins the pool with a ten times smaller loss
                models += [ModelMock(p["name"], "Random Forest", score, p)]
                score *= 0.1
                self.assertGreater(int(p["name"].split("_")[0]), ind)
                ind += 1


# --------------------------------------------------------------------------------
# /tests/tests_tuner/test_time_controller.py:
# --------------------------------------------------------------------------------
import time
import unittest

from numpy.testing import assert_almost_equal

from supervised.tuner.time_controller import TimeController


class TimeControllerTest(unittest.TestCase):
    """Tests for ``TimeController`` bookkeeping and time-budget decisions."""

    def test_to_and_from_json(self):
        """Logged time must survive a ``to_json`` / ``from_json`` round trip."""
        tc = TimeController(
            start_time=time.time(),
            total_time_limit=10,
            model_time_limit=None,
            steps=["simple_algorithms"],
            algorithms=["Baseline"],
        )
        tc.log_time("1_Baseline", "Baseline", "simple_algorithms", 123.1)

        tc2 = TimeController.from_json(tc.to_json())

        assert_almost_equal(tc2.step_spend("simple_algorithms"), 123.1)
        assert_almost_equal(tc2.model_spend("Baseline"), 123.1)

    def test_enough_time_for_stacking(self):
        """Stacking is allowed only while enough of the budget is left.

        Seven fits of ``t`` seconds each are logged against a limit of 100.
        ``enough_time`` for "stack" must be true for t = 5 and t = 10 and
        false for t = 20; "ensemble_stacked" must be allowed in every case.
        """
        for t in [5, 10, 20]:
            tc = TimeController(
                start_time=time.time(),
                total_time_limit=100,
                model_time_limit=None,
                steps=[
                    "default_algorithms",
                    "not_so_random",
                    "golden_features",
                    "insert_random_feature",
                    "features_selection",
                    "hill_climbing_1",
                    "hill_climbing_3",
                    "hill_climbing_5",
                    "ensemble",
                    "stack",
                    "ensemble_stacked",
                ],
                algorithms=["Xgboost"],
            )
            tc.log_time("1_Xgboost", "Xgboost", "default_algorithms", t)
            tc.log_time("2_Xgboost", "Xgboost", "not_so_random", t)
            tc.log_time("3_Xgboost", "Xgboost", "insert_random_feature", t)
            tc.log_time("4_Xgboost", "Xgboost", "features_selection", t)
            tc.log_time("5_Xgboost", "Xgboost", "hill_climbing_1", t)
            # NOTE(review): "hill_climbing_2" is not in `steps` above --
            # confirm whether that mismatch is intentional.
            tc.log_time("6_Xgboost", "Xgboost", "hill_climbing_2", t)
            tc.log_time("7_Xgboost", "Xgboost", "hill_climbing_3", t)

            # move the start back so the wall clock matches the logged time
            tc._start_time = time.time() - 7 * t
            assert_almost_equal(tc.already_spend(), 7 * t)
            if t < 20:
                self.assertTrue(tc.enough_time("Xgboost", "stack"))
            else:
                self.assertFalse(tc.enough_time("Xgboost", "stack"))
            self.assertTrue(tc.enough_time("Ensemble_Stacked", "ensemble_stacked"))


# --------------------------------------------------------------------------------
# /tests/tests_tuner/test_tuner.py:
# --------------------------------------------------------------------------------
import unittest

from supervised.tuner.mljar_tuner import MljarTuner


class TunerTest(unittest.TestCase):
    """Tests for ``MljarTuner`` helpers."""

    def test_key_params(self):
        """The params key must not depend on the key order inside "learner"."""
        params1 = {
            "preprocessing": {"p1": 1, "p2": 2},
            "learner": {"p1": 1, "p2": 2},
            "validation_strategy": {},
        }
        params2 = {
            "preprocessing": {"p1": 1, "p2": 2},
            "learner": {"p2": 2, "p1": 1},
            "validation_strategy": {},
        }
        key1 = MljarTuner.get_params_key(params1)
        key2 = MljarTuner.get_params_key(params2)
        self.assertEqual(key1, key2)


# --------------------------------------------------------------------------------
# /tests/tests_utils/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_utils/__init__.py
# --------------------------------------------------------------------------------
# /tests/tests_utils/test_compute_additional_metrics.py:
# --------------------------------------------------------------------------------
import unittest

import numpy as np

from supervised.algorithms.registry import BINARY_CLASSIFICATION, REGRESSION
from supervised.utils.additional_metrics import AdditionalMetrics


class ComputeAdditionalMetricsTest(unittest.TestCase):
    """Tests for ``AdditionalMetrics.compute``."""

    def test_compute(self):
        """Binary task: both diagonal cells of the confusion matrix equal 3."""
        target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8])
        info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
        details = info["metric_details"]
        max_metrics = info["max_metrics"]
        conf = info["confusion_matrix"]
        self.assertEqual(conf.iloc[0, 0], 3)
        self.assertEqual(conf.iloc[1, 1], 3)
        self.assertIsNotNone(details)
        self.assertIsNotNone(max_metrics)

    def test_compute_f1(self):
        """Binary task with separable predictions: the best f1 score is 1."""
        target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
        info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
        details = info["metric_details"]
        max_metrics = info["max_metrics"]
        conf = info["confusion_matrix"]
        self.assertEqual(max_metrics["f1"]["score"], 1)
        self.assertIsNotNone(details)
        self.assertIsNotNone(conf)

    def test_compute_for_regression(self):
        """Regression task: MAE, MSE, RMSE and R2 are all reported."""
        target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
        info = AdditionalMetrics.compute(target, pred, None, REGRESSION)
        all_metrics = list(info["max_metrics"]["Metric"].values)
        for m in ["MAE", "MSE", "RMSE", "R2"]:
            self.assertIn(m, all_metrics)

    def test_compute_constant_preds(self):
        """Constant predictions: best f1 and mcc scores stay below 1."""
        target = np.array([0, 0, 1, 1, 0, 0, 0, 0])
        pred = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
        info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
        # `details` and `conf` are read but never asserted on
        details = info["metric_details"]
        max_metrics = info["max_metrics"]
        conf = info["confusion_matrix"]
        self.assertLess(max_metrics["f1"]["score"], 1)
        self.assertLess(max_metrics["mcc"]["score"], 1)


# --------------------------------------------------------------------------------
# /tests/tests_utils/test_importance.py:
# --------------------------------------------------------------------------------
import os
import tempfile
import unittest

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

from supervised.utils.importance import PermutationImportance


class PermutationImportanceTest(unittest.TestCase):
    """Tests for ``PermutationImportance.compute_and_plot``."""

    def test_compute_and_plot(self):
        """An importance CSV named after the learner appears in the output dir."""
        rows = 20
        X = np.random.rand(rows, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        y = np.random.randint(0, 2, rows)

        model = DecisionTreeClassifier(max_depth=1)
        model.fit(X, y)

        with tempfile.TemporaryDirectory() as tmpdir:
            PermutationImportance.compute_and_plot(
                model,
                X_validation=X,
                y_validation=y,
                model_file_path=tmpdir,
                learner_name="learner_test",
                metric_name=None,
                ml_task="binary_classification",
            )
            self.assertTrue(
                os.path.exists(os.path.join(tmpdir, "learner_test_importance.csv"))
            )


# --------------------------------------------------------------------------------
# /tests/tests_utils/test_learning_curves.py:
# --------------------------------------------------------------------------------
import os
import unittest

from supervised.utils.learning_curves import LearningCurves


class LearningCurvesTest(unittest.TestCase):
    def test_plot_close(self):
        """
        Test if we close plots. To avoid following warning:
        RuntimeWarning: More than 20 figures have been opened.
        Figures created through the pyplot interface (`matplotlib.pyplot.figure`)
        are retained until explicitly closed and may consume too much memory.
        """
        for _ in range(
            1
        ):  # you can increase the range, for tests speed reason I keep it low
            LearningCurves.plot_for_ensemble([3, 2, 1], "random_metrics", ".")

        # the plot was written into the current directory ("."); remove it
        os.remove(LearningCurves.output_file_name)


# --------------------------------------------------------------------------------
# /tests/tests_utils/test_metric.py:
# --------------------------------------------------------------------------------
import unittest

import numpy as np
from numpy.testing import assert_almost_equal

from supervised.utils.metric import Metric
from supervised.utils.metric import UserDefinedEvalMetric


class MetricTest(unittest.TestCase):
    """Tests for the ``Metric`` wrapper."""

    def test_create(self):
        """logloss is below 0.1 for perfect and above 1.0 for inverted labels."""
        params = {"name": "logloss"}
        m = Metric(params)
        y_true = np.array([0, 0, 1, 1])
        y_predicted = np.array([0, 0, 1, 1])
        score = m(y_true, y_predicted)
        self.assertLess(score, 0.1)
        y_true = np.array([0, 0, 1, 1])
        y_predicted = np.array([1, 1, 0, 0])
        score = m(y_true, y_predicted)
        self.assertGreater(score, 1.0)

    def test_metric_improvement(self):
        """Going from one wrong label to none counts as an improvement."""
        params = {"name": "logloss"}
        m = Metric(params)
        y_true = np.array([0, 0, 1, 1])
        y_predicted = np.array([0, 0, 0, 1])
        score_1 = m(y_true, y_predicted)
        y_true = np.array([0, 0, 1, 1])
        y_predicted = np.array([0, 0, 1, 1])
        score_2 = m(y_true, y_predicted)
        self.assertTrue(m.improvement(score_1, score_2))

    def test_sample_weight(self):
        """All-ones sample weights give the same score as no weights."""
        metrics = ["logloss", "auc", "acc", "rmse", "mse", "mae", "r2", "mape"]
        for m in metrics:
            metric = Metric({"name": m})
            y_true = np.array([0, 0, 1, 1])
            y_predicted = np.array([0, 0, 0, 1])
            sample_weight = np.array([1, 1, 1, 1])

            score_1 = metric(y_true, y_predicted)
            score_2 = metric(y_true, y_predicted, sample_weight)
            assert_almost_equal(score_1, score_2)

    def test_r2_metric(self):
        """Perfect predictions are reported as -1.0 by the "r2" metric."""
        params = {"name": "r2"}
        m = Metric(params)
        y_true = np.array([0, 0, 1, 1])
        y_predicted = np.array([0, 0, 1, 1])
        score = m(y_true, y_predicted)
        self.assertEqual(score, -1.0)  # negative r2

    def test_mape_metric(self):
        """Perfect predictions give a "mape" of 0.0."""
        params = {"name": "mape"}
        m = Metric(params)
        y_true = np.array([0, 0, 1, 1])
        y_predicted = np.array([0, 0, 1, 1])
        score = m(y_true, y_predicted)
        self.assertEqual(score, 0.0)

    def test_user_defined_metric(self):
        """A registered callable is used for "user_defined_metric"."""

        def custom(x, y, sample_weight=None):
            return np.sum(x + y)

        UserDefinedEvalMetric().set_metric(custom)

        params = {"name": "user_defined_metric"}
        m = Metric(params)

        a = np.array([1, 1, 1])

        score = m(a, a)
        self.assertEqual(score, 6)


# --------------------------------------------------------------------------------
# /tests/tests_utils/test_shap.py:
# --------------------------------------------------------------------------------
import unittest

import numpy as np
import pandas as pd

from supervised.utils.shap import PlotSHAP


class PlotSHAPTest(unittest.TestCase):
    def test_get_sample_data_larger_1k(self):
        """Get sample when data is larger than 1k"""
        X = pd.DataFrame(np.random.uniform(size=(5763, 31)))
        y = pd.Series(np.random.randint(0, 2, size=(5763,)))

        X_, y_ = PlotSHAP.get_sample(X, y)

        self.assertEqual(X_.shape[0], 1000)
        self.assertEqual(y_.shape[0], 1000)

    def test_get_sample_data_smaller_1k(self):
        """Get sample when data is smaller than 1k"""
        SAMPLES = 100
        X = pd.DataFrame(np.random.uniform(size=(SAMPLES, 31)))
        y = pd.Series(np.random.randint(0, 2, size=(SAMPLES,)))

        X_, y_ = PlotSHAP.get_sample(X, y)

        self.assertEqual(X_.shape[0], SAMPLES)
        self.assertEqual(y_.shape[0], SAMPLES)


# --------------------------------------------------------------------------------
# /tests/tests_utils/test_subsample.py:
# --------------------------------------------------------------------------------
import unittest

import numpy as np
import pandas as pd

from supervised.algorithms.registry import REGRESSION
from supervised.utils.subsample import subsample


class SubsampleTest(unittest.TestCase):
    """Tests for the ``subsample`` train/test splitter."""

    def test_subsample_regression_10k(self):
        """10000 rows with ``train_size=1000`` split into 1000 and 9000 rows."""
        rows = 10000
        cols = 51
        X = np.random.rand(rows, cols)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(cols)])
        y = pd.Series(np.random.rand(rows), name="target")

        X_train, X_test, y_train, y_test = subsample(
            X, y, train_size=1000, ml_task=REGRESSION
        )

        # These were assertTrue(value, 1000): the second argument of
        # assertTrue is the failure message, so any non-zero size passed.
        self.assertEqual(X_train.shape[0], 1000)
        self.assertEqual(X_test.shape[0], 9000)
        self.assertEqual(y_train.shape[0], 1000)
        self.assertEqual(y_test.shape[0], 9000)


# --------------------------------------------------------------------------------
# /tests/tests_validation/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_validation/__init__.py
# --------------------------------------------------------------------------------