├── .github
    └── workflows
    │   ├── run-tests.yml
    │   └── test-installation-with-conda.yml
├── .gitignore
├── CITATION
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples
    ├── notebooks
    │   ├── Titanic.ipynb
    │   └── basic_run.ipynb
    └── scripts
    │   ├── binary_classifier.py
    │   ├── binary_classifier_Titanic.py
    │   ├── binary_classifier_adult_fairness.py
    │   ├── binary_classifier_ensemble.py
    │   ├── binary_classifier_marketing.py
    │   ├── binary_classifier_random.py
    │   ├── multi_class_classifier.py
    │   ├── multi_class_classifier_MNIST.py
    │   ├── multi_class_classifier_digits.py
    │   ├── multi_class_drug_fairness.py
    │   ├── regression.py
    │   ├── regression_acs_fairness.py
    │   ├── regression_crime_fairness.py
    │   ├── regression_housing_fairness.py
    │   ├── regression_law_school_fairness.py
    │   └── tabular_mar_2021.py
├── pytest.ini
├── requirements.txt
├── requirements_dev.txt
├── setup.py
├── supervised
    ├── __init__.py
    ├── algorithms
    │   ├── __init__.py
    │   ├── algorithm.py
    │   ├── baseline.py
    │   ├── catboost.py
    │   ├── decision_tree.py
    │   ├── extra_trees.py
    │   ├── factory.py
    │   ├── knn.py
    │   ├── lightgbm.py
    │   ├── linear.py
    │   ├── nn.py
    │   ├── random_forest.py
    │   ├── registry.py
    │   ├── sklearn.py
    │   └── xgboost.py
    ├── automl.py
    ├── base_automl.py
    ├── callbacks
    │   ├── __init__.py
    │   ├── callback.py
    │   ├── callback_list.py
    │   ├── early_stopping.py
    │   ├── learner_time_constraint.py
    │   ├── max_iters_constraint.py
    │   ├── metric_logger.py
    │   ├── terminate_on_nan.py
    │   └── total_time_constraint.py
    ├── ensemble.py
    ├── exceptions.py
    ├── fairness
    │   ├── __init__.py
    │   ├── metrics.py
    │   ├── optimization.py
    │   ├── plots.py
    │   ├── report.py
    │   └── utils.py
    ├── model_framework.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── datetime_transformer.py
    │   ├── eda.py
    │   ├── encoding_selector.py
    │   ├── exclude_missing_target.py
    │   ├── goldenfeatures_transformer.py
    │   ├── kmeans_transformer.py
    │   ├── label_binarizer.py
    │   ├── label_encoder.py
    │   ├── loo_encoder.py
    │   ├── preprocessing.py
    │   ├── preprocessing_categorical.py
    │   ├── preprocessing_missing.py
    │   ├── preprocessing_utils.py
    │   ├── scale.py
    │   └── text_transformer.py
    ├── tuner
    │   ├── __init__.py
    │   ├── data_info.py
    │   ├── hill_climbing.py
    │   ├── mljar_tuner.py
    │   ├── optuna
    │   │   ├── __init__.py
    │   │   ├── catboost.py
    │   │   ├── extra_trees.py
    │   │   ├── knn.py
    │   │   ├── lightgbm.py
    │   │   ├── nn.py
    │   │   ├── random_forest.py
    │   │   ├── tuner.py
    │   │   └── xgboost.py
    │   ├── preprocessing_tuner.py
    │   ├── random_parameters.py
    │   └── time_controller.py
    ├── utils
    │   ├── __init__.py
    │   ├── additional_metrics.py
    │   ├── additional_plots.py
    │   ├── automl_plots.py
    │   ├── common.py
    │   ├── config.py
    │   ├── constants.py
    │   ├── data_validation.py
    │   ├── importance.py
    │   ├── jsonencoder.py
    │   ├── leaderboard_plots.py
    │   ├── learning_curves.py
    │   ├── metric.py
    │   ├── shap.py
    │   ├── subsample.py
    │   └── utils.py
    └── validation
    │   ├── __init__.py
    │   ├── validation_step.py
    │   ├── validator_base.py
    │   ├── validator_custom.py
    │   ├── validator_kfold.py
    │   └── validator_split.py
└── tests
    ├── README.md
    ├── __init__.py
    ├── checks
        ├── __init__.py
        ├── check_automl_with_regression.py
        ├── run_ml_tests.py
        └── run_performance_tests.py
    ├── conftest.py
    ├── data
        ├── 179.csv
        ├── 24.csv
        ├── 3.csv
        ├── 31.csv
        ├── 38.csv
        ├── 44.csv
        ├── 720.csv
        ├── 737.csv
        ├── CrimeData
        │   ├── README.md
        │   ├── cities.json
        │   └── crimedata.csv
        ├── Drug
        │   ├── Drug_Consumption.csv
        │   └── README.md
        ├── LawSchool
        │   ├── README.md
        │   └── bar_pass_prediction.csv
        ├── PortugeseBankMarketing
        │   └── Data_FinalProject.csv
        ├── Titanic
        │   ├── test_with_Survived.csv
        │   └── train.csv
        ├── acs_income_1k.csv
        ├── adult_missing_values_missing_target_500rows.csv
        ├── boston_housing.csv
        ├── housing_regression_missing_values_missing_target.csv
        ├── iris_classes_missing_values_missing_target.csv
        └── iris_missing_values_missing_target.csv
    ├── tests_algorithms
        ├── __init__.py
        ├── test_baseline.py
        ├── test_catboost.py
        ├── test_decision_tree.py
        ├── test_extra_trees.py
        ├── test_factory.py
        ├── test_knn.py
        ├── test_lightgbm.py
        ├── test_linear.py
        ├── test_nn.py
        ├── test_random_forest.py
        ├── test_registry.py
        └── test_xgboost.py
    ├── tests_automl
        ├── __init__.py
        ├── test_adjust_validation.py
        ├── test_automl.py
        ├── test_automl_init.py
        ├── test_automl_report.py
        ├── test_automl_sample_weight.py
        ├── test_automl_time_constraints.py
        ├── test_data_types.py
        ├── test_dir_change.py
        ├── test_explain_levels.py
        ├── test_golden_features.py
        ├── test_handle_imbalance.py
        ├── test_integration.py
        ├── test_joblib_version.py
        ├── test_models_needed_for_predict.py
        ├── test_prediction_after_load.py
        ├── test_repeated_validation.py
        ├── test_restore.py
        ├── test_stack_models_constraints.py
        ├── test_targets.py
        └── test_update_errors_report.py
    ├── tests_callbacks
        ├── __init__.py
        └── test_total_time_constraint.py
    ├── tests_ensemble
        ├── __init__.py
        └── test_save_load.py
    ├── tests_fairness
        ├── __init__.py
        ├── test_binary_classification.py
        ├── test_multi_class_classification.py
        └── test_regression.py
    ├── tests_preprocessing
        ├── __init__.py
        ├── disable_eda.py
        ├── test_categorical_integers.py
        ├── test_datetime_transformer.py
        ├── test_encoding_selector.py
        ├── test_exclude_missing.py
        ├── test_goldenfeatures_transformer.py
        ├── test_label_binarizer.py
        ├── test_label_encoder.py
        ├── test_loo_encoder.py
        ├── test_preprocessing.py
        ├── test_preprocessing_missing.py
        ├── test_preprocessing_utils.py
        ├── test_scale.py
        └── test_text_transformer.py
    ├── tests_tuner
        ├── __init__.py
        ├── test_hill_climbing.py
        ├── test_time_controller.py
        └── test_tuner.py
    ├── tests_utils
        ├── __init__.py
        ├── test_compute_additional_metrics.py
        ├── test_importance.py
        ├── test_learning_curves.py
        ├── test_metric.py
        ├── test_shap.py
        └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py


/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on: [ push,pull_request ]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ${{ matrix.os }}
 9 |     strategy:
10 |       matrix:
11 |         os: [ ubuntu-latest ]
12 |         python-version: [ '3.10']
13 |         #os: [ ubuntu-latest, macos-latest, windows-latest ]
14 |         #python-version: [ '3.8', '3.9', '3.10', '3.11' ]
15 | 
16 |     steps:
17 |       - name: Install OS Dependencies
18 |         if: matrix.os == 'ubuntu-latest'
19 |         run: |
20 |           sudo apt-get update
21 |           sudo apt-get -y install graphviz
22 | 
23 |       - name: Install OS Dependencies
24 |         if: matrix.os == 'macos-latest'
25 |         run: |
26 |           brew install graphviz
27 | 
28 |       - name: Install OS Dependencies
29 |         if: matrix.os == 'windows-latest'
30 |         run: |
31 |           choco install graphviz
32 |       - uses: actions/checkout@v2
33 |       - name: Set up Python ${{ matrix.python-version }}
34 |         uses: actions/setup-python@v2
35 |         with:
36 |           python-version: ${{ matrix.python-version }}
37 |       - name: Install Python Dependencies
38 |         run: |
39 |           python -m pip install --upgrade pip
40 |           pip install --upgrade setuptools
41 |           pip install -U "importlib-metadata>=1.7.0"  # quoted so the shell does not treat ">" as an output redirect
42 |           pip install -U -r requirements.txt
43 |           pip install -U -r requirements_dev.txt
44 |           pip install ipython
45 |           python setup.py install
46 |       - name: Test with pytest
47 |         run: |
48 |           pytest tests --cov=supervised/
49 |     continue-on-error: true
50 | 


--------------------------------------------------------------------------------
/.github/workflows/test-installation-with-conda.yml:
--------------------------------------------------------------------------------
 1 | name: Test installation with conda
 2 | 
 3 | on: 
 4 |   schedule:
 5 |     - cron:  '0 8 * * 1'
 6 |   # run workflow manually
 7 |   workflow_dispatch:
 8 |   
 9 | jobs:
10 |   build:
11 |     name: Run (${{ matrix.python-version }}, ${{ matrix.os }})
12 |     runs-on: ${{ matrix.os }}
13 |     strategy:
14 |       fail-fast: false
15 |       matrix:
16 |         os: [windows-latest] 
17 |         python-version: ['3.9']
18 |     
19 |     steps:
20 |       - uses: conda-incubator/setup-miniconda@v2
21 |         with:
22 |           activate-environment: test
23 |           auto-update-conda: false
24 |           python-version: ${{ matrix.python-version }}
25 |       - name: Activate conda and check versions
26 |         run: |
27 |           conda activate test
28 |           conda --version
29 |           python --version
30 |       - name: Install MLJAR AutoML
31 |         run: conda install -c conda-forge mljar-supervised
32 |       - name: Try to import
33 |         run: python -c "import supervised;print(supervised.__version__)"
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | AutoML_*
  2 | .vscode
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | db.sqlite3
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # Environments
 87 | .env
 88 | .venv
 89 | env/
 90 | venv/
 91 | ENV/
 92 | env.bak/
 93 | venv.bak/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 


--------------------------------------------------------------------------------
/CITATION:
--------------------------------------------------------------------------------
1 | @misc{mljar,
2 |   author    = {Aleksandra P\l{}o\'{n}ska and Piotr P\l{}o\'{n}ski},
3 |   year      = {2021},
4 |   publisher = {MLJAR Sp. z o.o.},
5 |   address   = {\L{}apy, Poland},
6 |   title     = {MLJAR: State-of-the-art Automated Machine Learning Framework for Tabular Data.  Version 0.10.3},
7 |   url       = {https://github.com/mljar/mljar-supervised}
8 | }
9 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 MLJAR Sp. z o.o.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include LICENSE
3 | include README.md


--------------------------------------------------------------------------------
/examples/notebooks/basic_run.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "import pandas as pd\n",
10 |     "import openml\n",
11 |     "from sklearn.ensemble import RandomForestClassifier\n",
12 |     "from supervised.automl import AutoML\n",
13 |     "\n",
14 |     "import os\n",
15 |     "import numpy as np\n",
16 |     "import pandas as pd\n",
17 |     "import sklearn.model_selection\n",
18 |     "from sklearn.metrics import log_loss, f1_score\n"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": null,
24 |    "metadata": {},
25 |    "outputs": [],
26 |    "source": [
27 |     "dataset_id = 3\n",
28 |     "df = pd.read_csv('./tests/data/{0}.csv'.format(dataset_id))\n",
29 |     "x_cols = [c for c in df.columns if c != 'target']\n",
30 |     "X = df[x_cols]\n",
31 |     "y = df['target']"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": null,
37 |    "metadata": {},
38 |    "outputs": [],
39 |    "source": [
40 |     "a = AutoML(total_time_limit=10)"
41 |    ]
42 |   },
43 |   {
44 |    "cell_type": "code",
45 |    "execution_count": null,
46 |    "metadata": {},
47 |    "outputs": [],
48 |    "source": [
49 |     "a.fit(X, y)"
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": null,
55 |    "metadata": {},
56 |    "outputs": [],
57 |    "source": [
58 |     " "
59 |    ]
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "kernelspec": {
64 |    "display_name": ".venv",
65 |    "language": "python",
66 |    "name": ".venv"
67 |   },
68 |   "language_info": {
69 |    "codemirror_mode": {
70 |     "name": "ipython",
71 |     "version": 3
72 |    },
73 |    "file_extension": ".py",
74 |    "mimetype": "text/x-python",
75 |    "name": "python",
76 |    "nbconvert_exporter": "python",
77 |    "pygments_lexer": "ipython3",
78 |    "version": "3.6.7"
79 |   }
80 |  },
81 |  "nbformat": 4,
82 |  "nbformat_minor": 2
83 | }


--------------------------------------------------------------------------------
/examples/scripts/binary_classifier.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | from sklearn.model_selection import train_test_split
 5 | import os
 6 | from sklearn.metrics import log_loss
 7 | import warnings
 8 | 
 9 | # warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)
10 | 
11 | df = pd.read_csv(
12 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
13 |     skipinitialspace=True,
14 | )
15 | 
16 | X = df[df.columns[:-1]]
17 | y = df["income"]
18 | 
19 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
20 | 
21 | automl = AutoML(
22 |     algorithms=["LightGBM"],
23 |     mode="Compete",
24 |     explain_level=0,
25 |     train_ensemble=True,
26 |     golden_features=False,
27 |     features_selection=False,
28 |     eval_metric="auc",
29 | )
30 | automl.fit(X_train, y_train)
31 | 
32 | predictions = automl.predict_all(X_test)
33 | 
34 | print(predictions.head())
35 | print(predictions.tail())
36 | print(X_test.shape, predictions.shape)
37 | print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
38 | 


--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_Titanic.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from sklearn.metrics import accuracy_score
 4 | from supervised import AutoML
 5 | 
 6 | train = pd.read_csv(
 7 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv"
 8 | )
 9 | print(train.head())
10 | 
11 | X = train[train.columns[2:]]
12 | y = train["Survived"]
13 | 
14 | automl = AutoML()  # default mode is Explain
15 | 
16 | automl.fit(X, y)
17 | 
18 | test = pd.read_csv(
19 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv"
20 | )
21 | predictions = automl.predict(test)
22 | print(predictions)
23 | print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%")
24 | 


--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_adult_fairness.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from sklearn.model_selection import train_test_split
 3 | from sklearn.datasets import fetch_openml
 4 | from supervised.automl import AutoML
 5 | 
 6 | data = fetch_openml(data_id=1590, as_frame=True)
 7 | X = data.data
 8 | # data.target #
 9 | y = data.target # (data.target == ">50K") * 1
10 | sensitive_features = X[["sex"]]
11 | 
12 | X_train, X_test, y_train, y_test, S_train, S_test = train_test_split(
13 |     X, y, sensitive_features, stratify=y, test_size=0.75, random_state=42
14 | )
15 | 
16 | automl = AutoML(
17 |     algorithms=[
18 |         "Xgboost"
19 |     ],
20 |     train_ensemble=False,
21 |     fairness_metric="demographic_parity_ratio",  
22 |     fairness_threshold=0.8,
23 |     privileged_groups = [{"sex": "Male"}],
24 |     underprivileged_groups = [{"sex": "Female"}],
25 | )
26 | 
27 | automl.fit(X_train, y_train, sensitive_features=S_train)
28 | 


--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_ensemble.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from supervised.automl import AutoML
 3 | from supervised.ensemble import Ensemble
 4 | import os
 5 | # Example: build an Ensemble by hand from out-of-fold prediction files stored under results_path.
 6 | df = pd.read_csv(
 7 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
 8 |     skipinitialspace=True,
 9 | )
10 | 
11 | X = df[df.columns[:-1]]
12 | y = df["income"]
13 | # NOTE(review): automl.fit() is never called below; presumably "AutoML_2" holds an earlier run — confirm.
14 | results_path = "AutoML_2"
15 | automl = AutoML(
16 |     results_path=results_path,
17 |     total_time_limit=400,
18 |     start_random_models=10,
19 |     hill_climbing_steps=0,
20 |     top_models_to_improve=0,
21 |     train_ensemble=False,
22 | )
23 | 
24 | # Index the AutoML models by name and hand that mapping to a new Ensemble.
25 | models_map = {m.get_name(): m for m in automl._models}
26 | 
27 | ensemble = Ensemble("logloss", "binary_classification")
28 | ensemble.models_map = models_map
29 | # Read predictions_out_of_folds.csv for model_1 .. model_29; target columns come from the first file read.
30 | oofs = {}
31 | target = None
32 | for i in range(1, 30):
33 |     oof = pd.read_csv(
34 |         os.path.join(results_path, f"model_{i}", "predictions_out_of_folds.csv")
35 |     )
36 |     prediction_cols = [c for c in oof.columns if "prediction" in c]
37 |     oofs[f"model_{i}"] = oof[prediction_cols]
38 |     if target is None:
39 |         target_columns = [c for c in oof.columns if "target" in c]
40 |         target = oof[target_columns]
41 | # Fit the ensemble on the collected out-of-fold predictions and save it next to the models.
42 | ensemble.target = target
43 | ensemble.target_columns = "target"
44 | ensemble.fit(oofs, target)
45 | ensemble.save(os.path.join(results_path, "ensemble"))
46 | 
47 | # Predict with the freshly fitted ensemble; the string below appears to be sample output.
48 | predictions = ensemble.predict(X)
49 | print(predictions.head())
50 | 
51 | """
52 |     p_<=50K    p_>50K
53 | 0  0.982940  0.017060
54 | 1  0.722781  0.277219
55 | 2  0.972687  0.027313
56 | 3  0.903021  0.096979
57 | 4  0.591373  0.408627
58 | """
59 | 
60 | # Reload the saved ensemble with the same models_map and predict again.
61 | ensemble2 = Ensemble.load(os.path.join(results_path, "ensemble"), models_map)
62 | predictions2 = ensemble2.predict(X)
63 | print(predictions2.head())
64 | 
65 | """
66 |     p_<=50K    p_>50K
67 | 0  0.982940  0.017060
68 | 1  0.722781  0.277219
69 | 2  0.972687  0.027313
70 | 3  0.903021  0.096979
71 | 4  0.591373  0.408627
72 | """
73 | 


--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_marketing.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from supervised.automl import AutoML
 3 | import os
 4 | 
 5 | from sklearn.metrics import accuracy_score
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | df = pd.read_csv("tests/data/PortugeseBankMarketing/Data_FinalProject.csv")
 9 | 
10 | X = df[df.columns[:-1]]
11 | y = df["y"]
12 | 
13 | 
14 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)
15 | 
16 | 
17 | automl = AutoML(
18 |     # results_path="AutoML_22",
19 |     total_time_limit=30 * 60,
20 |     start_random_models=10,
21 |     hill_climbing_steps=3,
22 |     top_models_to_improve=3,
23 |     train_ensemble=True,
24 | )
25 | 
26 | automl.fit(X_train, y_train)
27 | 
28 | 
29 | pred = automl.predict(X_test)
30 | print("Test accuracy", accuracy_score(y_test, pred))
31 | 


--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_random.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | from sklearn.metrics import accuracy_score
 5 | import os
 6 | 
 7 | nrows = 100
 8 | ncols = 3
 9 | X = np.random.rand(nrows, ncols)
10 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(ncols)])
11 | y = np.random.randint(0, 2, nrows)
12 | # y = np.random.permutation(["a", "B"] * 50)
13 | 
14 | automl = AutoML(model_time_limit=10)  # , algorithms=["Decision Tree"])
15 | automl.fit(X, y)
16 | print("Train accuracy", accuracy_score(y, automl.predict_all(X)["label"]))
17 | 
18 | # X = np.random.rand(1000, 10)
19 | # X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
20 | # y = np.random.randint(0, 2, 1000)
21 | # print("Test accuracy", accuracy_score(y, automl.predict(X)["label"]))
22 | 


--------------------------------------------------------------------------------
/examples/scripts/multi_class_classifier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from supervised.automl import AutoML
 4 | import supervised
 5 | 
 6 | 
 7 | import warnings
 8 | 
 9 | from sklearn import datasets
10 | from sklearn.pipeline import make_pipeline
11 | from sklearn.decomposition import PCA
12 | 
13 | from supervised import AutoML
14 | from supervised.exceptions import AutoMLException
15 | 
16 | df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
17 | X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
18 | y = df["class"]
19 | 
20 | automl = AutoML()
21 | 
22 | automl.fit(X, y)
23 | 
24 | predictions = automl.predict_all(X)
25 | 
26 | print(predictions.head())
27 | print(predictions.tail())
28 | 
29 | print(X.shape)
30 | print(predictions.shape)
31 | 


--------------------------------------------------------------------------------
/examples/scripts/multi_class_classifier_MNIST.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from supervised.automl import AutoML
 4 | 
 5 | 
 6 | from supervised.utils.config import mem
 7 | 
 8 | 
 9 | df = pd.read_csv("tests/data/MNIST/train.csv")
10 | 
11 | X = df[[f for f in df.columns if "pixel" in f]]
12 | y = df["label"]
13 | 
14 | for _ in range(4):
15 |     X = pd.concat([X, X], axis=0)
16 |     y = pd.concat([y, y], axis=0)
17 | 
18 | 
19 | mem()
20 | 
21 | 
22 | automl = AutoML(
23 |     # results_path="AutoML_12",
24 |     total_time_limit=60 * 60,
25 |     start_random_models=5,
26 |     hill_climbing_steps=2,
27 |     top_models_to_improve=3,
28 |     train_ensemble=True,
29 | )
30 | 
31 | mem()
32 | print("Start fit")
33 | automl.fit(X, y)
34 | 
35 | test = pd.read_csv("tests/data/MNIST/test.csv")
36 | predictions = automl.predict(test)
37 | 
38 | print(predictions.head())
39 | print(predictions.tail())
40 | 
41 | sub = pd.DataFrame({"ImageId": 0, "Label": predictions["label"]})
42 | sub["ImageId"] = sub.index + 1
43 | sub.to_csv("sub1.csv", index=False)
44 | 


--------------------------------------------------------------------------------
/examples/scripts/multi_class_classifier_digits.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | # scikit learn utilites
 4 | from sklearn.datasets import load_digits
 5 | from sklearn.metrics import accuracy_score
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | # mljar-supervised package
 9 | from supervised.automl import AutoML
10 | 
11 | # Load the data
12 | digits = load_digits()
13 | X_train, X_test, y_train, y_test = train_test_split(
14 |     pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25
15 | )
16 | 
17 | # train models
18 | automl = AutoML(mode="Perform")
19 | automl.fit(X_train, y_train)
20 | 
21 | # compute the accuracy on test data
22 | predictions = automl.predict(X_test)
23 | print(predictions.head())
24 | print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int)))
25 | 


--------------------------------------------------------------------------------
/examples/scripts/multi_class_drug_fairness.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | from supervised import AutoML
 5 | 
 6 | 
 7 | df = pd.read_csv("tests/data/Drug/Drug_Consumption.csv")
 8 | 
 9 | 
10 | X = df[df.columns[1:13]]
11 | 
12 | # convert to 3 classes
13 | df = df.replace(
14 |     {
15 |         "Cannabis": {
16 |             "CL0": "never_used",
17 |             "CL1": "not_in_last_year",
18 |             "CL2": "not_in_last_year",
19 |             "CL3": "used_in_last_year",
20 |             "CL4": "used_in_last_year",
21 |             "CL5": "used_in_last_year",
22 |             "CL6": "used_in_last_year",
23 |         }
24 |     }
25 | )
26 | 
27 | y = df["Cannabis"]
28 | 
29 | # maybe should be 
30 | # The binary sensitive feature is education level (college degree or not).
31 | # like in 
32 | # Fairness guarantee in multi-class classification
33 | sensitive_features = df["Gender"]
34 | 
35 | 
36 | automl = AutoML(
37 |     algorithms=["Xgboost"],
38 |     train_ensemble=True,
39 |     start_random_models=3,
40 |     hill_climbing_steps=3,
41 |     top_models_to_improve=2,
42 |     fairness_threshold=0.8,
43 |     explain_level=1
44 | )
45 | automl.fit(X, y, sensitive_features=sensitive_features)
46 | 


--------------------------------------------------------------------------------
/examples/scripts/regression.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | df = pd.read_csv("./tests/data/housing_regression_missing_values_missing_target.csv")
 6 | x_cols = [c for c in df.columns if c != "MEDV"]
 7 | X = df[x_cols]
 8 | y = df["MEDV"]
 9 | 
10 | automl = AutoML()
11 | automl.fit(X, y)
12 | 
13 | df["predictions"] = automl.predict(X)
14 | print("Predictions")
15 | print(df[["MEDV", "predictions"]].head())
16 | 


--------------------------------------------------------------------------------
/examples/scripts/regression_acs_fairness.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | # to get data
 6 | # from fairlearn.datasets import fetch_acs_income
 7 | # df = fetch_acs_income(as_frame=True)
 8 | # df["frame"].to_csv("acs_income.csv", index=False)
 9 | 
10 | df = pd.read_csv("tests/data/acs_income_1k.csv")
11 | 
12 | print(df)
13 | 
14 | x_cols = [c for c in df.columns if c != "PINCP"]
15 | 
16 | sensitive_features = df["SEX"].astype(str)
17 | 
18 | X = df[x_cols]
19 | y = df["PINCP"]
20 | 
21 | automl = AutoML(
22 |     algorithms=["Xgboost", "LightGBM"],
23 |     train_ensemble=True,
24 |     fairness_threshold=0.91,
25 |     # underprivileged_groups=[{"SEX": "1.0"}],
26 |     # privileged_groups=[{"SEX": "2.0"}]
27 | )
28 | automl.fit(X, y, sensitive_features=sensitive_features)
29 | 


--------------------------------------------------------------------------------
/examples/scripts/regression_crime_fairness.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | # data source http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized
 6 | 
 7 | df = pd.read_csv("tests/data/CrimeData/crimedata.csv", na_values=["?"])
 8 | 
 9 | X = df[df.columns[5:129]]
10 | y = df["ViolentCrimesPerPop"]
11 | 
12 | sensitive_features = (df["racePctWhite"] > 84).astype(str)
13 | 
14 | automl = AutoML(
15 |     #algorithms=["Decision Tree", "Neural Network", "Xgboost", "Linear", "CatBoost"],
16 |     algorithms=["Xgboost", "Linear", "CatBoost"],
17 |     train_ensemble=True,
18 |     fairness_threshold=0.5,
19 | )
20 | automl.fit(X, y, sensitive_features=sensitive_features)
21 | 


--------------------------------------------------------------------------------
/examples/scripts/regression_housing_fairness.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | df = pd.read_csv("./tests/data/boston_housing.csv")
 6 | x_cols = [c for c in df.columns if c != "MEDV"]
 7 | 
 8 | df["large_B"] = (df["B"] > 380) * 1
 9 | df["large_B"] = df["large_B"].astype(str)
10 | 
11 | 
12 | print(df["large_B"].dtype.name)
13 | sensitive_features = df["large_B"]
14 | 
15 | X = df[x_cols]
16 | y = df["MEDV"]
17 | 
18 | automl = AutoML(
19 |     algorithms=["Xgboost", "LightGBM"],
20 |     train_ensemble=True,
21 |     fairness_threshold=0.9,
22 | )
23 | automl.fit(X, y, sensitive_features=sensitive_features)
24 | 
25 | df["predictions"] = automl.predict(X)
26 | print("Predictions")
27 | print(df[["MEDV", "predictions"]].head())
28 | 


--------------------------------------------------------------------------------
/examples/scripts/regression_law_school_fairness.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | df = pd.read_csv("tests/data/LawSchool/bar_pass_prediction.csv")
 6 | df["race1"][df["race1"] != "white"] = "non-white"  # keep it as binary feature
 7 | 
 8 | X = df[["gender", "lsat", "race1", "pass_bar"]]
 9 | y = df["gpa"]
10 | 
11 | sensitive_features = df["race1"]
12 | 
13 | automl = AutoML(
14 |     algorithms=["Xgboost", "LightGBM", "Extra Trees"],
15 |     train_ensemble=True,
16 |     fairness_threshold=0.9,
17 | )
18 | automl.fit(X, y, sensitive_features=sensitive_features)
19 | 


--------------------------------------------------------------------------------
/examples/scripts/tabular_mar_2021.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from supervised import AutoML
 3 | 
 4 | train = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/train.csv")
 5 | test = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/test.csv")
 6 | 
 7 | X_train = train.drop(["id", "target"], axis=1)
 8 | y_train = train.target
 9 | X_test = test.drop(["id"], axis=1)
10 | 
11 | automl = AutoML(
12 |     mode="Optuna",
13 |     eval_metric="auc",
14 |     algorithms=["CatBoost"],
15 |     optuna_time_budget=1800,  # tune each algorithm for 30 minutes
16 |     total_time_limit=48
17 |     * 3600,  # total time limit, set large enough to have time to compute all steps
18 |     features_selection=False,
19 | )
20 | automl.fit(X_train, y_train)
21 | 
22 | preds = automl.predict_proba(X_test)
23 | submission = pd.DataFrame({"id": test.id, "target": preds[:, 1]})
24 | submission.to_csv("1_submission.csv", index=False)
25 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -p no:warnings


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy>=1.19.5,<2
 2 | pandas>=2.0.0
 3 | scipy>=1.6.1
 4 | scikit-learn>=1.5.0
 5 | xgboost>=2.0.0
 6 | lightgbm>=3.0.0
 7 | catboost>=0.24.4
 8 | joblib>=1.0.1
 9 | tabulate>=0.8.7
10 | matplotlib>=3.2.2
11 | dtreeviz>=2.2.2
12 | shap>=0.42.1
13 | seaborn>=0.11.1
14 | wordcloud>=1.8.1
15 | category_encoders>=2.2.2
16 | optuna-integration>=3.6.0
17 | mljar-scikit-plot>=0.3.11
18 | markdown
19 | typing-extensions
20 | ipython
21 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | black
3 | pytest-cov
4 | coveralls


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from codecs import open
 3 | from os import path
 4 | 
 5 | here = path.abspath(path.dirname(__file__))
 6 | 
 7 | # Get the long description from the README file
 8 | with open(path.join(here, "README.md"), encoding="utf-8") as f:
 9 |     long_description = f.read()
10 | 
11 | setup(
12 |     name="mljar-supervised",
13 |     version="1.1.17",
14 |     description="Automated Machine Learning for Humans",
15 |     long_description=long_description,
16 |     long_description_content_type="text/markdown",
17 |     url="https://github.com/mljar/mljar-supervised",
18 |     author="MLJAR, Sp. z o.o.",
19 |     author_email="contact@mljar.com",
20 |     license="MIT",
21 |     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
22 |     install_requires=open("requirements.txt").readlines(),
23 |     include_package_data=True,
24 |     python_requires='>=3.8',
25 |     classifiers=[
26 |         "Programming Language :: Python",
27 |         "Programming Language :: Python :: 3.8",
28 |         "Programming Language :: Python :: 3.9",
29 |         "Programming Language :: Python :: 3.10",
30 |         "Programming Language :: Python :: 3.11",
31 |     ],
32 |     keywords=[
33 |         "automated machine learning",
34 |         "automl",
35 |         "machine learning",
36 |         "data science",
37 |         "data mining",
38 |         "mljar",
39 |         "random forest",
40 |         "decision tree",
41 |         "xgboost",
42 |         "lightgbm",
43 |         "catboost",
44 |         "neural network",
45 |         "extra trees",
46 |         "linear model",
47 |         "features selection",
48 |         "features engineering"
49 |     ],
50 | )
51 | 


--------------------------------------------------------------------------------
/supervised/__init__.py:
--------------------------------------------------------------------------------
# Package version. setup.py declares the same version string separately,
# so both places have to be updated together on a release.
__version__ = "1.1.17"
2 | 
3 | from supervised.automl import AutoML
4 | 


--------------------------------------------------------------------------------
/supervised/algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/algorithms/__init__.py


--------------------------------------------------------------------------------
/supervised/algorithms/baseline.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import sklearn
 4 | from sklearn.base import ClassifierMixin, RegressorMixin
 5 | from sklearn.dummy import DummyClassifier, DummyRegressor
 6 | 
 7 | from supervised.algorithms.registry import (
 8 |     BINARY_CLASSIFICATION,
 9 |     MULTICLASS_CLASSIFICATION,
10 |     REGRESSION,
11 |     AlgorithmsRegistry,
12 | )
13 | from supervised.algorithms.sklearn import SklearnAlgorithm
14 | from supervised.utils.config import LOG_LEVEL
15 | 
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(LOG_LEVEL)
18 | 
19 | 
class BaselineClassifierAlgorithm(ClassifierMixin, SklearnAlgorithm):
    """Baseline classifier backed by sklearn's ``DummyClassifier`` with the "prior" strategy."""

    algorithm_name = "Baseline Classifier"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        """Build the dummy model; ``params["seed"]`` (default 1) seeds it."""
        super().__init__(params)
        logger.debug("BaselineClassifierAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # ``additional`` is the module-level dict defined below the classes.
        self.max_iters = additional.get("max_steps", 1)
        seed = params.get("seed", 1)
        self.model = DummyClassifier(strategy="prior", random_state=seed)

    def file_extension(self):
        """Return the file extension used for this algorithm."""
        return "baseline"

    def is_fitted(self):
        """Tell whether the model exposes a positive ``n_outputs_`` attribute."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0
43 | 
44 | 
class BaselineRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
    """Baseline regressor backed by sklearn's ``DummyRegressor`` with the "mean" strategy."""

    algorithm_name = "Baseline Regressor"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        """Build the dummy model; no hyperparameters are read from ``params`` here."""
        super().__init__(params)
        logger.debug("BaselineRegressorAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # ``additional`` is the module-level dict defined below the classes.
        self.max_iters = additional.get("max_steps", 1)
        self.model = DummyRegressor(strategy="mean")

    def file_extension(self):
        """Return the file extension used for this algorithm."""
        return "baseline"

    def is_fitted(self):
        """Tell whether the model exposes a positive ``n_outputs_`` attribute."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0
66 | 
67 | 
# Settings shared by both baseline algorithms; the constructors read
# "max_steps" from this dict.
additional = {"max_steps": 1, "max_rows_limit": None, "max_cols_limit": None}
required_preprocessing = ["target_as_integer"]

# The classifier is registered for both classification tasks.
AlgorithmsRegistry.add(
    task_name=BINARY_CLASSIFICATION,
    model_class=BaselineClassifierAlgorithm,
    model_params={},
    required_preprocessing=required_preprocessing,
    additional=additional,
    default_params={},
)

AlgorithmsRegistry.add(
    task_name=MULTICLASS_CLASSIFICATION,
    model_class=BaselineClassifierAlgorithm,
    model_params={},
    required_preprocessing=required_preprocessing,
    additional=additional,
    default_params={},
)

# The regressor is registered with an empty required-preprocessing value.
AlgorithmsRegistry.add(
    task_name=REGRESSION,
    model_class=BaselineRegressorAlgorithm,
    model_params={},
    required_preprocessing={},
    additional=additional,
    default_params={},
)
91 | 


--------------------------------------------------------------------------------
/supervised/algorithms/factory.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from supervised.algorithms.registry import BINARY_CLASSIFICATION, AlgorithmsRegistry
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | from supervised.exceptions import AutoMLException
 8 | 
 9 | 
class AlgorithmFactory(object):
    """Creates algorithm instances from the classes held in ``AlgorithmsRegistry``."""

    @classmethod
    def get_algorithm(cls, params):
        """Instantiate the algorithm described by ``params``.

        ``params["model_type"]`` (default "Xgboost") and ``params["ml_task"]``
        (default binary classification) select the registered class, which is
        then constructed with ``params``.

        Raises:
            AutoMLException: if the lookup or the construction fails; the
                original exception is attached as the cause.
        """
        alg_type = params.get("model_type", "Xgboost")
        ml_task = params.get("ml_task", BINARY_CLASSIFICATION)

        try:
            Algorithm = AlgorithmsRegistry.get_algorithm_class(ml_task, alg_type)
            return Algorithm(params)
        except Exception as e:
            # Chain explicitly so the root cause stays in the traceback.
            raise AutoMLException(f"Cannot get algorithm class. {str(e)}") from e

    @classmethod
    def load(cls, json_desc, learner_path, lazy_load):
        """Recreate a learner from its JSON description.

        The learner is built from ``json_desc["params"]``, configured with
        ``set_params(json_desc, learner_path)`` and, unless ``lazy_load`` is
        truthy, reloaded immediately via ``reload()``.
        """
        learner = cls.get_algorithm(json_desc.get("params"))
        learner.set_params(json_desc, learner_path)
        if not lazy_load:
            learner.reload()
        return learner
29 | 


--------------------------------------------------------------------------------
/supervised/algorithms/knn.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | import sklearn
  4 | from sklearn.base import ClassifierMixin, RegressorMixin
  5 | from sklearn.model_selection import train_test_split
  6 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
  7 | 
  8 | from supervised.algorithms.registry import (
  9 |     BINARY_CLASSIFICATION,
 10 |     MULTICLASS_CLASSIFICATION,
 11 |     REGRESSION,
 12 |     AlgorithmsRegistry,
 13 | )
 14 | from supervised.algorithms.sklearn import SklearnAlgorithm
 15 | from supervised.utils.config import LOG_LEVEL
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | logger.setLevel(LOG_LEVEL)
 19 | 
 20 | 
 21 | KNN_ROWS_LIMIT = 1000
 22 | 
 23 | 
 24 | class KNNFit(SklearnAlgorithm):
 25 |     def file_extension(self):
 26 |         return "k_neighbors"
 27 | 
 28 |     def is_fitted(self):
 29 |         return (
 30 |             hasattr(self.model, "n_samples_fit_")
 31 |             and self.model.n_samples_fit_ is not None
 32 |             and self.model.n_samples_fit_ > 0
 33 |         )
 34 | 
 35 |     def fit(
 36 |         self,
 37 |         X,
 38 |         y,
 39 |         sample_weight=None,
 40 |         X_validation=None,
 41 |         y_validation=None,
 42 |         sample_weight_validation=None,
 43 |         log_to_file=None,
 44 |         max_time=None,
 45 |     ):
 46 |         rows_limit = self.params.get("rows_limit", KNN_ROWS_LIMIT)
 47 |         if X.shape[0] > rows_limit:
 48 |             X1, _, y1, _ = train_test_split(
 49 |                 X, y, train_size=rows_limit, stratify=y, random_state=1234
 50 |             )
 51 |             self.model.fit(X1, y1)
 52 |         else:
 53 |             self.model.fit(X, y)
 54 | 
 55 |     @property
 56 |     def _classes(self):
 57 |         # Returns the unique classes based on the fitted model
 58 |         if hasattr(self.model, "classes_"):
 59 |             return self.model.classes_
 60 |         else:
 61 |             return None
 62 | 
 63 | 
 64 | class KNeighborsAlgorithm(ClassifierMixin, KNNFit):
 65 |     algorithm_name = "k-Nearest Neighbors"
 66 |     algorithm_short_name = "Nearest Neighbors"
 67 | 
 68 |     def __init__(self, params):
 69 |         super(KNeighborsAlgorithm, self).__init__(params)
 70 |         logger.debug("KNeighborsAlgorithm.__init__")
 71 |         self.library_version = sklearn.__version__
 72 |         self.max_iters = 1
 73 |         self.model = KNeighborsClassifier(
 74 |             n_neighbors=params.get("n_neighbors", 3),
 75 |             weights=params.get("weights", "uniform"),
 76 |             algorithm="kd_tree",
 77 |             n_jobs=params.get("n_jobs", -1),
 78 |         )
 79 | 
 80 | 
 81 | class KNeighborsRegressorAlgorithm(RegressorMixin, KNNFit):
 82 |     algorithm_name = "k-Nearest Neighbors"
 83 |     algorithm_short_name = "Nearest Neighbors"
 84 | 
 85 |     def __init__(self, params):
 86 |         super(KNeighborsRegressorAlgorithm, self).__init__(params)
 87 |         logger.debug("KNeighborsRegressorAlgorithm.__init__")
 88 |         self.library_version = sklearn.__version__
 89 |         self.max_iters = 1
 90 |         self.model = KNeighborsRegressor(
 91 |             n_neighbors=params.get("n_neighbors", 3),
 92 |             weights=params.get("weights", "uniform"),
 93 |             algorithm="ball_tree",
 94 |             n_jobs=params.get("n_jobs", -1),
 95 |         )
 96 | 
 97 | 
 98 | knn_params = {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}
 99 | 
100 | default_params = {"n_neighbors": 5, "weights": "uniform"}
101 | 
102 | additional = {"max_rows_limit": 100000, "max_cols_limit": 100}
103 | 
104 | required_preprocessing = [
105 |     "missing_values_inputation",
106 |     "convert_categorical",
107 |     "datetime_transform",
108 |     "text_transform",
109 |     "scale",
110 |     "target_as_integer",
111 | ]
112 | 
113 | AlgorithmsRegistry.add(
114 |     BINARY_CLASSIFICATION,
115 |     KNeighborsAlgorithm,
116 |     knn_params,
117 |     required_preprocessing,
118 |     additional,
119 |     default_params,
120 | )
121 | AlgorithmsRegistry.add(
122 |     MULTICLASS_CLASSIFICATION,
123 |     KNeighborsAlgorithm,
124 |     knn_params,
125 |     required_preprocessing,
126 |     additional,
127 |     default_params,
128 | )
129 | 
130 | AlgorithmsRegistry.add(
131 |     REGRESSION,
132 |     KNeighborsRegressorAlgorithm,
133 |     knn_params,
134 |     required_preprocessing,
135 |     additional,
136 |     default_params,
137 | )
138 | 


--------------------------------------------------------------------------------
/supervised/algorithms/registry.py:
--------------------------------------------------------------------------------
 1 | # tasks that can be handled by the package
 2 | BINARY_CLASSIFICATION = "binary_classification"
 3 | MULTICLASS_CLASSIFICATION = "multiclass_classification"
 4 | REGRESSION = "regression"
 5 | 
 6 | class AlgorithmsRegistry:
 7 |     registry = {
 8 |         BINARY_CLASSIFICATION: {},
 9 |         MULTICLASS_CLASSIFICATION: {},
10 |         REGRESSION: {},
11 |     }
12 | 
13 |     @staticmethod
14 |     def add(
15 |         task_name,
16 |         model_class,
17 |         model_params,
18 |         required_preprocessing,
19 |         additional,
20 |         default_params,
21 |     ):
22 |         model_information = {
23 |             "class": model_class,
24 |             "params": model_params,
25 |             "required_preprocessing": required_preprocessing,
26 |             "additional": additional,
27 |             "default_params": default_params,
28 |         }
29 |         AlgorithmsRegistry.registry[task_name][
30 |             model_class.algorithm_short_name
31 |         ] = model_information
32 | 
33 |     @staticmethod
34 |     def get_supported_ml_tasks():
35 |         return AlgorithmsRegistry.registry.keys()
36 | 
37 |     @staticmethod
38 |     def get_algorithm_class(ml_task, algorithm_name):
39 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name]["class"]
40 | 
41 |     @staticmethod
42 |     def get_long_name(ml_task, algorithm_name):
43 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name][
44 |             "class"
45 |         ].algorithm_name
46 | 
47 |     @staticmethod
48 |     def get_max_rows_limit(ml_task, algorithm_name):
49 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
50 |             "max_rows_limit"
51 |         ]
52 | 
53 |     @staticmethod
54 |     def get_max_cols_limit(ml_task, algorithm_name):
55 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
56 |             "max_cols_limit"
57 |         ]
58 | 
59 |     @staticmethod
60 |     def get_eval_metric(algorithm_name, ml_task, automl_eval_metric):
61 |         if algorithm_name == "Xgboost":
62 |             return xgboost_eval_metric(ml_task, automl_eval_metric)
63 | 
64 |         return automl_eval_metric
65 | 
66 | # Import algorithm to be registered
67 | import supervised.algorithms.baseline
68 | import supervised.algorithms.catboost
69 | import supervised.algorithms.decision_tree
70 | import supervised.algorithms.extra_trees
71 | import supervised.algorithms.knn
72 | import supervised.algorithms.lightgbm
73 | import supervised.algorithms.linear
74 | import supervised.algorithms.nn
75 | import supervised.algorithms.random_forest
76 | import supervised.algorithms.xgboost


--------------------------------------------------------------------------------
/supervised/callbacks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/callbacks/__init__.py


--------------------------------------------------------------------------------
/supervised/callbacks/callback.py:
--------------------------------------------------------------------------------
 1 | class Callback(object):
 2 |     def __init__(self, params):
 3 |         self.params = params
 4 |         self.learners = []
 5 |         self.learner = None  # current learner
 6 |         self.name = "callback"
 7 | 
 8 |     def add_and_set_learner(self, learner):
 9 |         self.learners += [learner]
10 |         self.learner = learner
11 | 
12 |     def on_learner_train_start(self, logs):
13 |         pass
14 | 
15 |     def on_learner_train_end(self, logs):
16 |         pass
17 | 
18 |     def on_iteration_start(self, logs):
19 |         pass
20 | 
21 |     def on_iteration_end(self, logs, predictions):
22 |         pass
23 | 
24 |     def on_framework_train_end(self, logs):
25 |         pass
26 | 


--------------------------------------------------------------------------------
/supervised/callbacks/callback_list.py:
--------------------------------------------------------------------------------
 1 | class CallbackList(object):
 2 |     def __init__(self, callbacks=[]):
 3 |         self.callbacks = callbacks
 4 | 
 5 |     def add_and_set_learner(self, learner):
 6 |         for cb in self.callbacks:
 7 |             cb.add_and_set_learner(learner)
 8 | 
 9 |     def on_learner_train_start(self, logs=None):
10 |         for cb in self.callbacks:
11 |             cb.on_learner_train_start(logs)
12 | 
13 |     def on_learner_train_end(self, logs=None):
14 |         for cb in self.callbacks:
15 |             cb.on_learner_train_end(logs)
16 | 
17 |     def on_iteration_start(self, logs=None):
18 |         for cb in self.callbacks:
19 |             cb.on_iteration_start(logs)
20 | 
21 |     def on_iteration_end(self, logs=None, predictions=None):
22 |         for cb in self.callbacks:
23 |             cb.on_iteration_end(logs, predictions)
24 | 
25 |     def on_framework_train_end(self, logs=None):
26 |         for cb in self.callbacks:
27 |             cb.on_framework_train_end(logs)
28 | 
29 |     def get(self, callback_name):
30 |         for cb in self.callbacks:
31 |             if cb.name == callback_name:
32 |                 return cb
33 |         return None
34 | 


--------------------------------------------------------------------------------
/supervised/callbacks/learner_time_constraint.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import time
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised.callbacks.callback import Callback
 7 | from supervised.utils.config import LOG_LEVEL
 8 | 
 9 | log = logging.getLogger(__name__)
10 | log.setLevel(LOG_LEVEL)
11 | 
12 | 
class LearnerTimeConstraint(Callback):
    """Stops the current learner once its training time reaches a limit.

    Parameters read from ``params``: ``name`` (default
    "learner_time_constraint"), ``min_steps`` (iterations that must finish
    before the limit is checked) and ``learner_time_limit`` (seconds).
    """

    def __init__(self, params=None):
        # A fresh dict per instance: the former ``params={}`` default was one
        # shared object that ends up stored on every instance.
        params = {} if params is None else params
        super(LearnerTimeConstraint, self).__init__(params)
        self.name = params.get("name", "learner_time_constraint")
        self.min_steps = params.get("min_steps")
        self.learner_time_limit = params.get("learner_time_limit")  # in seconds
        self.iterations_count = 0

    def on_learner_train_start(self, logs):
        """Start the learner clock and reset the iteration counter."""
        self.train_start_time = time.time()
        self.iterations_count = 0

    def on_iteration_start(self, logs):
        """Remember when the current iteration started."""
        self.iter_start_time = time.time()

    def on_iteration_end(self, logs, predictions):
        """Count the iteration and set ``stop_training`` when the limit is hit."""
        self.iterations_count += 1
        iteration_elapsed_time = np.round(time.time() - self.iter_start_time, 2)
        learner_elapsed_time = np.round(time.time() - self.train_start_time, 2)
        # Lazy %-formatting: the message is only built when DEBUG is enabled.
        log.debug(
            "Iteration %s took %s seconds, learner training time %s seconds",
            self.iterations_count,
            iteration_elapsed_time,
            learner_elapsed_time,
        )

        # Never stop before the minimum number of iterations has run.
        if self.min_steps is not None and self.iterations_count < self.min_steps:
            return

        if (
            self.learner_time_limit is not None
            and learner_elapsed_time >= self.learner_time_limit
        ):
            self.learner.stop_training = True
            log.info("Terminating learning, time limit reached")
48 | 


--------------------------------------------------------------------------------
/supervised/callbacks/max_iters_constraint.py:
--------------------------------------------------------------------------------
 1 | from supervised.callbacks.callback import Callback
 2 | 
 3 | 
 4 | class MaxItersConstraint(Callback):
 5 |     def __init__(self, params):
 6 |         super(MaxItersConstraint, self).__init__(params)
 7 |         self.name = params.get("name", "max_iters_constraint")
 8 |         self.max_iters = params.get("max_iters", 10)
 9 | 
10 |     def add_and_set_learner(self, learner):
11 |         self.learner = learner
12 | 
13 |     def on_iteration_end(self, logs, predictions):
14 |         # iters are computed starting from 0
15 |         if logs.get("iter_cnt") + 1 >= self.max_iters:
16 |             self.learner.stop_training = True
17 | 


--------------------------------------------------------------------------------
/supervised/callbacks/metric_logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | from supervised.callbacks.callback import Callback
 6 | from supervised.utils.metric import Metric
 7 | 
 8 | 
 9 | class MetricLogger(Callback):
10 |     def __init__(self, params):
11 |         super(MetricLogger, self).__init__(params)
12 |         self.name = params.get("name", "metric_logger")
13 |         self.loss_values = {}
14 |         self.metrics = []
15 |         for metric_name in params.get("metric_names"):
16 |             self.metrics += [Metric({"name": metric_name})]
17 | 
18 |     def add_and_set_learner(self, learner):
19 |         self.loss_values[learner.uid] = {"train": {}, "validation": {}, "iters": []}
20 |         for metric in self.metrics:
21 |             self.loss_values[learner.uid]["train"][metric.name] = []
22 |             self.loss_values[learner.uid]["validation"][metric.name] = []
23 | 
24 |         self.current_learner_uid = learner.uid
25 | 
26 |     def on_iteration_end(self, logs, predictions):
27 |         for metric in self.metrics:
28 |             train_loss = 0
29 |             if predictions.get("y_train_predicted") is not None:
30 |                 train_loss = metric(
31 |                     predictions.get("y_train_true"),
32 |                     predictions.get("y_train_predicted"),
33 |                 )
34 |             validation_loss = metric(
35 |                 predictions.get("y_validation_true"),
36 |                 predictions.get("y_validation_predicted"),
37 |             )
38 |             self.loss_values[self.current_learner_uid]["train"][metric.name] += [
39 |                 train_loss
40 |             ]
41 |             self.loss_values[self.current_learner_uid]["validation"][metric.name] += [
42 |                 validation_loss
43 |             ]
44 |             # keep information about iter number only once :)
45 |             if metric == self.metrics[0]:
46 |                 self.loss_values[self.current_learner_uid]["iters"] += [
47 |                     logs.get("iter_cnt")
48 |                 ]
49 | 


--------------------------------------------------------------------------------
/supervised/callbacks/terminate_on_nan.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | import numpy as np
 6 | 
 7 | from supervised.callbacks.callback import Callback
 8 | 
 9 | 
class TerminateOnNan(Callback):
    """Stops the current learner when the monitored metric is NaN or infinite.

    ``params["metric_name"]`` names the metric; ``params["name"]`` optionally
    overrides the callback name (default "terminate_on_nan").
    """

    def __init__(self, learner, params):
        # Metric was referenced without being imported in this module.
        from supervised.utils.metric import Metric

        # Callback.__init__ takes only ``params``; passing ``learner`` as
        # well raised TypeError.
        super(TerminateOnNan, self).__init__(params)
        self.name = params.get("name", "terminate_on_nan")
        if learner is not None:
            self.add_and_set_learner(learner)
        # Metric is built from a dict with a "name" key, the same way
        # MetricLogger builds it.
        self.metric = Metric({"name": params.get("metric_name")})

    def on_iteration_end(self, iter_cnt, data):
        """Compute train/validation loss and flag the learner on invalid values.

        The train loss is treated as 0 when ``data`` has no
        "y_train_predicted" entry.
        """
        loss_train = 0
        if data.get("y_train_predicted") is not None:
            loss_train = self.metric(
                data.get("y_train_true"), data.get("y_train_predicted")
            )
        loss_validation = self.metric(
            data.get("y_validation_true"), data.get("y_validation_predicted")
        )

        for loss in (loss_train, loss_validation):
            # isfinite is False for NaN, +inf and -inf.
            if not np.isfinite(loss):
                self.learner.stop_training = True
                log.info("Terminating learning, invalid loss value")
                break  # one invalid value is enough; avoid logging twice
29 | 


--------------------------------------------------------------------------------
/supervised/callbacks/total_time_constraint.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import time
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised.callbacks.callback import Callback
 7 | from supervised.exceptions import NotTrainedException
 8 | from supervised.utils.config import LOG_LEVEL
 9 | 
10 | log = logging.getLogger(__name__)
11 | log.setLevel(LOG_LEVEL)
12 | 
13 | 
class TotalTimeConstraint(Callback):
    """Enforces the overall AutoML time budget while learners are trained.

    Parameters read from ``params``: ``name`` (default
    "total_time_constraint"), ``total_time_limit`` (seconds, ``None`` turns
    the checks off), ``total_time_start`` (timestamp the budget is measured
    from) and ``expected_learners_cnt`` (default 1).
    """

    def __init__(self, params=None):
        # A fresh dict per instance: the former ``params={}`` default was one
        # shared object that ends up stored on every instance.
        params = {} if params is None else params
        super(TotalTimeConstraint, self).__init__(params)
        self.name = params.get("name", "total_time_constraint")
        self.total_time_limit = params.get("total_time_limit")
        self.total_time_start = params.get("total_time_start")
        self.expected_learners_cnt = params.get("expected_learners_cnt", 1)

    def on_learner_train_start(self, logs):
        """Remember when the current learner started training."""
        self.train_start_time = time.time()

    def on_learner_train_end(self, logs):
        """Abort training when the budget cannot be, or was not, kept.

        Raises:
            NotTrainedException: after the first of several learners when the
                projected time for all learners reaches the limit, or after
                any learner but the last when the elapsed time exceeds the
                limit by more than 600 seconds.
        """
        if self.total_time_limit is None:
            return

        # just check for the first learner
        # need to have more than 1 learner
        # otherwise it is a finish of the training
        if len(self.learners) == 1 and self.expected_learners_cnt > 1:
            one_fold_time = time.time() - self.train_start_time
            total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

            # we need to add time for the rest of learners (assuming that all folds training time is the same)
            estimate_elapsed_time = total_elapsed_time + one_fold_time * (
                self.expected_learners_cnt - 1
            )

            if estimate_elapsed_time >= self.total_time_limit:
                raise NotTrainedException(
                    "Stop training after the first fold. "
                    f"Time needed to train on the first fold {np.round(one_fold_time)} seconds. "
                    "The time estimate for training on all folds is larger than total_time_limit."
                )

        # dont stop for last learner, we are finishing anyway
        if len(self.learners) < self.expected_learners_cnt:
            total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

            if total_elapsed_time > self.total_time_limit + 600:
                # add 10 minutes of margin
                # margin is added because of unexpected time changes
                # if training on each fold will be the same
                # then the training will be stopped after first fold (above condition)
                raise NotTrainedException(
                    "Force to stop the training. "
                    "Total time for AutoML training already exceeded."
                )

    def on_iteration_end(self, logs, predictions):
        """Log elapsed time and set ``stop_training`` once the budget is used up."""
        total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

        if self.total_time_limit is not None:
            log.debug(
                f"Total elapsed time {total_elapsed_time} seconds. "
                + f"Time left {np.round(self.total_time_limit - total_elapsed_time, 2)} seconds."
            )
            # not time left, stop now
            if total_elapsed_time >= self.total_time_limit:
                self.learner.stop_training = True
        else:
            log.debug(f"Total elapsed time {total_elapsed_time} seconds")
80 | 


--------------------------------------------------------------------------------
/supervised/exceptions.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from supervised.utils.config import LOG_LEVEL
 4 | 
# Import-time side effect: set up basic logging with a timestamped format and
# an ERROR threshold.
logging.basicConfig(
    format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR
)
# Module-level logger used by the exception classes below; its level is taken
# from the package configuration (LOG_LEVEL).
logger = logging.getLogger(__name__)
logger.setLevel(LOG_LEVEL)
10 | 
11 | 
class AutoMLException(Exception):
    """Exception whose message is written to the module logger at ERROR level
    as soon as the exception object is created."""

    def __init__(self, message):
        super().__init__(message)
        logger.error(message)
16 | 
17 | 
class NotTrainedException(Exception):
    """Exception whose message is written to the module logger at DEBUG level
    as soon as the exception object is created."""

    def __init__(self, message):
        super().__init__(message)
        logger.debug(message)
22 | 


--------------------------------------------------------------------------------
/supervised/fairness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/fairness/__init__.py


--------------------------------------------------------------------------------
/supervised/fairness/utils.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | def accuracy(t, y):
 5 |     return np.round(np.sum(t == y) / t.shape[0], 4)
 6 | 
 7 | 
 8 | def selection_rate(y):
 9 |     return np.round(
10 |         np.sum((y == 1)) / y.shape[0],
11 |         4,
12 |     )
13 | 
14 | 
def true_positive_rate(t, y):
    """Return count(y == 1 and t == 1) / count(t == 1), rounded to 4 decimals."""
    positives = t == 1
    hits = np.sum((y == 1) & positives)
    return np.round(hits / np.sum(positives), 4)
20 | 
21 | 
def false_positive_rate(t, y):
    """Return count(y == 1 and t == 0) / count(t == 0), rounded to 4 decimals."""
    negatives = t == 0
    false_alarms = np.sum((y == 1) & negatives)
    return np.round(false_alarms / np.sum(negatives), 4)
27 | 
28 | 
def true_negative_rate(t, y):
    """Return count(y == 0 and t == 0) / count(t == 0), rounded to 4 decimals."""
    negatives = t == 0
    correct_rejections = np.sum((y == 0) & negatives)
    return np.round(correct_rejections / np.sum(negatives), 4)
34 | 
35 | 
def false_negative_rate(t, y):
    """Return count(y == 0 and t == 1) / count(t == 1), rounded to 4 decimals."""
    positives = t == 1
    misses = np.sum((y == 0) & positives)
    return np.round(misses / np.sum(positives), 4)
41 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/preprocessing/__init__.py


--------------------------------------------------------------------------------
/supervised/preprocessing/datetime_transformer.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | 
  5 | class DateTimeTransformer(object):
  6 |     def __init__(self):
  7 |         self._new_columns = []
  8 |         self._old_column = None
  9 |         self._min_datetime = None
 10 |         self._transforms = []
 11 | 
 12 |     def fit(self, X, column):
 13 |         self._old_column = column
 14 |         self._min_datetime = np.min(X[column])
 15 | 
 16 |         values = X[column].dt.year
 17 |         if len(np.unique(values)) > 1:
 18 |             self._transforms += ["year"]
 19 |             new_column = column + "_Year"
 20 |             self._new_columns += [new_column]
 21 | 
 22 |         values = X[column].dt.month
 23 |         if len(np.unique(values)) > 1:
 24 |             self._transforms += ["month"]
 25 |             new_column = column + "_Month"
 26 |             self._new_columns += [new_column]
 27 | 
 28 |         values = X[column].dt.day
 29 |         if len(np.unique(values)) > 1:
 30 |             self._transforms += ["day"]
 31 |             new_column = column + "_Day"
 32 |             self._new_columns += [new_column]
 33 | 
 34 |         values = X[column].dt.weekday
 35 |         if len(np.unique(values)) > 1:
 36 |             self._transforms += ["weekday"]
 37 |             new_column = column + "_WeekDay"
 38 |             self._new_columns += [new_column]
 39 | 
 40 |         values = X[column].dt.dayofyear
 41 |         if len(np.unique(values)) > 1:
 42 |             self._transforms += ["dayofyear"]
 43 |             new_column = column + "_DayOfYear"
 44 |             self._new_columns += [new_column]
 45 | 
 46 |         values = X[column].dt.hour
 47 |         if len(np.unique(values)) > 1:
 48 |             self._transforms += ["hour"]
 49 |             new_column = column + "_Hour"
 50 |             self._new_columns += [new_column]
 51 | 
 52 |         values = (X[column] - self._min_datetime).dt.days
 53 |         if len(np.unique(values)) > 1:
 54 |             self._transforms += ["days_diff"]
 55 |             new_column = column + "_Days_Diff_To_Min"
 56 |             self._new_columns += [new_column]
 57 | 
 58 |     def transform(self, X):
 59 |         column = self._old_column
 60 | 
 61 |         if "year" in self._transforms:
 62 |             new_column = column + "_Year"
 63 |             X[new_column] = X[column].dt.year
 64 | 
 65 |         if "month" in self._transforms:
 66 |             new_column = column + "_Month"
 67 |             X[new_column] = X[column].dt.month
 68 | 
 69 |         if "day" in self._transforms:
 70 |             new_column = column + "_Day"
 71 |             X[new_column] = X[column].dt.day
 72 | 
 73 |         if "weekday" in self._transforms:
 74 |             new_column = column + "_WeekDay"
 75 |             X[new_column] = X[column].dt.weekday
 76 | 
 77 |         if "dayofyear" in self._transforms:
 78 |             new_column = column + "_DayOfYear"
 79 |             X[new_column] = X[column].dt.dayofyear
 80 | 
 81 |         if "hour" in self._transforms:
 82 |             new_column = column + "_Hour"
 83 |             X[new_column] = X[column].dt.hour
 84 | 
 85 |         if "days_diff" in self._transforms:
 86 |             new_column = column + "_Days_Diff_To_Min"
 87 |             X[new_column] = (X[column] - self._min_datetime).dt.days
 88 | 
 89 |         X.drop(column, axis=1, inplace=True)
 90 |         return X
 91 | 
 92 |     def to_json(self):
 93 |         data_json = {
 94 |             "new_columns": list(self._new_columns),
 95 |             "old_column": self._old_column,
 96 |             "min_datetime": str(self._min_datetime),
 97 |             "transforms": list(self._transforms),
 98 |         }
 99 |         return data_json
100 | 
101 |     def from_json(self, data_json):
102 |         self._new_columns = data_json.get("new_columns", None)
103 |         self._old_column = data_json.get("old_column", None)
104 |         d = data_json.get("min_datetime", None)
105 |         self._min_datetime = None if d is None else pd.to_datetime(d)
106 |         self._transforms = data_json.get("transforms", [])
107 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/encoding_selector.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
 5 | 
 6 | 
 7 | class EncodingSelector:
 8 | 
 9 |     """
10 |     EncodingSelector object decides which method should be used for categorical encoding.
11 | 
12 |     Please keep it fast and simple. Thank you.
13 |     """
14 | 
15 |     @staticmethod
16 |     def get(X, y, column):
17 |         # return PreprocessingCategorical.CONVERT_LOO
18 |         try:
19 |             unique_cnt = len(np.unique(X.loc[~pd.isnull(X[column]), column]))
20 |             if unique_cnt <= 20:
21 |                 return PreprocessingCategorical.FEW_CATEGORIES
22 |         except Exception as e:
23 |             pass
24 | 
25 |         return PreprocessingCategorical.MANY_CATEGORIES
26 |         """
27 |         if unique_cnt <= 2 or unique_cnt > 25:
28 |             return PreprocessingCategorical.CONVERT_INTEGER
29 | 
30 |         return PreprocessingCategorical.CONVERT_ONE_HOT
31 |         """
32 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/exclude_missing_target.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import warnings
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised.utils.config import LOG_LEVEL
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | logger.setLevel(LOG_LEVEL)
11 | 
12 | 
class ExcludeRowsMissingTarget(object):
    """Removes the rows whose target value is missing from all aligned inputs."""

    @staticmethod
    def transform(
        X=None, y=None, sample_weight=None, sensitive_features=None, warn=False
    ):
        """Drop rows with a missing target and reset the index of every input.

        Args:
            X: features, row-aligned with ``y`` (or None).
            y: target values; when None or without missing values the inputs
                are returned untouched.
            sample_weight: optional weights, row-aligned with ``y``.
            sensitive_features: optional features, row-aligned with ``y``.
            warn: when True, emit a ``UserWarning`` if rows are removed.

        Returns:
            Tuple ``(X, y, sample_weight, sensitive_features)``.
        """
        if y is None:
            return X, y, sample_weight, sensitive_features

        keep = ~np.asarray(pd.isnull(y), dtype=bool)
        if keep.all():
            return X, y, sample_weight, sensitive_features

        logger.debug("Exclude rows with missing target values")
        if warn:
            warnings.warn(
                "There are samples with missing target values in the data which will be excluded for further analysis",
                UserWarning
            )

        def _keep_rows(data):
            # Select rows by position: dropping by index label would remove
            # every row sharing a label when the index is not unique, which
            # misaligns the inputs with the target.
            if data is None:
                return None
            return data.loc[keep].reset_index(drop=True)

        return (
            _keep_rows(X),
            _keep_rows(y),
            _keep_rows(sample_weight),
            _keep_rows(sensitive_features),
        )
47 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/kmeans_transformer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | 
  4 | import joblib
  5 | import numpy as np
  6 | from sklearn.cluster import MiniBatchKMeans
  7 | from sklearn.preprocessing import StandardScaler
  8 | 
  9 | from supervised.exceptions import AutoMLException
 10 | 
 11 | 
 12 | class KMeansTransformer(object):
 13 |     def __init__(self, results_path=None, model_name=None, k_fold=None):
 14 |         self._new_features = []
 15 |         self._input_columns = []
 16 |         self._error = None
 17 |         self._kmeans = None
 18 |         self._scale = None
 19 |         self._model_name = model_name
 20 |         self._k_fold = k_fold
 21 | 
 22 |         if results_path is not None:
 23 |             self._result_file = os.path.join(
 24 |                 self._model_name, f"kmeans_fold_{k_fold}.joblib"
 25 |             )
 26 |             self._result_path = os.path.join(results_path, self._result_file)
 27 |             # self.try_load()
 28 | 
 29 |     def fit(self, X, y):
 30 |         if self._new_features:
 31 |             return
 32 |         if self._error is not None and self._error:
 33 |             raise AutoMLException(
 34 |                 "KMeans Features not created due to error (please check errors.md). "
 35 |                 + self._error
 36 |             )
 37 |             return
 38 |         if X.shape[1] == 0:
 39 |             self._error = f"KMeans not created. No continous features. Input data shape: {X.shape}, {y.shape}"
 40 |             raise AutoMLException("KMeans Features not created. No continous features.")
 41 | 
 42 |         start_time = time.time()
 43 | 
 44 |         n_clusters = int(np.log10(X.shape[0]) * 8)
 45 |         n_clusters = max(8, n_clusters)
 46 |         n_clusters = min(n_clusters, X.shape[1])
 47 | 
 48 |         self._input_columns = X.columns.tolist()
 49 |         # scale data
 50 |         self._scale = StandardScaler(copy=True, with_mean=True, with_std=True)
 51 |         X = self._scale.fit_transform(X)
 52 | 
 53 |         # Kmeans
 54 |         self._kmeans = kmeans = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++")
 55 |         self._kmeans.fit(X)
 56 |         self._create_new_features_names()
 57 | 
 58 |         # print(
 59 |         #    f"Created {len(self._new_features)} KMeans Features in {np.round(time.time() - start_time,2)} seconds."
 60 |         # )
 61 | 
 62 |     def _create_new_features_names(self):
 63 |         n_clusters = self._kmeans.cluster_centers_.shape[0]
 64 |         self._new_features = [f"Dist_Cluster_{i}" for i in range(n_clusters)]
 65 |         self._new_features += ["Cluster"]
 66 | 
 67 |     def transform(self, X):
 68 |         if self._kmeans is None:
 69 |             raise AutoMLException("KMeans not fitted")
 70 | 
 71 |         # scale
 72 |         X_scaled = self._scale.transform(X[self._input_columns])
 73 | 
 74 |         # kmeans
 75 |         distances = self._kmeans.transform(X_scaled)
 76 |         clusters = self._kmeans.predict(X_scaled)
 77 | 
 78 |         X[self._new_features[:-1]] = distances
 79 |         X[self._new_features[-1]] = clusters
 80 | 
 81 |         return X
 82 | 
 83 |     def to_json(self):
 84 |         self.save()
 85 |         data_json = {
 86 |             "new_features": self._new_features,
 87 |             "result_file": self._result_file,
 88 |             "input_columns": self._input_columns,
 89 |         }
 90 |         if self._error is not None and self._error:
 91 |             data_json["error"] = self._error
 92 |         return data_json
 93 | 
 94 |     def from_json(self, data_json, results_path):
 95 |         self._new_features = data_json.get("new_features", [])
 96 |         self._input_columns = data_json.get("input_columns", [])
 97 |         self._result_file = data_json.get("result_file")
 98 |         self._result_path = os.path.join(results_path, self._result_file)
 99 |         self._error = data_json.get("error")
100 |         self.try_load()
101 | 
102 |     def save(self):
103 |         joblib.dump(
104 |             {"kmeans": self._kmeans, "scale": self._scale},
105 |             self._result_path,
106 |             compress=True,
107 |         )
108 | 
109 |     def try_load(self):
110 |         if os.path.exists(self._result_path):
111 |             data = joblib.load(self._result_path)
112 |             self._kmeans = data["kmeans"]
113 |             self._scale = data["scale"]
114 | 
115 |             self._create_new_features_names()
116 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/label_binarizer.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class LabelBinarizer(object):
 5 |     def __init__(self):
 6 |         self._new_columns = []
 7 |         self._uniq_values = None
 8 |         self._old_column = None
 9 |         self._old_column_dtype = None
10 | 
11 |     def fit(self, X, column):
12 |         self._old_column = column
13 |         self._old_column_dtype = str(X[column].dtype)
14 |         self._uniq_values = np.unique(X[column].values)
15 |         # self._uniq_values = [str(u) for u in self._uniq_values]
16 | 
17 |         if len(self._uniq_values) == 2:
18 |             self._new_columns.append(column + "_" + str(self._uniq_values[1]))
19 |         else:
20 |             for v in self._uniq_values:
21 |                 self._new_columns.append(column + "_" + str(v))
22 | 
23 |     def transform(self, X, column):
24 |         if len(self._uniq_values) == 2:
25 |             X[column + "_" + str(self._uniq_values[1])] = (
26 |                 X[column] == self._uniq_values[1]
27 |             ).astype(int)
28 |         else:
29 |             for v in self._uniq_values:
30 |                 X[column + "_" + str(v)] = (X[column] == v).astype(int)
31 | 
32 |         X.drop(column, axis=1, inplace=True)
33 |         return X
34 | 
35 |     def inverse_transform(self, X):
36 |         if self._old_column is None:
37 |             return X
38 | 
39 |         old_col = (X[self._new_columns[0]] * 0).astype(self._old_column_dtype)
40 | 
41 |         for unique_value in self._uniq_values:
42 |             new_col = f"{self._old_column}_{unique_value}"
43 |             if new_col not in self._new_columns:
44 |                 old_col[:] = unique_value
45 |             else:
46 |                 old_col[X[new_col] == 1] = unique_value
47 | 
48 |         X[self._old_column] = old_col
49 |         X.drop(self._new_columns, axis=1, inplace=True)
50 |         return X
51 | 
52 |     def to_json(self):
53 |         self._uniq_values = [str(i) for i in list(self._uniq_values)]
54 |         data_json = {
55 |             "new_columns": list(self._new_columns),
56 |             "unique_values": self._uniq_values,
57 |             "old_column": self._old_column,
58 |             "old_column_dtype": self._old_column_dtype,
59 |         }
60 | 
61 |         if (
62 |             "True" in self._uniq_values
63 |             and "False" in self._uniq_values
64 |             and len(self._uniq_values) == 2
65 |         ):
66 |             self._uniq_values = [False, True]
67 | 
68 |         return data_json
69 | 
70 |     def from_json(self, data_json):
71 |         self._new_columns = data_json.get("new_columns", None)
72 |         self._uniq_values = data_json.get("unique_values", None)
73 |         self._old_column = data_json.get("old_column", None)
74 |         self._old_column_dtype = data_json.get("old_column_dtype", None)
75 | 
76 |         if (
77 |             "True" in self._uniq_values
78 |             and "False" in self._uniq_values
79 |             and len(self._uniq_values) == 2
80 |         ):
81 |             self._uniq_values = [False, True]
82 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/label_encoder.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from decimal import Decimal
 3 | 
 4 | import numpy as np
 5 | from sklearn import preprocessing as sk_preproc
 6 | 
 7 | from supervised.utils.config import LOG_LEVEL
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | logger.setLevel(LOG_LEVEL)
11 | 
12 | 
class LabelEncoder(object):
    """Wrapper around scikit-learn's ``LabelEncoder`` with optional numeric
    ordering of the classes and JSON (de)serialization."""

    def __init__(self, try_to_fit_numeric=False):
        self.lbl = sk_preproc.LabelEncoder()
        self._try_to_fit_numeric = try_to_fit_numeric

    @staticmethod
    def _sort_numerically(classes):
        """Return ``classes`` ordered by their numeric (Decimal) value.

        The sort is stable and keeps every class, including labels that are
        numerically equal such as "1" and "1.0". Raises when a class cannot be
        converted to or compared as ``Decimal``.
        """
        return np.array(sorted(classes, key=Decimal), dtype=classes.dtype)

    def fit(self, x):
        """Fit the encoder; optionally reorder the classes numerically."""
        self.lbl.fit(x)
        if self._try_to_fit_numeric:
            logger.debug("Try to fit numeric in LabelEncoder")
            try:
                self.lbl.classes_ = self._sort_numerically(self.lbl.classes_)
            except (ArithmeticError, TypeError, ValueError):
                # Best effort: classes are not numeric, keep the fitted order.
                logger.debug("Classes are not numeric, default order is kept")

    def transform(self, x):
        """Encode ``x``; unseen labels are appended to the known classes."""
        try:
            return self.lbl.transform(x)
        except ValueError:
            # rescue: register the labels that were not seen during fit
            classes = np.unique(x)
            diff = np.setdiff1d(classes, self.lbl.classes_)
            self.lbl.classes_ = np.concatenate((self.lbl.classes_, diff))
            return self.lbl.transform(x)

    def inverse_transform(self, x):
        """Map encoded values back to the original labels."""
        return self.lbl.inverse_transform(x)

    def to_json(self):
        """Return a dict mapping ``str(class)`` to its integer code."""
        return {str(cl): i for i, cl in enumerate(self.lbl.classes_)}

    def from_json(self, data_json):
        """Restore the classes from a ``{label: code}`` dict.

        Labels are ordered by their stored code, so the result does not depend
        on the order of the keys in ``data_json``.
        """
        labels = sorted(data_json, key=data_json.get)
        if len(labels) == 2 and "False" in labels and "True" in labels:
            # Booleans were stored as strings; restore the boolean classes.
            self.lbl.classes_ = np.array([False, True])
        else:
            self.lbl.classes_ = np.array(labels)
55 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/loo_encoder.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import warnings
 4 | 
 5 | import pandas as pd
 6 | from category_encoders.leave_one_out import LeaveOneOutEncoder
 7 | 
 8 | from supervised.utils.config import LOG_LEVEL
 9 | 
10 | logger = logging.getLogger(__name__)
11 | logger.setLevel(LOG_LEVEL)
12 | 
13 | 
class LooEncoder(object):
    """Wrapper around ``category_encoders`` ``LeaveOneOutEncoder`` with JSON
    (de)serialization of the fitted state."""

    def __init__(self, cols=None):
        encoder_options = dict(
            cols=cols,
            verbose=1,
            drop_invariant=False,
            return_df=True,
            handle_unknown="value",
            handle_missing="value",
            random_state=1,
            sigma=0,
        )
        self.enc = LeaveOneOutEncoder(**encoder_options)

    def fit(self, X, y):
        """Fit the wrapped encoder with all warnings silenced."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.enc.fit(X, y)

    def transform(self, X):
        """Return ``X`` encoded by the wrapped encoder."""
        encoded = self.enc.transform(X)
        return encoded

    def to_json(self):
        """Return the encoder state as a dict; every mapping frame is stored
        as its ``DataFrame.to_json()`` string."""
        encoder = self.enc
        return {
            "cols": encoder.cols,
            "dim": encoder._dim,
            "mean": float(encoder._mean),
            "feature_names": encoder.get_feature_names_out(),
            "mapping": {
                name: frame.to_json() for name, frame in encoder.mapping.items()
            },
        }

    def from_json(self, data_json):
        """Restore the encoder state produced by ``to_json``."""
        encoder = self.enc
        encoder.cols = data_json.get("cols")
        encoder._dim = data_json.get("dim")
        encoder._mean = data_json.get("mean")
        encoder.feature_names = data_json.get("feature_names")
        encoder.mapping = {
            name: pd.DataFrame(json.loads(payload))
            for name, payload in data_json.get("mapping", {}).items()
        }
55 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/preprocessing_missing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
 5 | 
 6 | 
 7 | class PreprocessingMissingValues(object):
 8 |     FILL_NA_MIN = "na_fill_min_1"
 9 |     FILL_NA_MEAN = "na_fill_mean"
10 |     FILL_NA_MEDIAN = "na_fill_median"
11 |     FILL_DATETIME = "na_fill_datetime"
12 | 
13 |     NA_EXCLUDE = "na_exclude"
14 |     MISSING_VALUE = "_missing_value_"
15 |     REMOVE_COLUMN = "remove_column"
16 | 
17 |     def __init__(self, columns=[], na_fill_method=FILL_NA_MEDIAN):
18 |         self._columns = columns
19 |         # fill method
20 |         self._na_fill_method = na_fill_method
21 |         # fill parameters stored as a dict, feature -> fill value
22 |         self._na_fill_params = {}
23 |         self._datetime_columns = []
24 | 
25 |     def fit(self, X):
26 |         X = self._fit_na_fill(X)
27 | 
28 |     def _fit_na_fill(self, X):
29 |         for column in self._columns:
30 |             if np.sum(pd.isnull(X[column]) == True) == 0:
31 |                 continue
32 |             self._na_fill_params[column] = self._get_fill_value(X[column])
33 |             if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
34 |                 self._datetime_columns += [column]
35 | 
36 |     def _get_fill_value(self, x):
37 |         # categorical type
38 |         if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL:
39 |             if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
40 |                 return (
41 |                     PreprocessingMissingValues.MISSING_VALUE
42 |                 )  # add new categorical value
43 |             return PreprocessingUtils.get_most_frequent(x)
44 |         # datetime
45 |         if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME:
46 |             return PreprocessingUtils.get_most_frequent(x)
47 |         # text
48 |         if PreprocessingUtils.get_type(x) == PreprocessingUtils.TEXT:
49 |             return PreprocessingMissingValues.MISSING_VALUE
50 | 
51 |         # numerical type
52 |         if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
53 |             return PreprocessingUtils.get_min(x) - 1.0
54 |         if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
55 |             return PreprocessingUtils.get_mean(x)
56 |         return PreprocessingUtils.get_median(x)
57 | 
58 |     def transform(self, X):
59 |         X = self._transform_na_fill(X)
60 |         # this is additional run through columns,
61 |         # in case of transforming data with new columns with missing values
62 |         # X = self._make_sure_na_filled(X) # disbaled for now
63 |         return X
64 | 
65 |     def _transform_na_fill(self, X):
66 |         for column, value in self._na_fill_params.items():
67 |             ind = pd.isnull(X.loc[:, column])
68 |             X.loc[ind, column] = value
69 |         return X
70 | 
71 |     def _make_sure_na_filled(self, X):
72 |         self._fit_na_fill(X)
73 |         return self._transform_na_fill(X)
74 | 
75 |     def to_json(self):
76 |         # prepare json with all parameters
77 |         if len(self._na_fill_params) == 0:
78 |             return {}
79 |         params = {
80 |             "fill_method": self._na_fill_method,
81 |             "fill_params": self._na_fill_params,
82 |             "datetime_columns": list(self._datetime_columns),
83 |         }
84 |         for col in self._datetime_columns:
85 |             params["fill_params"][col] = str(params["fill_params"][col])
86 |         return params
87 | 
88 |     def from_json(self, params):
89 |         if params is not None:
90 |             self._na_fill_method = params.get("fill_method", None)
91 |             self._na_fill_params = params.get("fill_params", {})
92 |             self._datetime_columns = params.get("datetime_columns", [])
93 |             for col in self._datetime_columns:
94 |                 self._na_fill_params[col] = pd.to_datetime(self._na_fill_params[col])
95 |         else:
96 |             self._na_fill_method, self._na_fill_params = None, None
97 |             self._datetime_columns = []
98 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/scale.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn import preprocessing
 3 | 
 4 | 
 5 | class Scale(object):
 6 |     SCALE_NORMAL = "scale_normal"
 7 |     SCALE_LOG_AND_NORMAL = "scale_log_and_normal"
 8 | 
 9 |     def __init__(self, columns=[], scale_method=SCALE_NORMAL):
10 |         self.scale_method = scale_method
11 |         self.columns = columns
12 |         self.scale = preprocessing.StandardScaler(
13 |             copy=True, with_mean=True, with_std=True
14 |         )
15 |         self.X_min_values = None  # it is used in SCALE_LOG_AND_NORMAL
16 | 
17 |     def fit(self, X):
18 |         if len(self.columns):
19 |             for c in self.columns:
20 |                 X[c] = X[c].astype(float)
21 | 
22 |             if self.scale_method == self.SCALE_NORMAL:
23 |                 self.scale.fit(X[self.columns])
24 |             elif self.scale_method == self.SCALE_LOG_AND_NORMAL:
25 |                 self.X_min_values = np.min(X[self.columns], axis=0)
26 |                 self.scale.fit(np.log(X[self.columns] - self.X_min_values + 1))
27 | 
28 |     def transform(self, X):
29 |         if len(self.columns):
30 |             for c in self.columns:
31 |                 X[c] = X[c].astype(float)
32 |             if self.scale_method == self.SCALE_NORMAL:
33 |                 X.loc[:, self.columns] = self.scale.transform(X[self.columns])
34 |             elif self.scale_method == self.SCALE_LOG_AND_NORMAL:
35 |                 X[self.columns] = np.log(
36 |                     np.clip(
37 |                         X[self.columns] - self.X_min_values + 1, a_min=1, a_max=None
38 |                     )
39 |                 )
40 |                 X.loc[:, self.columns] = self.scale.transform(X[self.columns])
41 |         return X
42 | 
43 |     def inverse_transform(self, X):
44 |         if len(self.columns):
45 |             if self.scale_method == self.SCALE_NORMAL:
46 |                 X.loc[:, self.columns] = self.scale.inverse_transform(X[self.columns])
47 |             elif self.scale_method == self.SCALE_LOG_AND_NORMAL:
48 |                 X[self.columns] = X[self.columns].astype("float64")
49 | 
50 |                 X[self.columns] = self.scale.inverse_transform(X[self.columns])
51 |                 X[self.columns] = np.exp(X[self.columns])
52 | 
53 |                 X.loc[:, self.columns] += self.X_min_values - 1
54 |         return X
55 | 
56 |     def to_json(self):
57 |         if len(self.columns) == 0:
58 |             return None
59 |         data_json = {
60 |             "scale": list(self.scale.scale_),
61 |             "mean": list(self.scale.mean_),
62 |             "var": list(self.scale.var_),
63 |             "n_samples_seen": int(self.scale.n_samples_seen_),
64 |             "n_features_in": int(self.scale.n_features_in_),
65 |             "columns": self.columns,
66 |             "scale_method": self.scale_method,
67 |         }
68 |         if self.X_min_values is not None:
69 |             data_json["X_min_values"] = list(self.X_min_values)
70 |         return data_json
71 | 
72 |     def from_json(self, data_json):
73 |         self.scale = preprocessing.StandardScaler(
74 |             copy=True, with_mean=True, with_std=True
75 |         )
76 |         self.scale.scale_ = data_json.get("scale")
77 |         if self.scale.scale_ is not None:
78 |             self.scale.scale_ = np.array(self.scale.scale_)
79 |         self.scale.mean_ = data_json.get("mean")
80 |         if self.scale.mean_ is not None:
81 |             self.scale.mean_ = np.array(self.scale.mean_)
82 |         self.scale.var_ = data_json.get("var")
83 |         if self.scale.var_ is not None:
84 |             self.scale.var_ = np.array(self.scale.var_)
85 |         self.scale.n_samples_seen_ = int(data_json.get("n_samples_seen"))
86 |         self.scale.n_features_in_ = int(data_json.get("n_features_in"))
87 |         self.columns = data_json.get("columns", [])
88 |         self.scale.feature_names_in_ = data_json.get("columns")
89 |         self.scale_method = data_json.get("scale_method")
90 |         self.X_min_values = data_json.get("X_min_values")
91 |         if self.X_min_values is not None:
92 |             self.X_min_values = np.array(self.X_min_values)
93 | 


--------------------------------------------------------------------------------
/supervised/preprocessing/text_transformer.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.feature_extraction.text import TfidfVectorizer
 5 | 
 6 | 
 7 | class TextTransformer(object):
 8 |     def __init__(self):
 9 |         self._new_columns = []
10 |         self._old_column = None
11 |         self._max_features = 100
12 |         self._vectorizer = None
13 | 
14 |     def fit(self, X, column):
15 |         self._old_column = column
16 |         self._vectorizer = TfidfVectorizer(
17 |             analyzer="word",
18 |             stop_words="english",
19 |             lowercase=True,
20 |             max_features=self._max_features,
21 |         )
22 | 
23 |         x = X[column][~pd.isnull(X[column])]
24 |         self._vectorizer.fit(x)
25 |         for f in list(self._vectorizer.get_feature_names_out()):
26 |             new_col = self._old_column + "_" + f
27 |             self._new_columns += [new_col]
28 | 
29 |     def transform(self, X):
30 |         with warnings.catch_warnings():
31 |             warnings.simplefilter(
32 |                 action="ignore", category=pd.errors.PerformanceWarning
33 |             )
34 |             ii = ~pd.isnull(X[self._old_column])
35 |             x = X[self._old_column][ii]
36 |             vect = self._vectorizer.transform(x)
37 | 
38 |             for f in self._new_columns:
39 |                 X[f] = 0.0
40 | 
41 |             X.loc[ii, self._new_columns] = vect.toarray()
42 |             X.drop(self._old_column, axis=1, inplace=True)
43 |         return X
44 | 
45 |     def to_json(self):
46 |         for k in self._vectorizer.vocabulary_.keys():
47 |             self._vectorizer.vocabulary_[k] = int(self._vectorizer.vocabulary_[k])
48 | 
49 |         data_json = {
50 |             "new_columns": list(self._new_columns),
51 |             "old_column": self._old_column,
52 |             "vocabulary": self._vectorizer.vocabulary_,
53 |             "fixed_vocabulary": self._vectorizer.fixed_vocabulary_,
54 |             "idf": list(self._vectorizer.idf_),
55 |         }
56 |         return data_json
57 | 
58 |     def from_json(self, data_json):
59 |         self._new_columns = data_json.get("new_columns", None)
60 |         self._old_column = data_json.get("old_column", None)
61 |         vocabulary = data_json.get("vocabulary")
62 |         fixed_vocabulary = data_json.get("fixed_vocabulary")
63 |         idf = data_json.get("idf")
64 |         if vocabulary is not None and fixed_vocabulary is not None and idf is not None:
65 |             self._vectorizer = TfidfVectorizer(
66 |                 analyzer="word",
67 |                 stop_words="english",
68 |                 lowercase=True,
69 |                 max_features=self._max_features,
70 |             )
71 |             self._vectorizer.vocabulary_ = vocabulary
72 |             self._vectorizer.fixed_vocabulary_ = fixed_vocabulary
73 |             self._vectorizer.idf_ = np.array(idf)
74 | 


--------------------------------------------------------------------------------
/supervised/tuner/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/tuner/__init__.py


--------------------------------------------------------------------------------
/supervised/tuner/data_info.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from supervised.algorithms.registry import (
 5 |     BINARY_CLASSIFICATION,
 6 |     MULTICLASS_CLASSIFICATION,
 7 |     REGRESSION,
 8 | )
 9 | from supervised.preprocessing.encoding_selector import EncodingSelector
10 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
11 | 
12 | 
class DataInfo:
    """Collects preprocessing hints for the feature columns and the target."""

    @staticmethod
    def compute(X, y, machinelearning_task):
        """Describe every column of ``X`` and the target ``y``.

        Returns a dict with three keys:
            ``columns_info``: column name -> list of tags,
            ``target_info``: list of tags for the target,
            ``num_class``: value from ``PreprocessingUtils.num_class`` for
            the multiclass task, otherwise ``None``.
        """
        columns_info = {}
        for col in X.columns:
            missing_mask = pd.isnull(X[col])

            # Every row is missing: nothing else to inspect.
            if np.sum(missing_mask) == X.shape[0]:
                columns_info[col] = ["empty_column"]
                continue

            # A single distinct value among the non-missing rows.
            if len(np.unique(X.loc[~missing_mask, col])) == 1:
                columns_info[col] = ["constant_column"]
                continue

            tags = []
            if PreprocessingUtils.is_na(X[col]):
                tags.append("missing_values")

            if PreprocessingUtils.is_categorical(X[col]):
                tags.append("categorical")
                tags.append(EncodingSelector.get(X, y, col))
            elif PreprocessingUtils.is_datetime(X[col]):
                tags.append("datetime_transform")
            elif PreprocessingUtils.is_text(X[col]):
                # text handling replaces any tag collected so far
                tags = ["text_transform"]
            elif PreprocessingUtils.is_scale_needed(X[col]):
                # remaining columns are treated as numeric
                tags.append("scale")

            columns_info[col] = tags

        target_info = []
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_info.append("convert_0_1")

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_info.append("scale_log")
            elif PreprocessingUtils.is_scale_needed(y):
                target_info.append("scale")

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            num_class = PreprocessingUtils.num_class(y)
        else:
            num_class = None

        return {
            "columns_info": columns_info,
            "target_info": target_info,
            "num_class": num_class,
        }
65 | 


--------------------------------------------------------------------------------
/supervised/tuner/hill_climbing.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | 
 3 | import numpy as np
 4 | 
 5 | from supervised.algorithms.registry import AlgorithmsRegistry
 6 | 
 7 | 
 8 | class HillClimbing:
 9 | 
10 |     """
11 |     Example params are in JSON format:
12 |     {
13 |         "booster": ["gbtree", "gblinear"],
14 |         "objective": ["binary:logistic"],
15 |         "eval_metric": ["auc", "logloss"],
16 |         "eta": [0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
17 |     }
18 |     """
19 | 
20 |     @staticmethod
21 |     def get(params, ml_task, seed=1):
22 |         np.random.seed(seed)
23 |         keys = list(params.keys())
24 |         for k in [
25 |             "num_class",
26 |             "model_type",
27 |             "seed",
28 |             "ml_task",
29 |             "explain_level",
30 |             "model_architecture_json",
31 |             "n_jobs",
32 |             "metric",
33 |             "eval_metric",
34 |             "custom_eval_metric_name",
35 |             "eval_metric_name",
36 |         ]:
37 |             if k in keys:
38 |                 keys.remove(k)
39 | 
40 |         model_type = params["model_type"]
41 |         if model_type == "Baseline":
42 |             return [None, None]
43 |         model_info = AlgorithmsRegistry.registry[ml_task][model_type]
44 |         model_params = model_info["params"]
45 | 
46 |         permuted_keys = np.random.permutation(keys)
47 |         key_to_update = None
48 |         values = None
49 | 
50 |         for key_to_update in permuted_keys:
51 |             if key_to_update not in model_params:
52 |                 continue
53 |             values = model_params[key_to_update]
54 |             if len(values) > 1:
55 |                 break
56 |         if values is None:
57 |             return [None, None]
58 | 
59 |         left, right = None, None
60 |         for i, v in enumerate(values):
61 |             if v == params[key_to_update]:
62 |                 if i + 1 < len(values):
63 |                     right = values[i + 1]
64 |                 if i - 1 >= 0:
65 |                     left = values[i - 1]
66 | 
67 |         params_1, params_2 = None, None
68 |         if left is not None:
69 |             params_1 = copy.deepcopy(params)
70 |             params_1[key_to_update] = left
71 |         if right is not None:
72 |             params_2 = copy.deepcopy(params)
73 |             params_2[key_to_update] = right
74 | 
75 |         if params_1 is not None and "model_architecture_json" in params_1:
76 |             del params_1["model_architecture_json"]
77 |         if params_2 is not None and "model_architecture_json" in params_2:
78 |             del params_2["model_architecture_json"]
79 | 
80 |         return [params_1, params_2]
81 | 


--------------------------------------------------------------------------------
/supervised/tuner/optuna/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/tuner/optuna/__init__.py


--------------------------------------------------------------------------------
/supervised/tuner/optuna/extra_trees.py:
--------------------------------------------------------------------------------
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.extra_trees import (
 4 |     ExtraTreesAlgorithm,
 5 |     ExtraTreesRegressorAlgorithm,
 6 | )
 7 | from supervised.algorithms.registry import (
 8 |     REGRESSION,
 9 | )
10 | from supervised.utils.metric import Metric
11 | 
# Small numeric constant; nothing in this module's own code references it.
EPS = 1e-8
13 | 
14 | 
class ExtraTreesObjective:
    """Optuna objective: fit Extra Trees with suggested parameters and score
    the predictions on the validation data."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        # task and run settings
        self.ml_task = ml_task
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.seed = random_state
        # training data
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        # validation data (sample_weight_validation is accepted but not kept)
        self.X_validation = X_validation
        self.y_validation = y_validation
        if ml_task == REGRESSION:
            self.objective = "squared_error"
        else:
            self.objective = "gini"
        self.max_steps = 10  # ET is trained in steps 100 trees each

    def __call__(self, trial):
        """Run one trial; returns the score, or None when training failed."""
        try:
            if self.ml_task == REGRESSION:
                algorithm_cls = ExtraTreesRegressorAlgorithm
                self.objective = "squared_error"
            else:
                algorithm_cls = ExtraTreesAlgorithm
                self.objective = trial.suggest_categorical(
                    "criterion", ["gini", "entropy"]
                )

            hyper_params = {
                "max_steps": self.max_steps,
                "criterion": self.objective,
                "max_depth": trial.suggest_int("max_depth", 2, 32),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
                "max_features": trial.suggest_float("max_features", 0.01, 1),
                "n_jobs": self.n_jobs,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            learner = algorithm_cls(hyper_params)
            learner.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            predictions = learner.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, predictions)
            # flip the sign for metrics flagged by Metric.optimize_negative
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
            return score

        except optuna.exceptions.TrialPruned:
            # pruning must reach optuna untouched
            raise
        except Exception as e:
            print("Exception in ExtraTreesObjective", str(e))
            return None
81 | 


--------------------------------------------------------------------------------
/supervised/tuner/optuna/knn.py:
--------------------------------------------------------------------------------
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.knn import KNeighborsAlgorithm, KNeighborsRegressorAlgorithm
 4 | from supervised.algorithms.registry import (
 5 |     REGRESSION,
 6 | )
 7 | from supervised.utils.metric import Metric
 8 | 
 9 | 
class KNNObjective:
    """Optuna objective: fit k-nearest neighbours with suggested parameters
    and score the predictions on the validation data."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        # task and run settings
        self.ml_task = ml_task
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.seed = random_state
        # training data
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        # validation data (sample_weight_validation is accepted but not kept)
        self.X_validation = X_validation
        self.y_validation = y_validation

    def __call__(self, trial):
        """Run one trial; returns the score, or None when training failed."""
        try:
            n_neighbors = trial.suggest_int("n_neighbors", 1, 128)
            weights = trial.suggest_categorical("weights", ["uniform", "distance"])
            hyper_params = {
                "n_neighbors": n_neighbors,
                "weights": weights,
                "n_jobs": self.n_jobs,
                "rows_limit": 100000,
                "ml_task": self.ml_task,
            }

            if self.ml_task == REGRESSION:
                algorithm_cls = KNeighborsRegressorAlgorithm
            else:
                algorithm_cls = KNeighborsAlgorithm

            learner = algorithm_cls(hyper_params)
            learner.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)
            predictions = learner.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, predictions)
            # flip the sign for metrics flagged by Metric.optimize_negative
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
            return score

        except optuna.exceptions.TrialPruned:
            # pruning must reach optuna untouched
            raise
        except Exception as e:
            print("Exception in KNNObjective", str(e))
            return None
65 | 


--------------------------------------------------------------------------------
/supervised/tuner/optuna/nn.py:
--------------------------------------------------------------------------------
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.nn import MLPAlgorithm, MLPRegressorAlgorithm
 4 | from supervised.algorithms.registry import (
 5 |     REGRESSION,
 6 | )
 7 | from supervised.utils.metric import Metric
 8 | 
 9 | 
class NeuralNetworkObjective:
    """Optuna objective: fit an MLP with suggested parameters and score the
    predictions on the validation data."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        # task and run settings (n_jobs is accepted but not kept)
        self.ml_task = ml_task
        self.eval_metric = eval_metric
        self.seed = random_state
        # training data
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        # validation data (sample_weight_validation is accepted but not kept)
        self.X_validation = X_validation
        self.y_validation = y_validation

    def __call__(self, trial):
        """Run one trial; returns the score, or None when training failed."""
        try:
            if self.ml_task == REGRESSION:
                algorithm_cls = MLPRegressorAlgorithm
            else:
                algorithm_cls = MLPAlgorithm

            dense_1_size = trial.suggest_int("dense_1_size", 4, 100)
            dense_2_size = trial.suggest_int("dense_2_size", 2, 100)
            learning_rate = trial.suggest_categorical(
                "learning_rate", [0.005, 0.01, 0.05, 0.1, 0.2]
            )
            learning_rate_type = trial.suggest_categorical(
                "learning_rate_type", ["constant", "adaptive"]
            )
            alpha = trial.suggest_float("alpha", 1e-8, 10.0, log=True)

            hyper_params = {
                "dense_1_size": dense_1_size,
                "dense_2_size": dense_2_size,
                "learning_rate": learning_rate,
                "learning_rate_type": learning_rate_type,
                "alpha": alpha,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            learner = algorithm_cls(hyper_params)
            learner.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            predictions = learner.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, predictions)
            # flip the sign for metrics flagged by Metric.optimize_negative
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
            return score

        except optuna.exceptions.TrialPruned:
            # pruning must reach optuna untouched
            raise
        except Exception as e:
            print("Exception in NeuralNetworkObjective", str(e))
            return None
67 | 


--------------------------------------------------------------------------------
/supervised/tuner/optuna/random_forest.py:
--------------------------------------------------------------------------------
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.random_forest import (
 4 |     RandomForestAlgorithm,
 5 |     RandomForestRegressorAlgorithm,
 6 | )
 7 | from supervised.algorithms.registry import (
 8 |     REGRESSION,
 9 | )
10 | from supervised.utils.metric import Metric
11 | 
12 | 
class RandomForestObjective:
    """Optuna objective: fit a Random Forest with suggested parameters and
    score the predictions on the validation data."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        # task and run settings
        self.ml_task = ml_task
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.seed = random_state
        # training data
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        # validation data (sample_weight_validation is accepted but not kept)
        self.X_validation = X_validation
        self.y_validation = y_validation
        if ml_task == REGRESSION:
            self.objective = "squared_error"
        else:
            self.objective = "gini"
        self.max_steps = 10  # RF is trained in steps 100 trees each

    def __call__(self, trial):
        """Run one trial; returns the score, or None when training failed."""
        try:
            if self.ml_task == REGRESSION:
                algorithm_cls = RandomForestRegressorAlgorithm
                self.objective = "squared_error"
            else:
                algorithm_cls = RandomForestAlgorithm
                self.objective = trial.suggest_categorical(
                    "criterion", ["gini", "entropy"]
                )

            hyper_params = {
                "max_steps": self.max_steps,
                "criterion": self.objective,
                "max_depth": trial.suggest_int("max_depth", 2, 32),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
                "max_features": trial.suggest_float("max_features", 0.01, 1),
                "n_jobs": self.n_jobs,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            learner = algorithm_cls(hyper_params)
            learner.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            predictions = learner.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, predictions)
            # flip the sign for metrics flagged by Metric.optimize_negative
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
            return score

        except optuna.exceptions.TrialPruned:
            # pruning must reach optuna untouched
            raise
        except Exception as e:
            print("Exception in RandomForestObjective", str(e))
            return None
78 | 


--------------------------------------------------------------------------------
/supervised/tuner/random_parameters.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class RandomParameters:
 5 | 
 6 |     """
 7 |     Example params are in JSON format:
 8 |     {
 9 |         "booster": ["gbtree", "gblinear"],
10 |         "objective": ["binary:logistic"],
11 |         "eval_metric": ["auc", "logloss"],
12 |         "eta": [0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
13 |     }
14 |     """
15 | 
16 |     @staticmethod
17 |     def get(params, seed=1):
18 |         np.random.seed(seed)
19 |         generated_params = {"seed": seed}
20 |         for k in params:
21 |             generated_params[k] = np.random.permutation(params[k])[0].item()
22 |         return generated_params
23 | 


--------------------------------------------------------------------------------
/supervised/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from supervised.utils.jsonencoder import MLJSONEncoder
 4 | 
 5 | 
 6 | def json_loads(data, *args, **kwargs):
 7 |     return json.loads(data, *args, **kwargs)
 8 | 
 9 | 
def json_dumps(data, *args, **kwargs):
    """Serialize ``data`` to JSON using ``MLJSONEncoder``; extra arguments are
    forwarded to ``json.dumps``."""
    encoder_cls = MLJSONEncoder
    return json.dumps(data, *args, cls=encoder_cls, **kwargs)
12 | 


--------------------------------------------------------------------------------
/supervised/utils/common.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def construct_learner_name(fold, repeat, repeats):
 5 |     repeat_str = f"_repeat_{repeat}" if repeats > 1 else ""
 6 |     return f"learner_fold_{fold}{repeat_str}"
 7 | 
 8 | 
 9 | def learner_name_to_fold_repeat(name):
10 |     fold, repeat = None, None
11 |     arr = name.split("_")
12 |     fold = int(arr[2])
13 |     if "repeat" in name:
14 |         repeat = int(arr[4])
15 |     return fold, repeat
16 | 
17 | 
def get_fold_repeat_cnt(model_path):
    """Count folds and repeats from the training logs found in ``model_path``.

    Returns ``(fold_cnt, repeat_cnt)``: the highest fold / repeat index seen
    in the log file names, plus one.
    """
    log_files = [
        entry for entry in os.listdir(model_path) if "_training.log" in entry
    ]

    highest_fold, highest_repeat = 0, 0
    for log_name in log_files:
        fold, repeat = learner_name_to_fold_repeat(log_name)
        if fold is not None and fold > highest_fold:
            highest_fold = fold
        if repeat is not None and repeat > highest_repeat:
            highest_repeat = repeat

    # indices are counted from 0
    return highest_fold + 1, highest_repeat + 1
32 | 
33 | 
def get_learners_names(model_path):
    """Return learner names derived from the training logs in ``model_path``.

    Every file name containing ``_training.log`` is returned with that
    postfix removed.
    """
    postfix = "_training.log"
    # ``str.replace`` was misspelled before, which raised AttributeError for
    # every matching file.
    learner_names = [
        f.replace(postfix, "") for f in os.listdir(model_path) if postfix in f
    ]
    return learner_names
40 | 


--------------------------------------------------------------------------------
/supervised/utils/config.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
# Logging level shared across the package; other modules import it and pass
# it to their logger's setLevel().
LOG_LEVEL = logging.ERROR
 4 | 
 5 | # from guppy import hpy
 6 | # from pympler import summary
 7 | # from pympler import muppy
 8 | import time
 9 | 
10 | import numpy as np
11 | 
12 | 
def mem(msg=""):
    """Print the memory usage of the current process in MB.

    Waits 5 seconds, then reads the ``VmRSS`` entry from
    ``/proc/self/status`` and prints it, prefixed with ``msg``.
    """

    time.sleep(5)

    with open("/proc/self/status") as status_file:
        status_text = status_file.read()

    # Rest of the VmRSS line without its last three characters
    # (presumably the " kB" unit suffix).
    rss_line = status_text.split("VmRSS:")[1].split("\n")[0]
    memusage = rss_line[:-3]
    rss_mb = np.round(float(memusage.strip()) / 1024.0)

    print(msg, "- memory:", rss_mb, "MB")
22 | 
23 |     # all_objects = muppy.get_objects()
24 |     # sum1 = summary.summarize(all_objects)
25 |     # summary.print_(sum1)
26 | 


--------------------------------------------------------------------------------
/supervised/utils/constants.py:
--------------------------------------------------------------------------------
# Identifiers of the machine learning tasks that can be handled by the package.
BINARY_CLASSIFICATION = "binary_classification"
MULTICLASS_CLASSIFICATION = "multiclass_classification"
REGRESSION = "regression"
5 | 


--------------------------------------------------------------------------------
/supervised/utils/data_validation.py:
--------------------------------------------------------------------------------
 1 | def check_greater_than_zero_integer(value, original_var_name):
 2 |     if not isinstance(value, int):
 3 |         raise ValueError(
 4 |             f"'{original_var_name}' must be an integer, got '{type(value)}'."
 5 |         )
 6 | 
 7 |     if value <= 0:
 8 |         raise ValueError(
 9 |             f"'{original_var_name}' must be greater than zero, got '{value}'."
10 |         )
11 | 
12 | 
def check_positive_integer(value, original_var_name):
    """Raise ``ValueError`` unless ``value`` is an ``int`` that is zero or
    greater.

    ``original_var_name`` is the name used in the error message.
    """
    is_integer = isinstance(value, int)
    if not is_integer:
        message = f"'{original_var_name}' must be an integer, got '{type(value)}'."
        raise ValueError(message)

    if value < 0:
        message = (
            f"'{original_var_name}' must be equal or greater than zero, got '{value}'."
        )
        raise ValueError(message)
23 | 
24 | 
def check_integer(value, original_var_name):
    """Raise ``ValueError`` unless ``value`` is an ``int``.

    ``original_var_name`` is the name used in the error message.
    """
    if isinstance(value, int):
        return

    message = f"'{original_var_name}' must be an integer, got '{type(value)}'."
    raise ValueError(message)
30 | 
31 | 
def check_bool(value, original_var_name):
    """Raise ``ValueError`` unless ``value`` is a ``bool``.

    ``original_var_name`` is the name used in the error message.
    """
    if isinstance(value, bool):
        return

    message = f"'{original_var_name}' must be a boolean, got '{type(value)}'."
    raise ValueError(message)
37 | 
38 | 
def check_greater_than_zero_integer_or_float(value, original_var_name):
    """Raise ``ValueError`` unless ``value`` is an ``int`` or ``float``
    greater than zero.

    ``original_var_name`` is the name used in the error message.
    """
    is_number = isinstance(value, (int, float))
    if not is_number:
        message = (
            f"'{original_var_name}' must be an integer or float, got '{type(value)}'."
        )
        raise ValueError(message)

    if value <= 0:
        message = f"'{original_var_name}' must be greater than zero, got '{value}'."
        raise ValueError(message)
49 | 


--------------------------------------------------------------------------------
/supervised/utils/importance.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import warnings
 4 | 
 5 | import pandas as pd
 6 | from sklearn.inspection import permutation_importance
 7 | 
 8 | from supervised.algorithms.registry import (
 9 |     BINARY_CLASSIFICATION,
10 |     MULTICLASS_CLASSIFICATION,
11 | )
12 | from supervised.utils.subsample import subsample
13 | 
14 | logger = logging.getLogger(__name__)
15 | from supervised.utils.config import LOG_LEVEL
16 | 
17 | logger.setLevel(LOG_LEVEL)
18 | 
19 | from sklearn.metrics import log_loss, make_scorer
20 | 
21 | 
def log_loss_eps(y_true, y_pred):
    """Return ``log_loss(y_true, y_pred)``."""
    return log_loss(y_true, y_pred)
25 | 
26 | 
# Scorer built from log_loss_eps: it is declared as "lower is better"
# (greater_is_better=False) and is fed the estimator's predict_proba output.
log_loss_scorer = make_scorer(log_loss_eps, greater_is_better=False, response_method="predict_proba")
28 | 
29 | 
class PermutationImportance:
    """Computes permutation importance of features and stores it as CSV."""

    @staticmethod
    def compute_and_plot(
        model,
        X_validation,
        y_validation,
        model_file_path,
        learner_name,
        metric_name=None,
        ml_task=None,
        n_jobs=-1,
    ):
        """Compute permutation importance on the validation data.

        The result is written to ``<learner_name>_importance.csv`` inside
        ``model_file_path``, ordered by ascending mean importance. Any
        exception is printed and the computation is skipped.
        ``metric_name`` is not used by this method.
        """
        # for scoring check https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        if ml_task == BINARY_CLASSIFICATION or ml_task == MULTICLASS_CLASSIFICATION:
            scoring = log_loss_scorer
        else:
            scoring = "neg_mean_squared_error"

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Wide validation data is subsampled to speed up the
                # computation, which can take a long time with many columns.
                n_rows, n_cols = X_validation.shape
                if n_cols > 5000 and n_rows > 100:
                    sample_size = 100
                elif n_cols > 50 and n_rows * n_cols > 200000 and n_rows > 1000:
                    sample_size = 1000
                else:
                    sample_size = None

                if sample_size is None:
                    X_vald, y_vald = X_validation, y_validation
                else:
                    X_vald, _, y_vald, _ = subsample(
                        X_validation,
                        y_validation,
                        train_size=sample_size,
                        ml_task=ml_task,
                    )

                importance = permutation_importance(
                    model,
                    X_vald,
                    y_vald,
                    scoring=scoring,
                    n_jobs=n_jobs,
                    random_state=12,
                    n_repeats=5,  # default
                )

            order = importance["importances_mean"].argsort()

            # save detailed importance
            df_imp = pd.DataFrame(
                {
                    "feature": X_vald.columns[order],
                    "mean_importance": importance["importances_mean"][order],
                }
            )
            csv_path = os.path.join(model_file_path, f"{learner_name}_importance.csv")
            df_imp.to_csv(csv_path, index=False)
        except Exception as e:
            print(str(e))
            print("Problem during computing permutation importance. Skipping ...")
94 | 


--------------------------------------------------------------------------------
/supervised/utils/jsonencoder.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from datetime import date
 3 | 
 4 | import numpy as np
 5 | 
 6 | 
 7 | class MLJSONEncoder(json.JSONEncoder):
 8 |     def default(self, o):
 9 |         if isinstance(
10 |             o,
11 |             (
12 |                 np.int_,
13 |                 np.intc,
14 |                 np.intp,
15 |                 np.int8,
16 |                 np.int16,
17 |                 np.int32,
18 |                 np.int64,
19 |                 np.uint8,
20 |                 np.uint16,
21 |                 np.uint32,
22 |                 np.uint64,
23 |             ),
24 |         ):
25 |             return int(o)
26 |         elif isinstance(o, (np.float_, np.float16, np.float32, np.float64)):
27 |             return float(o)
28 |         elif isinstance(o, np.ndarray):
29 |             return o.tolist()
30 |         elif isinstance(obj, date):
31 |             return obj.strftime("%Y-%m-%d")
32 | 
33 |         return super(MLJSONEncoder, self).default(o)
34 | 


--------------------------------------------------------------------------------
/supervised/utils/subsample.py:
--------------------------------------------------------------------------------
 1 | from sklearn.model_selection import train_test_split
 2 | 
 3 | from supervised.algorithms.registry import REGRESSION
 4 | 
 5 | 
 6 | def subsample(X, y, ml_task, train_size):
 7 |     shuffle = True
 8 |     stratify = None
 9 | 
10 |     if ml_task != REGRESSION:
11 |         stratify = y
12 | 
13 |     X_train, X_test, y_train, y_test = train_test_split(
14 |         X, y, train_size=train_size, shuffle=shuffle, stratify=stratify
15 |     )
16 | 
17 |     return X_train, X_test, y_train, y_test
18 | 


--------------------------------------------------------------------------------
/supervised/utils/utils.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | 
 3 | 
 4 | class Store:
 5 |     data = {}
 6 | 
 7 |     def set(self, key, value):
 8 |         Store.data[key] = value
 9 | 
10 |     def get(self, key):
11 |         return copy.deepcopy(Store.data[key])
12 | 
13 | 
def dump_data(file_path, df):
    """Keep ``df`` in the in-memory ``Store`` under the key ``file_path``.

    Nothing is written to disk; ``file_path`` is only used as the lookup key.
    """
    Store().set(file_path, df)
21 | 
22 | 
def load_data(file_path):
    """Return a deep copy of the object stored under ``file_path``.

    Nothing is read from disk; the value comes from the in-memory ``Store``.
    A ``KeyError`` propagates when nothing was stored under ``file_path``.
    """
    return Store().get(file_path)
30 | 


--------------------------------------------------------------------------------
/supervised/validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/supervised/validation/__init__.py


--------------------------------------------------------------------------------
/supervised/validation/validation_step.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | from supervised.exceptions import AutoMLException
 6 | from supervised.validation.validator_custom import CustomValidator
 7 | from supervised.validation.validator_kfold import KFoldValidator
 8 | from supervised.validation.validator_split import SplitValidator
 9 | 
10 | 
class ValidationStep:
    """Facade that builds the configured validator and delegates to it."""

    def __init__(self, params):
        """Create the validator named by ``params["validation_type"]``.

        Supported values are ``"kfold"`` (used when the key is missing),
        ``"split"`` and ``"custom"``.

        Raises:
            AutoMLException: If the validation type is not one of the above.
        """
        self.validation_type = params.get("validation_type", "kfold")

        known_validators = (
            ("kfold", KFoldValidator),
            ("split", SplitValidator),
            ("custom", CustomValidator),
        )
        for type_name, validator_cls in known_validators:
            if self.validation_type == type_name:
                self.validator = validator_cls(params)
                break
        else:
            # No known type matched.
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

    def get_split(self, k, repeat=0):
        """Delegate to the validator's ``get_split(k, repeat)``."""
        return self.validator.get_split(k, repeat)

    def split(self):
        """Delegate to the validator's ``split()``."""
        return self.validator.split()

    def get_n_splits(self):
        """Delegate to the validator's ``get_n_splits()``."""
        return self.validator.get_n_splits()

    def get_repeats(self):
        """Delegate to the validator's ``get_repeats()``."""
        return self.validator.get_repeats()
38 | 


--------------------------------------------------------------------------------
/supervised/validation/validator_base.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | 
 6 | class BaseValidator(object):
 7 |     def __init__(self, params):
 8 |         self.params = params
 9 | 
10 |     def split(self):
11 |         pass
12 | 
13 |     def get_n_splits(self):
14 |         pass
15 | 
16 |     def get_repeats(self):
17 |         return 1
18 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
 1 | # Running tests
 2 | 
 3 | 
 4 | To run all tests:
 5 | 
 6 | ```
 7 | pytest tests -v -x
 8 | ```
 9 | 
10 | To run tests for `algorithms`:
11 | 
12 | ```
13 | pytest tests/tests_algorithms -v -x -s
14 | ```


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/__init__.py


--------------------------------------------------------------------------------
/tests/checks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/checks/__init__.py


--------------------------------------------------------------------------------
/tests/checks/check_automl_with_regression.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | import sklearn.model_selection
 5 | 
 6 | from supervised.automl import AutoML
 7 | 
 8 | 
 9 | class AutoMLWithRegressionTest(unittest.TestCase):
10 |     def test_fit_and_predict(self):
11 |         seed = 1709
12 | 
13 |         df = pd.read_csv(
14 |             "./tests/data/housing_regression_missing_values_missing_target.csv"
15 |         )
16 |         print(df.columns)
17 |         x_cols = [c for c in df.columns if c != "MEDV"]
18 |         X = df[x_cols]
19 |         y = df["MEDV"]
20 | 
21 |         X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
22 |             X, y, test_size=0.3, random_state=seed
23 |         )
24 |         automl = AutoML(
25 |             total_time_limit=10,
26 |             algorithms=["Xgboost"],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
27 |             start_random_models=1,
28 |             hill_climbing_steps=0,
29 |             top_models_to_improve=0,
30 |             train_ensemble=True,
31 |             verbose=True,
32 |         )
33 |         automl.fit(X_train, y_train)
34 | 
35 |         response = automl.predict(X_test)  # ["p_1"]
36 |         print("Response", response)
37 | 
38 |         # Compute the logloss on test dataset
39 |         # ll = log_loss(y_test, response)
40 |         # print("(*) Dataset id {} logloss {}".format(dataset_id, ll))
41 | 
42 | 
# Run the test cases defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
45 | 


--------------------------------------------------------------------------------
/tests/checks/run_ml_tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from tests.tests_bin_class.run import *
4 | from tests.tests_multi_class.run import *
5 | 
# When executed as a script, run every TestCase that the star imports above
# brought into this module's namespace.
if __name__ == "__main__":
    unittest.main()
8 | 


--------------------------------------------------------------------------------
/tests/checks/run_performance_tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from tests.tests_bin_class.test_performance import *
4 | 
# When executed as a script, run every TestCase that the star import above
# brought into this module's namespace.
if __name__ == "__main__":
    unittest.main()
7 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | import pytest
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def data_folder(request) -> Path:
 8 |     folder_path = Path(__file__).parent / 'data'
 9 |     assert folder_path.exists()
10 |     request.cls.data_folder = folder_path
11 |     return folder_path
12 | 


--------------------------------------------------------------------------------
/tests/data/CrimeData/README.md:
--------------------------------------------------------------------------------
1 | Source: https://www.kaggle.com/datasets/kkanda/communities%20and%20crime%20unnormalized%20data%20set?select=crimedata.csv
2 | Description: http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized


--------------------------------------------------------------------------------
/tests/data/Drug/README.md:
--------------------------------------------------------------------------------
 1 | Source: https://www.kaggle.com/datasets/obeykhadija/drug-consumptions-uci
 2 | 
 3 | 
 4 | Ratings for Drug Use:
 5 | 
 6 | CL0 Never Used
 7 | 
 8 | CL1 Used over a Decade Ago
 9 | 
10 | CL2 Used in Last Decade
11 | 
12 | CL3 Used in Last Year 59
13 | 
14 | CL4 Used in Last Month
15 | 
16 | CL5 Used in Last Week
17 | 
18 | CL6 Used in Last Day


--------------------------------------------------------------------------------
/tests/data/LawSchool/README.md:
--------------------------------------------------------------------------------
1 | Source: https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage


--------------------------------------------------------------------------------
/tests/data/iris_classes_missing_values_missing_target.csv:
--------------------------------------------------------------------------------
  1 | feature_1,feature_2,feature_3,feature_4,class
  2 | 5.1,3.5,1.4,0.2,1
  3 | 4.9,3.0,1.4,0.2,1
  4 | 4.7,3.2,1.3,,1
  5 | 4.6,3.1,1.5,,1
  6 | 5.0,3.6,1.4,0.2,1
  7 | ,3.9,1.7,0.4,1
  8 | 4.6,3.4,1.4,0.3,1
  9 | 5.0,3.4,1.5,0.2,1
 10 | 4.4,,1.4,0.2,1
 11 | 4.9,3.1,1.5,0.1,1
 12 | 5.4,3.7,1.5,0.2,1
 13 | 4.8,3.4,,0.2,1
 14 | 4.8,3.0,1.4,0.1,1
 15 | 4.3,3.0,1.1,0.1,1
 16 | 5.8,4.0,1.2,0.2,1
 17 | 5.7,4.4,1.5,0.4,1
 18 | 5.4,3.9,1.3,0.4,1
 19 | 5.1,3.5,1.4,0.3,
 20 | 5.7,3.8,1.7,0.3,1
 21 | 5.1,3.8,1.5,0.3,1
 22 | 5.4,3.4,1.7,0.2,1
 23 | 5.1,3.7,1.5,0.4,1
 24 | 4.6,3.6,1.0,0.2,1
 25 | 5.1,3.3,1.7,0.5,1
 26 | 4.8,3.4,1.9,0.2,1
 27 | 5.0,3.0,1.6,0.2,1
 28 | 5.0,3.4,1.6,0.4,1
 29 | 5.2,3.5,1.5,0.2,1
 30 | 5.2,3.4,1.4,0.2,1
 31 | 4.7,3.2,1.6,0.2,1
 32 | 4.8,3.1,1.6,0.2,1
 33 | 5.4,3.4,1.5,0.4,1
 34 | 5.2,4.1,1.5,0.1,1
 35 | 5.5,4.2,1.4,0.2,1
 36 | 4.9,3.1,1.5,0.1,1
 37 | 5.0,3.2,1.2,0.2,1
 38 | 5.5,3.5,1.3,0.2,1
 39 | 4.9,3.1,1.5,0.1,1
 40 | 4.4,3.0,1.3,0.2,1
 41 | 5.1,3.4,1.5,0.2,1
 42 | 5.0,3.5,1.3,0.3,1
 43 | 4.5,2.3,1.3,0.3,1
 44 | 4.4,3.2,1.3,0.2,1
 45 | 5.0,3.5,1.6,0.6,1
 46 | 5.1,3.8,1.9,0.4,1
 47 | 4.8,3.0,1.4,0.3,1
 48 | 5.1,3.8,1.6,0.2,1
 49 | 4.6,3.2,1.4,0.2,1
 50 | 5.3,3.7,1.5,0.2,1
 51 | 5.0,3.3,1.4,0.2,1
 52 | 7.0,3.2,4.7,1.4,2
 53 | 6.4,3.2,4.5,1.5,2
 54 | 6.9,3.1,4.9,1.5,
 55 | 5.5,2.3,4.0,1.3,2
 56 | 6.5,2.8,4.6,1.5,2
 57 | 5.7,2.8,4.5,1.3,2
 58 | 6.3,3.3,4.7,1.6,2
 59 | 4.9,2.4,3.3,1.0,2
 60 | 6.6,2.9,4.6,1.3,2
 61 | 5.2,2.7,3.9,1.4,2
 62 | 5.0,2.0,3.5,1.0,2
 63 | 5.9,3.0,4.2,1.5,2
 64 | 6.0,2.2,4.0,1.0,2
 65 | 6.1,2.9,4.7,1.4,2
 66 | 5.6,2.9,3.6,1.3,2
 67 | 6.7,3.1,4.4,1.4,2
 68 | 5.6,3.0,4.5,1.5,2
 69 | 5.8,2.7,4.1,1.0,2
 70 | 6.2,2.2,4.5,1.5,2
 71 | 5.6,2.5,3.9,1.1,2
 72 | 5.9,3.2,4.8,1.8,2
 73 | 6.1,2.8,4.0,1.3,2
 74 | 6.3,2.5,4.9,1.5,2
 75 | 6.1,2.8,4.7,1.2,2
 76 | 6.4,2.9,4.3,1.3,2
 77 | 6.6,3.0,4.4,1.4,2
 78 | 6.8,2.8,4.8,1.4,2
 79 | 6.7,3.0,5.0,1.7,2
 80 | 6.0,2.9,4.5,1.5,2
 81 | 5.7,2.6,3.5,1.0,2
 82 | 5.5,2.4,3.8,1.1,2
 83 | 5.5,2.4,3.7,1.0,2
 84 | 5.8,2.7,3.9,1.2,2
 85 | 6.0,2.7,5.1,1.6,2
 86 | 5.4,3.0,4.5,1.5,2
 87 | 6.0,3.4,4.5,1.6,2
 88 | 6.7,3.1,4.7,1.5,2
 89 | 6.3,2.3,4.4,1.3,2
 90 | 5.6,3.0,4.1,1.3,2
 91 | 5.5,2.5,4.0,1.3,2
 92 | 5.5,2.6,4.4,1.2,2
 93 | 6.1,3.0,4.6,1.4,2
 94 | 5.8,2.6,4.0,1.2,2
 95 | 5.0,2.3,3.3,1.0,2
 96 | 5.6,2.7,4.2,1.3,2
 97 | 5.7,3.0,4.2,1.2,2
 98 | 5.7,2.9,4.2,1.3,2
 99 | 6.2,2.9,4.3,1.3,2
100 | 5.1,2.5,3.0,1.1,2
101 | 5.7,2.8,4.1,1.3,2
102 | 6.3,3.3,6.0,2.5,121
103 | 5.8,2.7,5.1,1.9,121
104 | 7.1,3.0,5.9,2.1,121
105 | 6.3,2.9,5.6,1.8,121
106 | 6.5,3.0,5.8,2.2,121
107 | 7.6,3.0,6.6,2.1,121
108 | 4.9,2.5,4.5,1.7,121
109 | 7.3,2.9,6.3,1.8,121
110 | 6.7,2.5,5.8,1.8,121
111 | 7.2,3.6,6.1,2.5,121
112 | 6.5,3.2,5.1,2.0,121
113 | 6.4,2.7,5.3,1.9,121
114 | 6.8,3.0,5.5,2.1,121
115 | 5.7,2.5,5.0,2.0,121
116 | 5.8,2.8,5.1,2.4,121
117 | 6.4,3.2,5.3,2.3,121
118 | 6.5,3.0,5.5,1.8,121
119 | 7.7,3.8,6.7,2.2,121
120 | 7.7,2.6,6.9,2.3,121
121 | 6.0,2.2,5.0,1.5,121
122 | 6.9,3.2,5.7,2.3,121
123 | 5.6,2.8,4.9,2.0,121
124 | 7.7,2.8,6.7,2.0,121
125 | 6.3,2.7,4.9,1.8,121
126 | 6.7,3.3,5.7,2.1,121
127 | 7.2,3.2,6.0,1.8,121
128 | 6.2,2.8,4.8,1.8,121
129 | 6.1,3.0,4.9,1.8,121
130 | 6.4,2.8,5.6,2.1,121
131 | 7.2,3.0,5.8,1.6,121
132 | 7.4,2.8,6.1,1.9,121
133 | 7.9,3.8,6.4,2.0,121
134 | 6.4,2.8,5.6,2.2,121
135 | 6.3,2.8,5.1,1.5,121
136 | 6.1,2.6,5.6,1.4,121
137 | 7.7,3.0,6.1,2.3,121
138 | 6.3,3.4,5.6,2.4,121
139 | 6.4,3.1,5.5,1.8,121
140 | 6.0,3.0,4.8,1.8,121
141 | 6.9,3.1,5.4,2.1,121
142 | 6.7,3.1,5.6,2.4,121
143 | 6.9,3.1,5.1,2.3,121
144 | 5.8,2.7,5.1,1.9,121
145 | 6.8,3.2,5.9,2.3,121
146 | 6.7,3.3,5.7,2.5,121
147 | 6.7,3.0,5.2,2.3,121
148 | 6.3,2.5,5.0,1.9,121
149 | 6.5,3.0,5.2,2.0,121
150 | 6.2,3.4,5.4,2.3,121
151 | 5.9,3.0,5.1,1.8,121
152 | 
153 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_algorithms/__init__.py


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_baseline.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | import unittest
 4 | 
 5 | from numpy.testing import assert_almost_equal
 6 | from sklearn import datasets
 7 | 
 8 | from supervised.algorithms.baseline import (
 9 |     BaselineClassifierAlgorithm,
10 |     BaselineRegressorAlgorithm,
11 | )
12 | from supervised.utils.metric import Metric
13 | 
14 | 
class BaselineTest(unittest.TestCase):
    """Tests for the baseline regressor and classifier algorithms."""

    @classmethod
    def setUpClass(cls):
        # Deterministic synthetic regression data shared by the regression tests.
        cls.X, cls.y = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )

    def test_reproduce_fit_regression(self):
        """Repeated fits of the regressor must give the same RMSE."""
        metric = Metric({"name": "rmse"})
        prev_loss = None
        for _ in range(3):
            model = BaselineRegressorAlgorithm({"ml_task": "regression"})
            model.fit(self.X, self.y)
            y_predicted = model.predict(self.X)
            loss = metric(self.y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_reproduce_fit_bin_class(self):
        """Repeated fits of the classifier must give the same logloss."""
        X, y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        metric = Metric({"name": "logloss"})
        prev_loss = None
        for _ in range(3):
            model = BaselineClassifierAlgorithm({"ml_task": "binary_classification"})
            model.fit(X, y)
            y_predicted = model.predict(X)
            loss = metric(y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_save_and_load(self):
        """A model loaded from disk must reproduce the original loss."""
        metric = Metric({"name": "rmse"})
        dt = BaselineRegressorAlgorithm({"ml_task": "regression"})
        dt.fit(self.X, self.y)
        y_predicted = dt.predict(self.X)
        loss = metric(self.y, y_predicted)

        filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())

        try:
            dt.save(filename)
            dt2 = BaselineRegressorAlgorithm({"ml_task": "regression"})
            dt2.load(filename)
        finally:
            # Always delete the temp file, even when saving or loading fails,
            # so a failing test does not leave files in the temp directory.
            if os.path.exists(filename):
                os.remove(filename)

        y_predicted = dt2.predict(self.X)
        loss2 = metric(self.y, y_predicted)
        assert_almost_equal(loss, loss2)

    def test_is_fitted(self):
        """``is_fitted`` is False before ``fit`` and True afterwards."""
        model = BaselineRegressorAlgorithm({"ml_task": "regression"})
        self.assertFalse(model.is_fitted())
        model.fit(self.X, self.y)
        self.assertTrue(model.is_fitted())
86 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_decision_tree.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | import unittest
 4 | 
 5 | from numpy.testing import assert_almost_equal
 6 | from sklearn import datasets
 7 | 
 8 | from supervised.algorithms.decision_tree import (
 9 |     DecisionTreeRegressorAlgorithm,
10 | )
11 | from supervised.utils.metric import Metric
12 | 
13 | 
class DecisionTreeTest(unittest.TestCase):
    """Tests for the decision tree regressor algorithm."""

    @classmethod
    def setUpClass(cls):
        # Deterministic synthetic regression data shared by all tests.
        cls.X, cls.y = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )

    def test_reproduce_fit_regression(self):
        """Repeated fits with the same seed must give the same RMSE."""
        metric = Metric({"name": "rmse"})
        params = {"max_depth": 1, "seed": 1, "ml_task": "regression"}
        prev_loss = None
        for _ in range(3):
            model = DecisionTreeRegressorAlgorithm(params)
            model.fit(self.X, self.y)
            y_predicted = model.predict(self.X)
            loss = metric(self.y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_save_and_load(self):
        """A model loaded from disk must reproduce the original loss."""
        metric = Metric({"name": "rmse"})
        dt = DecisionTreeRegressorAlgorithm({"ml_task": "regression"})
        dt.fit(self.X, self.y)
        y_predicted = dt.predict(self.X)
        loss = metric(self.y, y_predicted)

        filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())

        try:
            dt.save(filename)
            dt2 = DecisionTreeRegressorAlgorithm({"ml_task": "regression"})
            dt2.load(filename)

            y_predicted = dt2.predict(self.X)
            loss2 = metric(self.y, y_predicted)
            assert_almost_equal(loss, loss2)
        finally:
            # Always delete the temp file, even when a step above fails, so a
            # failing test does not leave files in the temp directory.
            if os.path.exists(filename):
                os.remove(filename)

    def test_is_fitted(self):
        """``is_fitted`` is False before ``fit`` and True afterwards."""
        params = {"max_depth": 1, "seed": 1, "ml_task": "regression"}
        model = DecisionTreeRegressorAlgorithm(params)
        self.assertFalse(model.is_fitted())
        model.fit(self.X, self.y)
        self.assertTrue(model.is_fitted())
65 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_extra_trees.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | from numpy.testing import assert_almost_equal
  6 | from sklearn import datasets
  7 | 
  8 | from supervised.algorithms.extra_trees import (
  9 |     ExtraTreesAlgorithm,
 10 |     ExtraTreesRegressorAlgorithm,
 11 |     additional,
 12 |     regression_additional,
 13 | )
 14 | from supervised.utils.metric import Metric
 15 | 
# Override entries in the dictionaries imported from
# supervised.algorithms.extra_trees: one tree per step and a single step.
# These are the imported objects themselves, so the change applies wherever
# those dictionaries are read (presumably to keep the tests fast - confirm).
additional["trees_in_step"] = 1
regression_additional["trees_in_step"] = 1
additional["max_steps"] = 1
regression_additional["max_steps"] = 1
 20 | 
 21 | 
 22 | class ExtraTreesRegressorAlgorithmTest(unittest.TestCase):
 23 |     @classmethod
 24 |     def setUpClass(cls):
 25 |         cls.X, cls.y = datasets.make_regression(
 26 |             n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
 27 |         )
 28 | 
 29 |     def test_reproduce_fit(self):
 30 |         metric = Metric({"name": "mse"})
 31 |         params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"}
 32 |         prev_loss = None
 33 |         for _ in range(3):
 34 |             model = ExtraTreesRegressorAlgorithm(params)
 35 |             model.fit(self.X, self.y)
 36 |             y_predicted = model.predict(self.X)
 37 |             loss = metric(self.y, y_predicted)
 38 |             if prev_loss is not None:
 39 |                 assert_almost_equal(prev_loss, loss)
 40 |             prev_loss = loss
 41 | 
 42 | 
 43 | class ExtraTreesAlgorithmTest(unittest.TestCase):
 44 |     @classmethod
 45 |     def setUpClass(cls):
 46 |         cls.X, cls.y = datasets.make_classification(
 47 |             n_samples=100,
 48 |             n_features=5,
 49 |             n_informative=4,
 50 |             n_redundant=1,
 51 |             n_classes=2,
 52 |             n_clusters_per_class=3,
 53 |             n_repeated=0,
 54 |             shuffle=False,
 55 |             random_state=0,
 56 |         )
 57 | 
 58 |     def test_reproduce_fit(self):
 59 |         metric = Metric({"name": "logloss"})
 60 |         params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"}
 61 |         prev_loss = None
 62 |         for _ in range(3):
 63 |             model = ExtraTreesAlgorithm(params)
 64 |             model.fit(self.X, self.y)
 65 |             y_predicted = model.predict(self.X)
 66 |             loss = metric(self.y, y_predicted)
 67 |             if prev_loss is not None:
 68 |                 assert_almost_equal(prev_loss, loss)
 69 |             prev_loss = loss
 70 | 
 71 |     def test_fit_predict(self):
 72 |         metric = Metric({"name": "logloss"})
 73 |         params = {"trees_in_step": 50, "ml_task": "binary_classification"}
 74 |         rf = ExtraTreesAlgorithm(params)
 75 | 
 76 |         rf.fit(self.X, self.y)
 77 |         y_predicted = rf.predict(self.X)
 78 |         self.assertTrue(metric(self.y, y_predicted) < 0.6)
 79 | 
 80 |     def test_copy(self):
 81 |         metric = Metric({"name": "logloss"})
 82 |         rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
 83 |         rf.fit(self.X, self.y)
 84 |         y_predicted = rf.predict(self.X)
 85 |         loss = metric(self.y, y_predicted)
 86 | 
 87 |         rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
 88 |         rf2 = rf.copy()
 89 |         self.assertEqual(type(rf), type(rf2))
 90 |         y_predicted = rf2.predict(self.X)
 91 |         loss2 = metric(self.y, y_predicted)
 92 |         assert_almost_equal(loss, loss2)
 93 | 
 94 |     def test_save_and_load(self):
 95 |         metric = Metric({"name": "logloss"})
 96 |         rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
 97 |         rf.fit(self.X, self.y)
 98 |         y_predicted = rf.predict(self.X)
 99 |         loss = metric(self.y, y_predicted)
100 | 
101 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
102 | 
103 |         rf.save(filename)
104 |         rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"})
105 |         rf2.load(filename)
106 |         # Finished with the file, delete it
107 |         os.remove(filename)
108 | 
109 |         y_predicted = rf2.predict(self.X)
110 |         loss2 = metric(self.y, y_predicted)
111 |         assert_almost_equal(loss, loss2)
112 | 
113 |     def test_is_fitted(self):
114 |         params = {"trees_in_step": 50, "ml_task": "binary_classification"}
115 |         model = ExtraTreesAlgorithm(params)
116 |         self.assertFalse(model.is_fitted())
117 |         model.fit(self.X, self.y)
118 |         self.assertTrue(model.is_fitted())
119 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_factory.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from supervised.algorithms.factory import AlgorithmFactory
 4 | from supervised.algorithms.xgboost import XgbAlgorithm
 5 | 
 6 | 
 7 | class AlgorithmFactoryTest(unittest.TestCase):
 8 |     def test_fit(self):
 9 |         params = {
10 |             "learner_type": "Xgboost",
11 |             "objective": "binary:logistic",
12 |             "eval_metric": "logloss",
13 |         }
14 |         learner = AlgorithmFactory.get_algorithm(params)
15 |         self.assertEqual(
16 |             learner.algorithm_short_name, XgbAlgorithm.algorithm_short_name
17 |         )
18 | 
19 | 
# Run the test cases defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
22 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_knn.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | from numpy.testing import assert_almost_equal
 5 | from sklearn import datasets
 6 | 
 7 | from supervised.algorithms.knn import KNeighborsAlgorithm, KNeighborsRegressorAlgorithm
 8 | from supervised.utils.metric import Metric
 9 | 
10 | 
class KNeighborsRegressorAlgorithmTest(unittest.TestCase):
    """Reproducibility test for the k-nearest-neighbours regressor."""

    @classmethod
    def setUpClass(cls):
        # Deterministic synthetic regression data shared by the tests.
        dataset = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            shuffle=False,
            random_state=0,
        )
        cls.X, cls.y = dataset

    def test_reproduce_fit(self):
        """Two fits with the same seed must give (almost) equal MSE."""
        mse = Metric({"name": "mse"})
        settings = {"seed": 1, "ml_task": "regression"}
        losses = []
        for _ in range(2):
            regressor = KNeighborsRegressorAlgorithm(settings)
            regressor.fit(self.X, self.y)
            predictions = regressor.predict(self.X)
            current = mse(self.y, predictions)
            # Compare each run with the one right before it.
            if losses:
                assert_almost_equal(losses[-1], current)
            losses.append(current)
34 | 
35 | 
class KNeighborsAlgorithmTest(unittest.TestCase):
    """Tests for the k-nearest-neighbours binary classifier."""

    @classmethod
    def setUpClass(cls):
        # Deterministic synthetic binary classification data shared by all tests.
        dataset = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        cls.X, cls.y = dataset

    def test_reproduce_fit(self):
        """Two fits with the same seed must give (almost) equal logloss."""
        logloss = Metric({"name": "logloss"})
        settings = {"seed": 1, "ml_task": "binary_classification"}
        losses = []
        for _ in range(2):
            classifier = KNeighborsAlgorithm(settings)
            classifier.fit(self.X, self.y)
            predictions = classifier.predict(self.X)
            current = logloss(self.y, predictions)
            # Compare each run with the one right before it.
            if losses:
                assert_almost_equal(losses[-1], current)
            losses.append(current)

    def test_fit_predict(self):
        """A fitted model must reach a training logloss below 0.6."""
        logloss = Metric({"name": "logloss"})
        classifier = KNeighborsAlgorithm({"ml_task": "binary_classification"})

        classifier.fit(self.X, self.y)
        predictions = classifier.predict(self.X)
        score = logloss(self.y, predictions)
        self.assertTrue(score < 0.6)

    def test_is_fitted(self):
        """``is_fitted`` is False before ``fit`` and True afterwards."""
        classifier = KNeighborsAlgorithm({"ml_task": "binary_classification"})
        self.assertFalse(classifier.is_fitted())
        classifier.fit(self.X, self.y)
        self.assertTrue(classifier.is_fitted())

    def test_classes_attribute(self):
        """After fitting, ``_classes`` must equal the unique target labels."""
        classifier = KNeighborsAlgorithm({"ml_task": "binary_classification"})
        classifier.fit(self.X, self.y)

        # A missing attribute is treated as None, which fails the comparison.
        found_classes = getattr(classifier, "_classes", None)
        expected_classes = np.unique(self.y)

        self.assertTrue(np.array_equal(expected_classes, found_classes))
91 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_linear.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | from numpy.testing import assert_almost_equal
  6 | from sklearn import datasets
  7 | 
  8 | from supervised.algorithms.linear import LinearAlgorithm, LinearRegressorAlgorithm
  9 | from supervised.utils.metric import Metric
 10 | 
 11 | 
 12 | class LinearRegressorAlgorithmTest(unittest.TestCase):
 13 |     @classmethod
 14 |     def setUpClass(cls):
 15 |         cls.X, cls.y = datasets.make_regression(
 16 |             n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
 17 |         )
 18 | 
 19 |     def test_reproduce_fit(self):
 20 |         metric = Metric({"name": "mse"})
 21 |         params = {"seed": 1, "ml_task": "regression"}
 22 |         prev_loss = None
 23 |         for _ in range(3):
 24 |             model = LinearRegressorAlgorithm(params)
 25 |             model.fit(self.X, self.y)
 26 |             y_predicted = model.predict(self.X)
 27 |             loss = metric(self.y, y_predicted)
 28 |             if prev_loss is not None:
 29 |                 assert_almost_equal(prev_loss, loss)
 30 |             prev_loss = loss
 31 | 
 32 | 
 33 | class LinearAlgorithmTest(unittest.TestCase):
 34 |     @classmethod
 35 |     def setUpClass(cls):
 36 |         cls.X, cls.y = datasets.make_classification(
 37 |             n_samples=100,
 38 |             n_features=5,
 39 |             n_informative=4,
 40 |             n_redundant=1,
 41 |             n_classes=2,
 42 |             n_clusters_per_class=3,
 43 |             n_repeated=0,
 44 |             shuffle=False,
 45 |             random_state=0,
 46 |         )
 47 | 
 48 |     def test_reproduce_fit(self):
 49 |         metric = Metric({"name": "logloss"})
 50 |         params = {"seed": 1, "ml_task": "binary_classification"}
 51 |         prev_loss = None
 52 |         for _ in range(3):
 53 |             model = LinearAlgorithm(params)
 54 |             model.fit(self.X, self.y)
 55 |             y_predicted = model.predict(self.X)
 56 |             loss = metric(self.y, y_predicted)
 57 |             if prev_loss is not None:
 58 |                 assert_almost_equal(prev_loss, loss)
 59 |             prev_loss = loss
 60 | 
 61 |     def test_fit_predict(self):
 62 |         metric = Metric({"name": "logloss"})
 63 |         params = {"ml_task": "binary_classification"}
 64 |         la = LinearAlgorithm(params)
 65 | 
 66 |         la.fit(self.X, self.y)
 67 |         y_predicted = la.predict(self.X)
 68 |         self.assertTrue(metric(self.y, y_predicted) < 0.6)
 69 | 
 70 |     def test_copy(self):
 71 |         metric = Metric({"name": "logloss"})
 72 |         model = LinearAlgorithm({"ml_task": "binary_classification"})
 73 |         model.fit(self.X, self.y)
 74 |         y_predicted = model.predict(self.X)
 75 |         loss = metric(self.y, y_predicted)
 76 | 
 77 |         model2 = LinearAlgorithm({})
 78 |         model2 = model.copy()
 79 |         self.assertEqual(type(model), type(model2))
 80 |         y_predicted = model2.predict(self.X)
 81 |         loss2 = metric(self.y, y_predicted)
 82 |         assert_almost_equal(loss, loss2)
 83 | 
 84 |     def test_save_and_load(self):
 85 |         metric = Metric({"name": "logloss"})
 86 |         model = LinearAlgorithm({"ml_task": "binary_classification"})
 87 |         model.fit(self.X, self.y)
 88 |         y_predicted = model.predict(self.X)
 89 |         loss = metric(self.y, y_predicted)
 90 | 
 91 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
 92 | 
 93 |         model.save(filename)
 94 |         model2 = LinearAlgorithm({"ml_task": "binary_classification"})
 95 |         model2.load(filename)
 96 |         # Finished with the file, delete it
 97 |         os.remove(filename)
 98 | 
 99 |         y_predicted = model2.predict(self.X)
100 |         loss2 = metric(self.y, y_predicted)
101 |         assert_almost_equal(loss, loss2)
102 | 
103 |     def test_is_fitted(self):
104 |         model = LinearAlgorithm({"ml_task": "binary_classification"})
105 |         self.assertFalse(model.is_fitted())
106 |         model.fit(self.X, self.y)
107 |         self.assertTrue(model.is_fitted())
108 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_random_forest.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | from numpy.testing import assert_almost_equal
  6 | from sklearn import datasets
  7 | 
  8 | from supervised.algorithms.random_forest import (
  9 |     RandomForestAlgorithm,
 10 |     RandomForestRegressorAlgorithm,
 11 |     additional,
 12 |     regression_additional,
 13 | )
 14 | from supervised.utils.metric import Metric
 15 | 
 16 | additional["trees_in_step"] = 1
 17 | regression_additional["trees_in_step"] = 1
 18 | additional["max_steps"] = 1
 19 | regression_additional["max_steps"] = 1
 20 | 
 21 | 
 22 | class RandomForestRegressorAlgorithmTest(unittest.TestCase):
 23 |     @classmethod
 24 |     def setUpClass(cls):
 25 |         cls.X, cls.y = datasets.make_regression(
 26 |             n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
 27 |         )
 28 | 
 29 |     def test_reproduce_fit(self):
 30 |         metric = Metric({"name": "mse"})
 31 |         params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"}
 32 |         prev_loss = None
 33 |         for _ in range(3):
 34 |             model = RandomForestRegressorAlgorithm(params)
 35 |             model.fit(self.X, self.y)
 36 |             y_predicted = model.predict(self.X)
 37 |             loss = metric(self.y, y_predicted)
 38 |             if prev_loss is not None:
 39 |                 assert_almost_equal(prev_loss, loss)
 40 |             prev_loss = loss
 41 | 
 42 | 
 43 | class RandomForestAlgorithmTest(unittest.TestCase):
 44 |     @classmethod
 45 |     def setUpClass(cls):
 46 |         cls.X, cls.y = datasets.make_classification(
 47 |             n_samples=100,
 48 |             n_features=5,
 49 |             n_informative=4,
 50 |             n_redundant=1,
 51 |             n_classes=2,
 52 |             n_clusters_per_class=3,
 53 |             n_repeated=0,
 54 |             shuffle=False,
 55 |             random_state=0,
 56 |         )
 57 | 
 58 |     def test_reproduce_fit(self):
 59 |         metric = Metric({"name": "logloss"})
 60 |         params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"}
 61 |         prev_loss = None
 62 |         for _ in range(3):
 63 |             model = RandomForestAlgorithm(params)
 64 |             model.fit(self.X, self.y)
 65 |             y_predicted = model.predict(self.X)
 66 |             loss = metric(self.y, y_predicted)
 67 |             if prev_loss is not None:
 68 |                 assert_almost_equal(prev_loss, loss)
 69 |             prev_loss = loss
 70 | 
 71 |     def test_fit_predict(self):
 72 |         metric = Metric({"name": "logloss"})
 73 |         params = {"ml_task": "binary_classification"}
 74 |         rf = RandomForestAlgorithm(params)
 75 | 
 76 |         rf.fit(self.X, self.y)
 77 |         y_predicted = rf.predict(self.X)
 78 |         self.assertTrue(metric(self.y, y_predicted) < 1.5)
 79 | 
 80 |     def test_copy(self):
 81 |         metric = Metric({"name": "logloss"})
 82 |         rf = RandomForestAlgorithm({"ml_task": "binary_classification"})
 83 |         rf.fit(self.X, self.y)
 84 |         y_predicted = rf.predict(self.X)
 85 |         loss = metric(self.y, y_predicted)
 86 | 
 87 |         rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"})
 88 |         rf2 = rf.copy()
 89 |         self.assertEqual(type(rf), type(rf2))
 90 |         y_predicted = rf2.predict(self.X)
 91 |         loss2 = metric(self.y, y_predicted)
 92 |         assert_almost_equal(loss, loss2)
 93 | 
 94 |     def test_save_and_load(self):
 95 |         metric = Metric({"name": "logloss"})
 96 |         rf = RandomForestAlgorithm({"ml_task": "binary_classification"})
 97 |         rf.fit(self.X, self.y)
 98 |         y_predicted = rf.predict(self.X)
 99 |         loss = metric(self.y, y_predicted)
100 | 
101 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
102 | 
103 |         rf.save(filename)
104 |         rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"})
105 |         rf2.load(filename)
106 |         # Finished with the file, delete it
107 |         os.remove(filename)
108 | 
109 |         y_predicted = rf2.predict(self.X)
110 |         loss2 = metric(self.y, y_predicted)
111 |         assert_almost_equal(loss, loss2)
112 | 
113 |     def test_is_fitted(self):
114 |         model = RandomForestAlgorithm({"ml_task": "binary_classification"})
115 |         self.assertFalse(model.is_fitted())
116 |         model.fit(self.X, self.y)
117 |         self.assertTrue(model.is_fitted())
118 | 


--------------------------------------------------------------------------------
/tests/tests_algorithms/test_registry.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from supervised.algorithms.registry import AlgorithmsRegistry
 4 | 
 5 | 
 6 | class AlgorithmsRegistryTest(unittest.TestCase):
 7 |     def test_add_to_registry(self):
 8 |         class Model1:
 9 |             algorithm_short_name = ""
10 | 
11 |         model1 = {
12 |             "task_name": "binary_classification",
13 |             "model_class": Model1,
14 |             "model_params": {},
15 |             "required_preprocessing": {},
16 |             "additional": {},
17 |             "default_params": {},
18 |         }
19 |         AlgorithmsRegistry.add(**model1)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     unittest.main()
24 | 


--------------------------------------------------------------------------------
/tests/tests_automl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_automl/__init__.py


--------------------------------------------------------------------------------
/tests/tests_automl/test_adjust_validation.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class AutoMLAdjustValidationTest(unittest.TestCase):
    """Runs AutoML in Compete mode on tiny random data with only Xgboost enabled."""

    automl_dir = "automl_testing"

    def tearDown(self):
        """Delete the results directory written by the test."""
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_custom_init(self):
        """After fitting, no ``1_DecisionTree`` folder exists in the results dir."""
        n_rows = 60
        X = np.random.uniform(size=(n_rows, 2))
        y = np.random.randint(0, 2, size=(n_rows,))

        settings = {
            "results_path": self.automl_dir,
            "model_time_limit": 10,
            "algorithms": ["Xgboost"],
            "mode": "Compete",
            "explain_level": 0,
            "start_random_models": 1,
            "hill_climbing_steps": 0,
            "top_models_to_improve": 0,
            "kmeans_features": False,
            "golden_features": False,
            "features_selection": False,
            "boost_on_errors": False,
        }
        automl = AutoML(**settings)
        automl.fit(X, y)

        decision_tree_dir = os.path.join(self.automl_dir, "1_DecisionTree")
        self.assertFalse(os.path.exists(decision_tree_dir))


--------------------------------------------------------------------------------
/tests/tests_automl/test_automl_init.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised import AutoML
 7 | 
 8 | 
 9 | class AutoMLInitTest(unittest.TestCase):
10 |     automl_dir = "AutoMLInitTest"
11 | 
12 |     def tearDown(self):
13 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
14 | 
15 |     def test_custom_init(self):
16 |         X = np.random.uniform(size=(30, 2))
17 |         y = np.random.randint(0, 2, size=(30,))
18 | 
19 |         automl = AutoML(
20 |             results_path=self.automl_dir,
21 |             model_time_limit=1,
22 |             algorithms=["Xgboost"],
23 |             explain_level=0,
24 |             train_ensemble=False,
25 |             stack_models=False,
26 |             validation_strategy={"validation_type": "split"},
27 |             start_random_models=3,
28 |             hill_climbing_steps=1,
29 |             top_models_to_improve=1,
30 |         )
31 | 
32 |         automl.fit(X, y)
33 |         self.assertGreater(len(automl._models), 3)
34 | 
35 |     def test_get_results_path(self):
36 |         automl = AutoML(algorithms=["Baseline"], total_time_limit=1)
37 |         first_path = automl._get_results_path()
38 |         self.assertEqual(first_path, automl._get_results_path())
39 |         shutil.rmtree(first_path, ignore_errors=True)
40 | 
41 |         automl = AutoML(
42 |             algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir
43 |         )
44 |         self.assertEqual(self.automl_dir, automl._get_results_path())
45 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
46 | 
47 |         # get results path after save
48 |         automl = AutoML(
49 |             algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir
50 |         )
51 |         X = np.random.uniform(size=(30, 2))
52 |         y = np.random.randint(0, 2, size=(30,))
53 |         automl.fit(X, y)
54 |         self.assertEqual(self.automl_dir, automl._get_results_path())
55 | 
56 |         automl2 = AutoML(
57 |             algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir
58 |         )
59 |         self.assertEqual(self.automl_dir, automl2._get_results_path())
60 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_automl_report.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | from pathlib import Path
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | import pytest
 9 | from sklearn import datasets
10 | from sklearn.decomposition import PCA
11 | from sklearn.pipeline import make_pipeline
12 | 
13 | from supervised import AutoML
14 | from supervised.exceptions import AutoMLException
15 | 
16 | iris = datasets.load_iris()
17 | 
class AutoMLReportTest(unittest.TestCase):
    """Checks the HTML report written by ``AutoML.report()``."""

    automl_dir = "AutoMLTest"

    def tearDown(self):
        """Delete the results directory written by the test."""
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def setUp(self):
        """Start from a clean slate in case an earlier run left files behind."""
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_report(self):
        """Tests AutoML in the iris dataset (Multiclass classification)"""
        model = AutoML(
            algorithms=["Baseline"],
            explain_level=0,
            verbose=0,
            random_state=1,
            results_path=self.automl_dir,
        )
        model.fit(iris.data, iris.target)
        model.report()

        report_path = os.path.join(self.automl_dir, "README.html")
        self.assertTrue(os.path.exists(report_path))

        with open(report_path, "r") as fin:
            content = fin.read()

        # The main report must not contain a link to the Baseline model's page.
        # assertNotIn shows the searched text on failure, unlike assertFalse.
        link = '<a href="1_Baseline/README.html">'
        self.assertNotIn(link, content)
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_automl_sample_weight.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | from numpy.testing import assert_almost_equal
 6 | from sklearn import datasets
 7 | 
 8 | from supervised import AutoML
 9 | 
iris = datasets.load_iris()
breast_cancer = datasets.load_breast_cancer()
housing = datasets.fetch_california_housing()
# limit data size for faster tests: keep only the first 500 housing rows
housing.data, housing.target = housing.data[:500], housing.target[:500]
16 | 
17 | 
class AutoMLSampleWeightTest(unittest.TestCase):
    """Fits AutoML without and with all-ones sample weights and compares scores."""

    automl_dir = "AutoMLSampleWeightTest"

    def tearDown(self):
        """Delete the results directory written by the test."""
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _fit_and_score(self, X, y, **weight_kwargs):
        """Fit a fresh AutoML on ``(X, y)`` and return its score on the same data.

        ``weight_kwargs`` is forwarded unchanged to both ``fit`` and ``score``;
        it is either empty or ``sample_weight=...``.
        """
        model = AutoML(
            explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
        )
        return model.fit(X, y, **weight_kwargs).score(X, y, **weight_kwargs)

    def _check_unit_weights_keep_score(self, X, y):
        """Score above 0.5 without weights, and the same score with all-ones weights."""
        score_1 = self._fit_and_score(X, y)
        self.assertGreater(score_1, 0.5)

        # Remove the first run's results before fitting again in the same path.
        shutil.rmtree(self.automl_dir, ignore_errors=True)
        sample_weight = np.ones(X.shape[0])
        score_2 = self._fit_and_score(X, y, sample_weight=sample_weight)
        assert_almost_equal(score_1, score_2)

    def test_iris_dataset_sample_weight(self):
        """Tests AutoML in the iris dataset (Multiclass classification)
        without and with sample weight"""
        self._check_unit_weights_keep_score(iris.data, iris.target)

    def test_housing_dataset(self):
        """Tests AutoML in the housing dataset (Regression)
        without and with sample weight"""
        self._check_unit_weights_keep_score(housing.data, housing.target)

    def test_breast_cancer_dataset(self):
        """Tests AutoML in the breast cancer (binary classification)
        without and with sample weight"""
        self._check_unit_weights_keep_score(breast_cancer.data, breast_cancer.target)
84 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_automl_time_constraints.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | import time
  3 | import unittest
  4 | 
  5 | from supervised import AutoML
  6 | from supervised.tuner.time_controller import TimeController
  7 | 
  8 | 
  9 | class AutoMLTimeConstraintsTest(unittest.TestCase):
 10 |     automl_dir = "automl_tests"
 11 | 
 12 |     def tearDown(self):
 13 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
 14 | 
 15 |     def test_set_total_time_limit(self):
 16 |         model_type = "Xgboost"
 17 |         automl = AutoML(
 18 |             results_path=self.automl_dir, total_time_limit=100, algorithms=[model_type]
 19 |         )
 20 | 
 21 |         automl._time_ctrl = TimeController(
 22 |             time.time(), 100, None, ["simple_algorithms", "not_so_random"], "Xgboost"
 23 |         )
 24 | 
 25 |         time_spend = 0
 26 |         for i in range(12):
 27 |             automl._start_time -= 10
 28 |             automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10)
 29 |             if automl._time_ctrl.enough_time(model_type, "not_so_random"):
 30 |                 time_spend += 10
 31 | 
 32 |         self.assertTrue(time_spend < 100)
 33 | 
 34 |     def test_set_model_time_limit(self):
 35 |         model_type = "Xgboost"
 36 |         automl = AutoML(
 37 |             results_path=self.automl_dir, model_time_limit=10, algorithms=[model_type]
 38 |         )
 39 |         automl._time_ctrl = TimeController(
 40 |             time.time(), None, 10, ["simple_algorithms", "not_so_random"], "Xgboost"
 41 |         )
 42 | 
 43 |         for i in range(12):
 44 |             automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10)
 45 |             # should be always true
 46 |             self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random"))
 47 | 
 48 |     def test_set_model_time_limit_omit_total_time(self):
 49 |         model_type = "Xgboost"
 50 |         automl = AutoML(
 51 |             results_path=self.automl_dir,
 52 |             total_time_limit=10,
 53 |             model_time_limit=10,
 54 |             algorithms=[model_type],
 55 |         )
 56 |         automl._time_ctrl = TimeController(
 57 |             time.time(), 10, 10, ["simple_algorithms", "not_so_random"], "Xgboost"
 58 |         )
 59 | 
 60 |         for i in range(12):
 61 |             automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10)
 62 |             # should be always true
 63 |             self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random"))
 64 | 
 65 |     def test_enough_time_to_train(self):
 66 |         model_type = "Xgboost"
 67 |         model_type_2 = "LightGBM"
 68 | 
 69 |         model_type = "Xgboost"
 70 |         automl = AutoML(
 71 |             results_path=self.automl_dir,
 72 |             total_time_limit=10,
 73 |             model_time_limit=10,
 74 |             algorithms=[model_type, model_type_2],
 75 |         )
 76 |         automl._time_ctrl = TimeController(
 77 |             time.time(),
 78 |             10,
 79 |             10,
 80 |             ["simple_algorithms", "not_so_random"],
 81 |             [model_type, model_type_2],
 82 |         )
 83 | 
 84 |         for i in range(5):
 85 |             automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 1)
 86 |             # should be always true
 87 |             self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random"))
 88 | 
 89 |         for i in range(5):
 90 |             automl._time_ctrl.log_time(
 91 |                 f"LightGBM_{i}", model_type_2, "not_so_random", 1
 92 |             )
 93 |             # should be always true
 94 |             self.assertTrue(
 95 |                 automl._time_ctrl.enough_time(model_type_2, "not_so_random")
 96 |             )
 97 | 
 98 |     def test_expected_learners_cnt(self):
 99 |         automl = AutoML(results_path=self.automl_dir)
100 |         automl._validation_strategy = {"k_folds": 7, "repeats": 6}
101 |         self.assertEqual(automl._expected_learners_cnt(), 42)
102 | 
103 |         automl._validation_strategy = {"k_folds": 7}
104 |         self.assertEqual(automl._expected_learners_cnt(), 7)
105 |         automl._validation_strategy = {}
106 |         self.assertEqual(automl._expected_learners_cnt(), 1)
107 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_data_types.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class AutoMLDataTypesTest(unittest.TestCase):
    """Smoke tests: AutoML fits on a categorical column and on non-ASCII labels."""

    automl_dir = "automl_tests"
    rows = 250

    def tearDown(self):
        """Delete the results directory written by the test."""
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _random_frame(self):
        """Return a ``rows`` x 3 DataFrame of uniform random values named f0..f2."""
        values = np.random.rand(self.rows, 3)
        return pd.DataFrame(values, columns=[f"f{i}" for i in range(3)])

    def _quick_automl(self, algorithm):
        """Build a 1-second, single-algorithm AutoML without ensemble or explanations."""
        return AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=[algorithm],
            train_ensemble=False,
            explain_level=0,
            start_random_models=1,
        )

    def test_category_data_type(self):
        """Fitting CatBoost with a column of pandas ``category`` dtype must not raise."""
        X = self._random_frame()
        y = np.random.randint(0, 2, self.rows)

        X["f1"] = X["f1"].astype("category")

        self._quick_automl("CatBoost").fit(X, y)

    def test_encoding_strange_characters(self):
        """Fitting Baseline with non-ASCII string class labels must not raise."""
        X = self._random_frame()
        y = np.random.permutation(["ɛ", "🂲"] * (self.rows // 2))

        self._quick_automl("Baseline").fit(X, y)
48 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_dir_change.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | from numpy.testing import assert_almost_equal
 7 | from sklearn import datasets
 8 | 
 9 | from supervised import AutoML
10 | 
11 | 
class AutoMLDirChangeTest(unittest.TestCase):
    """Checks that a results directory keeps working after it has been moved."""

    automl_dir_a = "automl_testing_A"
    automl_dir_b = "automl_testing_B"
    automl_dir = "automl_testing"

    def tearDown(self):
        """Delete both parent directories used by the tests."""
        shutil.rmtree(self.automl_dir_a, ignore_errors=True)
        shutil.rmtree(self.automl_dir_b, ignore_errors=True)

    def create_dir(self, dir_path):
        """Create ``dir_path`` unless it already exists as a directory.

        An existing directory is left untouched. Any other failure (for
        example missing permissions) is raised instead of being silently
        swallowed, so a broken setup shows up here and not in a later step.
        """
        os.makedirs(dir_path, exist_ok=True)

    def test_create_report_after_dir_change(self):
        """``report()`` works on a results directory that was moved after fitting."""
        #
        # test for https://github.com/mljar/mljar-supervised/issues/384
        #
        self.create_dir(self.automl_dir_a)
        self.create_dir(self.automl_dir_b)

        path_a = os.path.join(self.automl_dir_a, self.automl_dir)
        path_b = os.path.join(self.automl_dir_b, self.automl_dir)

        X = np.random.uniform(size=(30, 2))
        y = np.random.randint(0, 2, size=(30,))

        automl = AutoML(results_path=path_a, algorithms=["Baseline"], explain_level=0)
        automl.fit(X, y)

        shutil.move(path_a, path_b)

        automl2 = AutoML(
            results_path=path_b,
        )
        automl2.report()

    def test_compute_predictions_after_dir_change(self):
        """Predictions from a moved results directory match the original ones."""
        #
        # test for https://github.com/mljar/mljar-supervised/issues/384
        #
        self.create_dir(self.automl_dir_a)
        self.create_dir(self.automl_dir_b)

        path_a = os.path.join(self.automl_dir_a, self.automl_dir)
        path_b = os.path.join(self.automl_dir_b, self.automl_dir)

        X, y = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )

        automl = AutoML(
            results_path=path_a,
            explain_level=0,
            ml_task="regression",
            total_time_limit=10,
        )
        automl.fit(X, y)
        p = automl.predict(X[:3])

        shutil.move(path_a, path_b)

        automl2 = AutoML(
            results_path=path_b,
        )
        p2 = automl2.predict(X[:3])

        # Compare the first three predictions one by one.
        for i in range(3):
            assert_almost_equal(p[i], p2[i])
88 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_golden_features.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import shutil
  4 | import unittest
  5 | 
  6 | import pandas as pd
  7 | from sklearn import datasets
  8 | 
  9 | from supervised import AutoML
 10 | 
 11 | 
 12 | class AutoMLGoldenFeaturesTest(unittest.TestCase):
 13 |     automl_dir = "automl_tests"
 14 |     rows = 50
 15 | 
 16 |     def tearDown(self):
 17 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
 18 | 
 19 |     def test_no_golden_features(self):
 20 |         N_COLS = 10
 21 |         X, y = datasets.make_classification(
 22 |             n_samples=100,
 23 |             n_features=N_COLS,
 24 |             n_informative=6,
 25 |             n_redundant=1,
 26 |             n_classes=2,
 27 |             n_clusters_per_class=3,
 28 |             n_repeated=0,
 29 |             shuffle=False,
 30 |             random_state=0,
 31 |         )
 32 | 
 33 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
 34 | 
 35 |         automl = AutoML(
 36 |             results_path=self.automl_dir,
 37 |             total_time_limit=50,
 38 |             algorithms=["Xgboost"],
 39 |             train_ensemble=False,
 40 |             golden_features=False,
 41 |             explain_level=0,
 42 |             start_random_models=1,
 43 |         )
 44 |         automl.fit(X, y)
 45 | 
 46 |         self.assertEqual(len(automl._models), 1)
 47 | 
 48 |     def test_golden_features(self):
 49 |         N_COLS = 10
 50 |         X, y = datasets.make_classification(
 51 |             n_samples=100,
 52 |             n_features=N_COLS,
 53 |             n_informative=6,
 54 |             n_redundant=1,
 55 |             n_classes=2,
 56 |             n_clusters_per_class=3,
 57 |             n_repeated=0,
 58 |             shuffle=False,
 59 |             random_state=0,
 60 |         )
 61 | 
 62 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
 63 | 
 64 |         automl = AutoML(
 65 |             results_path=self.automl_dir,
 66 |             total_time_limit=50,
 67 |             algorithms=["Xgboost"],
 68 |             train_ensemble=False,
 69 |             golden_features=True,
 70 |             explain_level=0,
 71 |             start_random_models=1,
 72 |         )
 73 |         automl.fit(X, y)
 74 | 
 75 |         self.assertEqual(len(automl._models), 2)
 76 | 
 77 |         # there should be 10 golden features
 78 |         with open(os.path.join(self.automl_dir, "golden_features.json")) as fin:
 79 |             d = json.loads(fin.read())
 80 |             self.assertEqual(len(d["new_features"]), 10)
 81 | 
 82 |     def test_golden_features_count(self):
 83 |         N_COLS = 10
 84 |         X, y = datasets.make_classification(
 85 |             n_samples=100,
 86 |             n_features=N_COLS,
 87 |             n_informative=6,
 88 |             n_redundant=1,
 89 |             n_classes=2,
 90 |             n_clusters_per_class=3,
 91 |             n_repeated=0,
 92 |             shuffle=False,
 93 |             random_state=0,
 94 |         )
 95 | 
 96 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
 97 | 
 98 |         automl = AutoML(
 99 |             results_path=self.automl_dir,
100 |             total_time_limit=50,
101 |             algorithms=["Xgboost"],
102 |             train_ensemble=False,
103 |             golden_features=50,
104 |             explain_level=0,
105 |             start_random_models=1,
106 |         )
107 |         automl.fit(X, y)
108 | 
109 |         self.assertEqual(len(automl._models), 2)
110 | 
111 |         # there should be 50 golden features
112 |         with open(os.path.join(self.automl_dir, "golden_features.json")) as fin:
113 |             d = json.loads(fin.read())
114 |             self.assertEqual(len(d["new_features"]), 50)
115 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_handle_imbalance.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | import unittest
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | from supervised import AutoML
  8 | from supervised.algorithms.random_forest import additional
  9 | from supervised.algorithms.registry import MULTICLASS_CLASSIFICATION
 10 | 
 11 | additional["max_steps"] = 1
 12 | additional["trees_in_step"] = 1
 13 | 
 14 | from supervised.algorithms.xgboost import additional
 15 | 
 16 | additional["max_rounds"] = 1
 17 | 
 18 | 
 19 | class AutoMLHandleImbalanceTest(unittest.TestCase):
 20 |     automl_dir = "AutoMLHandleImbalanceTest"
 21 | 
 22 |     def tearDown(self):
 23 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
 24 | 
 25 |     def test_handle_drastic_imbalance(self):
 26 |         a = AutoML(
 27 |             results_path=self.automl_dir,
 28 |             total_time_limit=10,
 29 |             algorithms=["Random Forest"],
 30 |             train_ensemble=False,
 31 |             validation_strategy={
 32 |                 "validation_type": "kfold",
 33 |                 "k_folds": 10,
 34 |                 "shuffle": True,
 35 |                 "stratify": True,
 36 |             },
 37 |             start_random_models=1,
 38 |         )
 39 | 
 40 |         rows = 100
 41 |         X = pd.DataFrame(
 42 |             {
 43 |                 "f1": np.random.rand(rows),
 44 |                 "f2": np.random.rand(rows),
 45 |                 "f3": np.random.rand(rows),
 46 |             }
 47 |         )
 48 |         y = np.ones(rows)
 49 | 
 50 |         y[:8] = 0
 51 |         y[10:12] = 2
 52 |         y = pd.Series(np.array(y), name="target")
 53 |         a._ml_task = MULTICLASS_CLASSIFICATION
 54 |         a._handle_drastic_imbalance(X, y)
 55 | 
 56 |         self.assertEqual(X.shape[0], 130)
 57 |         self.assertEqual(X.shape[1], 3)
 58 |         self.assertEqual(y.shape[0], 130)
 59 | 
 60 |     def test_handle_drastic_imbalance_sample_weight(self):
 61 |         a = AutoML(
 62 |             results_path=self.automl_dir,
 63 |             total_time_limit=10,
 64 |             algorithms=["Random Forest"],
 65 |             train_ensemble=False,
 66 |             validation_strategy={
 67 |                 "validation_type": "kfold",
 68 |                 "k_folds": 10,
 69 |                 "shuffle": True,
 70 |                 "stratify": True,
 71 |             },
 72 |             start_random_models=1,
 73 |         )
 74 | 
 75 |         rows = 100
 76 |         X = pd.DataFrame(
 77 |             {
 78 |                 "f1": np.random.rand(rows),
 79 |                 "f2": np.random.rand(rows),
 80 |                 "f3": np.random.rand(rows),
 81 |             }
 82 |         )
 83 |         y = np.ones(rows)
 84 |         sample_weight = pd.Series(np.array(range(rows)), name="sample_weight")
 85 | 
 86 |         y[:1] = 0
 87 |         y[10:11] = 2
 88 | 
 89 |         y = pd.Series(np.array(y), name="target")
 90 |         a._ml_task = MULTICLASS_CLASSIFICATION
 91 |         a._handle_drastic_imbalance(X, y, sample_weight)
 92 | 
 93 |         self.assertEqual(X.shape[0], 138)
 94 |         self.assertEqual(X.shape[1], 3)
 95 |         self.assertEqual(y.shape[0], 138)
 96 | 
 97 |         self.assertEqual(np.sum(sample_weight[100:119]), 0)
 98 |         self.assertEqual(np.sum(sample_weight[119:138]), 19 * 10)
 99 | 
100 |     def test_imbalance_dont_change_data_after_fit(self):
101 |         a = AutoML(
102 |             results_path=self.automl_dir,
103 |             total_time_limit=5,
104 |             train_ensemble=False,
105 |             validation_strategy={
106 |                 "validation_type": "kfold",
107 |                 "k_folds": 10,
108 |                 "shuffle": True,
109 |                 "stratify": True,
110 |             },
111 |             start_random_models=1,
112 |             explain_level=0,
113 |         )
114 | 
115 |         rows = 100
116 |         X = pd.DataFrame(
117 |             {
118 |                 "f1": np.random.rand(rows),
119 |                 "f2": np.random.rand(rows),
120 |                 "f3": np.random.rand(rows),
121 |             }
122 |         )
123 |         y = np.ones(rows)
124 | 
125 |         y[:8] = 0
126 |         y[10:12] = 2
127 |         sample_weight = np.ones(rows)
128 | 
129 |         a.fit(X, y, sample_weight=sample_weight)
130 | 
131 |         # original data **without** inserted samples to handle imbalance
132 |         self.assertEqual(X.shape[0], rows)
133 |         self.assertEqual(y.shape[0], rows)
134 |         self.assertEqual(sample_weight.shape[0], rows)
135 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_joblib_version.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import shutil
 4 | import unittest
 5 | 
 6 | import joblib
 7 | import numpy as np
 8 | 
 9 | from supervised import AutoML
10 | from supervised.exceptions import AutoMLException
11 | 
12 | 
class TestJoblibVersion(unittest.TestCase):
    """Tests for the joblib version recorded in a saved model's framework.json."""

    automl_dir = "TestJoblibVersion"

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _train_xgboost(self):
        """Train one default Xgboost model on random data.

        Returns:
            tuple: the feature matrix used for training and the path of the
            `framework.json` file inside the `1_Default_Xgboost` directory.
        """
        features = np.random.uniform(size=(60, 2))
        target = np.random.randint(0, 2, size=(60,))

        settings = dict(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            mode="Explain",
            explain_level=0,
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            kmeans_features=False,
            golden_features=False,
            features_selection=False,
            boost_on_errors=False,
        )
        AutoML(**settings).fit(features, target)

        framework_path = os.path.join(
            self.automl_dir, "1_Default_Xgboost", "framework.json"
        )
        return features, framework_path

    def test_joblib_good_version(self):
        """The saved framework.json stores the installed joblib version."""
        _, framework_path = self._train_xgboost()

        # Test if joblib is in json
        with open(framework_path) as file:
            framework = json.load(file)

        self.assertEqual(joblib.__version__, framework["joblib_version"])

    def test_joblib_wrong_version(self):
        """Predicting from a model saved with another joblib version raises."""
        features, framework_path = self._train_xgboost()

        with open(framework_path) as file:
            framework = json.load(file)

        # Injection of wrong joblib version
        framework["joblib_version"] = "0.2.0"

        with open(framework_path, "w") as file:
            json.dump(framework, file)

        with self.assertRaises(AutoMLException):
            reloaded = AutoML(results_path=self.automl_dir)
            reloaded.predict(features)
85 | 
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
88 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_models_needed_for_predict.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import tempfile
 4 | import unittest
 5 | 
 6 | from supervised import AutoML
 7 | from supervised.exceptions import AutoMLException
 8 | 
 9 | 
class AutoMLModelsNeededForPredictTest(unittest.TestCase):
    """Tests for `AutoML.models_needed_on_predict`."""

    @staticmethod
    def _write_json(path, payload):
        """Serialize `payload` as JSON into the file at `path`."""
        with open(path, "w") as fout:
            fout.write(json.dumps(payload))

    def test_models_needed_on_predict(self):
        """Check the models reported as needed for plain, ensemble and stacked models.

        A results directory is faked with only `params.json` and the two
        `ensemble.json` files, then the list returned for each saved model is
        compared (ignoring order) with the expected dependencies.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            automl_params = {
                "saved": [
                    "model_1",
                    "model_2",
                    "model_3",
                    "unused_model",
                    "Ensemble",
                    "model_4_Stacked",
                    "Stacked_Ensemble",
                ],
                "stacked": ["Ensemble", "model_1", "model_2"],
            }
            self._write_json(os.path.join(tmpdir, "params.json"), automl_params)

            # Members of each ensemble, as stored in <ensemble>/ensemble.json.
            ensembles = {
                "Ensemble": ["model_2", "model_3"],
                "Stacked_Ensemble": ["Ensemble", "model_4_Stacked"],
            }
            for ensemble_name, members in ensembles.items():
                os.mkdir(os.path.join(tmpdir, ensemble_name))
                self._write_json(
                    os.path.join(tmpdir, ensemble_name, "ensemble.json"),
                    {"selected_models": [{"model": name} for name in members]},
                )

            automl = AutoML(results_path=tmpdir)

            # A name that is not among the saved models must be rejected.
            with self.assertRaises(AutoMLException):
                automl.models_needed_on_predict("missing_model")

            expected_models = {
                "model_1": ["model_1"],
                "model_3": ["model_3"],
                "Ensemble": ["model_2", "model_3", "Ensemble"],
                "model_4_Stacked": [
                    "model_1",
                    "model_2",
                    "model_3",
                    "Ensemble",
                    "model_4_Stacked",
                ],
                "Stacked_Ensemble": [
                    "model_1",
                    "model_2",
                    "model_3",
                    "Ensemble",
                    "model_4_Stacked",
                    "Stacked_Ensemble",
                ],
            }
            for model_name, expected in expected_models.items():
                with self.subTest(model=model_name):
                    needed = automl.models_needed_on_predict(model_name)
                    # Same elements with the same counts, in any order; this
                    # is what the membership + length checks amounted to, but
                    # a failure now shows the differing elements.
                    self.assertCountEqual(needed, expected)
79 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_prediction_after_load.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | from numpy.testing import assert_almost_equal
 5 | from sklearn import datasets
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | from supervised import AutoML
 9 | 
10 | 
class AutoMLPredictionAfterLoadTest(unittest.TestCase):
    """Integration test: predictions must match after reloading from disk."""

    automl_dir = "AutoMLPredictionAfterLoadTest"

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_integration(self):
        """Train with stacking in Compete mode, reload, compare predictions."""
        validation = {
            "validation_type": "kfold",
            "k_folds": 3,
            "shuffle": True,
            "stratify": True,
            "random_seed": 123,
        }
        trained = AutoML(
            results_path=self.automl_dir,
            mode="Compete",
            algorithms=["Baseline", "CatBoost", "LightGBM", "Xgboost"],
            stack_models=True,
            total_time_limit=60,
            validation_strategy=validation,
        )

        # Synthetic 8-class problem.
        X, y = datasets.make_classification(
            n_samples=1000,
            n_features=30,
            n_informative=29,
            n_redundant=1,
            n_classes=8,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        X_train, X_test, y_train, _ = train_test_split(X, y, train_size=0.8)

        trained.fit(X_train, y_train)
        before = trained.predict_all(X_test)

        # A fresh object pointed at the same directory loads the saved models.
        restored = AutoML(results_path=self.automl_dir)
        after = restored.predict_all(X_test)

        # Compare the first row of the first and the last class columns.
        for column in ("prediction_0", "prediction_7"):
            assert_almost_equal(before[column].iloc[0], after[column].iloc[0])
54 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_repeated_validation.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import unittest
  4 | 
  5 | import pandas as pd
  6 | from sklearn import datasets
  7 | 
  8 | from supervised import AutoML
  9 | from supervised.algorithms.random_forest import additional
 10 | from supervised.utils.common import construct_learner_name
 11 | 
# Set the Random Forest `max_steps` and `trees_in_step` settings to 1
# (presumably to keep training in this test module short -- confirm).
additional["max_steps"] = 1
additional["trees_in_step"] = 1

# NOTE: this import rebinds the module-level name `additional`; from here on
# it refers to the Xgboost settings, not the Random Forest ones.
from supervised.algorithms.xgboost import additional

# Set the Xgboost `max_rounds` setting to 1.
additional["max_rounds"] = 1
 18 | 
 19 | 
 20 | class AutoMLRepeatedValidationTest(unittest.TestCase):
 21 |     automl_dir = "AutoMLRepeatedValidationTest"
 22 | 
 23 |     def tearDown(self):
 24 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
 25 | 
 26 |     def test_repeated_kfold(self):
 27 |         REPEATS = 3
 28 |         FOLDS = 2
 29 | 
 30 |         a = AutoML(
 31 |             results_path=self.automl_dir,
 32 |             total_time_limit=10,
 33 |             algorithms=["Random Forest"],
 34 |             train_ensemble=False,
 35 |             validation_strategy={
 36 |                 "validation_type": "kfold",
 37 |                 "k_folds": FOLDS,
 38 |                 "repeats": REPEATS,
 39 |                 "shuffle": True,
 40 |                 "stratify": True,
 41 |             },
 42 |             start_random_models=1,
 43 |         )
 44 | 
 45 |         X, y = datasets.make_classification(
 46 |             n_samples=100,
 47 |             n_features=5,
 48 |             n_informative=4,
 49 |             n_redundant=1,
 50 |             n_classes=2,
 51 |             n_clusters_per_class=3,
 52 |             n_repeated=0,
 53 |             shuffle=False,
 54 |             random_state=0,
 55 |         )
 56 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
 57 | 
 58 |         a.fit(X, y)
 59 | 
 60 |         result_files = os.listdir(
 61 |             os.path.join(self.automl_dir, "1_Default_RandomForest")
 62 |         )
 63 | 
 64 |         cnt = 0
 65 |         for repeat in range(REPEATS):
 66 |             for fold in range(FOLDS):
 67 |                 learner_name = construct_learner_name(fold, repeat, REPEATS)
 68 |                 self.assertTrue(f"{learner_name}.random_forest" in result_files)
 69 |                 self.assertTrue(f"{learner_name}_training.log" in result_files)
 70 |                 cnt += 1
 71 |         self.assertTrue(cnt, 6)
 72 | 
 73 |     def test_repeated_split(self):
 74 |         REPEATS = 3
 75 |         FOLDS = 1
 76 | 
 77 |         a = AutoML(
 78 |             results_path=self.automl_dir,
 79 |             total_time_limit=10,
 80 |             algorithms=["Random Forest"],
 81 |             train_ensemble=False,
 82 |             validation_strategy={
 83 |                 "validation_type": "split",
 84 |                 "repeats": REPEATS,
 85 |                 "shuffle": True,
 86 |                 "stratify": True,
 87 |             },
 88 |             start_random_models=1,
 89 |         )
 90 | 
 91 |         X, y = datasets.make_classification(
 92 |             n_samples=100,
 93 |             n_features=5,
 94 |             n_informative=4,
 95 |             n_redundant=1,
 96 |             n_classes=2,
 97 |             n_clusters_per_class=3,
 98 |             n_repeated=0,
 99 |             shuffle=False,
100 |             random_state=0,
101 |         )
102 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
103 | 
104 |         a.fit(X, y)
105 | 
106 |         result_files = os.listdir(
107 |             os.path.join(self.automl_dir, "1_Default_RandomForest")
108 |         )
109 |         cnt = 0
110 |         for repeat in range(REPEATS):
111 |             for fold in range(FOLDS):
112 |                 learner_name = construct_learner_name(fold, repeat, REPEATS)
113 |                 self.assertTrue(f"{learner_name}.random_forest" in result_files)
114 |                 self.assertTrue(f"{learner_name}_training.log" in result_files)
115 |                 cnt += 1
116 |         self.assertTrue(cnt, 3)
117 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_restore.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import shutil
 4 | import unittest
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | from supervised import AutoML
10 | from supervised.algorithms.xgboost import additional
11 | 
# Set the Xgboost `max_rounds` setting to 1 for the tests in this module
# (presumably to keep training short -- confirm).
additional["max_rounds"] = 1
13 | 
14 | 
class AutoMLRestoreTest(unittest.TestCase):
    """Tests resuming AutoML training from an existing results directory."""

    automl_dir = "automl_tests"
    rows = 50

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _count_models(self):
        """Count the entries of the results directory whose name starts with a digit."""
        return sum(1 for entry in os.listdir(self.automl_dir) if entry[0].isdigit())

    def test_tune_only_default(self):
        """A second fit with an extra algorithm must not add new models."""
        features = pd.DataFrame(
            np.random.rand(self.rows, 3), columns=[f"f{i}" for i in range(3)]
        )
        target = np.random.randint(0, 2, self.rows)

        first_run = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            algorithms=["Decision Tree"],
            explain_level=0,
            train_ensemble=False,
        )
        first_run.fit(features, target)

        # Get number of starting models
        models_before = self._count_models()

        # Rewrite the stored fit level in progress.json before fitting again.
        progress_path = os.path.join(self.automl_dir, "progress.json")
        with open(progress_path, "r") as fin:
            progress = json.load(fin)
        progress["fit_level"] = "default_algorithms"

        with open(progress_path, "w") as fout:
            fout.write(json.dumps(progress, indent=4))

        second_run = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            algorithms=["Decision Tree", "Xgboost"],
            explain_level=0,
            train_ensemble=False,
        )
        second_run.fit(features, target)
        # Get number of models after second fit
        models_after = self._count_models()
        # number of models should be equal
        # user cannot overwrite parameters
        self.assertEqual(models_after, models_before)
59 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_stack_models_constraints.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised import AutoML
 7 | 
 8 | 
 9 | class AutoMLStackModelsConstraintsTest(unittest.TestCase):
10 |     automl_dir = "AutoMLStackModelsConstraintsTest"
11 | 
12 |     def tearDown(self):
13 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
14 | 
15 |     def test_allow_stack_models(self):
16 |         X = np.random.uniform(size=(100, 2))
17 |         y = np.random.randint(0, 2, size=(100,))
18 |         X[:, 0] = y
19 |         X[:, 1] = -y
20 | 
21 |         automl = AutoML(
22 |             results_path=self.automl_dir,
23 |             total_time_limit=5,
24 |             mode="Compete",
25 |             validation_strategy={"validation_type": "kfold", "k_folds": 5},
26 |         )
27 |         automl.fit(X, y)
28 |         self.assertTrue(automl._stack_models)
29 |         self.assertTrue(automl.tuner._stack_models)
30 |         self.assertTrue(automl._time_ctrl._is_stacking)
31 | 
32 |     def test_disable_stack_models(self):
33 |         X = np.random.uniform(size=(100, 2))
34 |         y = np.random.randint(0, 2, size=(100,))
35 |         X[:, 0] = y
36 |         X[:, 1] = -y
37 | 
38 |         automl = AutoML(
39 |             results_path=self.automl_dir,
40 |             total_time_limit=5,
41 |             mode="Compete",
42 |             validation_strategy={"validation_type": "split"},
43 |         )
44 |         automl.fit(X, y)
45 |         self.assertFalse(automl._stack_models)
46 |         self.assertFalse(automl.tuner._stack_models)
47 |         self.assertFalse(automl._time_ctrl._is_stacking)
48 | 
49 |     def test_disable_stack_models_adjusted_validation(self):
50 |         X = np.random.uniform(size=(100, 2))
51 |         y = np.random.randint(0, 2, size=(100,))
52 |         X[:, 0] = y
53 |         X[:, 1] = -y
54 | 
55 |         automl = AutoML(
56 |             results_path=self.automl_dir, total_time_limit=5, mode="Compete"
57 |         )
58 |         automl.fit(X, y)
59 |         # the stacking should be disabled
60 |         # because of small time limit
61 |         self.assertFalse(automl._stack_models)
62 |         self.assertFalse(automl.tuner._stack_models)
63 |         self.assertFalse(automl._time_ctrl._is_stacking)
64 | 


--------------------------------------------------------------------------------
/tests/tests_automl/test_update_errors_report.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class AutoMLUpdateErrorsReportTest(unittest.TestCase):
    """Tests for `AutoML._update_errors_report`."""

    automl_dir = "automl_testing"

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_custom_init(self):
        """Reporting an error creates errors.md containing the error message."""
        # No training data is needed: the report is written without fitting.
        automl = AutoML(results_path=self.automl_dir)
        automl._update_errors_report("model_1", "bad error")

        errors_filename = os.path.join(self.automl_dir, "errors.md")
        self.assertTrue(os.path.exists(errors_filename))
        with open(errors_filename) as file:
            # assertIn shows the file content when the message is missing.
            self.assertIn("bad error", file.read())
27 | 


--------------------------------------------------------------------------------
/tests/tests_callbacks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_callbacks/__init__.py


--------------------------------------------------------------------------------
/tests/tests_callbacks/test_total_time_constraint.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import unittest
 3 | 
 4 | from supervised.callbacks.total_time_constraint import TotalTimeConstraint
 5 | from supervised.exceptions import NotTrainedException
 6 | 
 7 | 
 8 | class TotalTimeConstraintTest(unittest.TestCase):
 9 |     def test_stop_on_first_learner(self):
10 |         params = {
11 |             "total_time_limit": 100,
12 |             "total_time_start": time.time(),
13 |             "expected_learners_cnt": 1001,
14 |         }
15 |         callback = TotalTimeConstraint(params)
16 |         callback.add_and_set_learner(learner={})
17 |         callback.on_learner_train_start(logs=None)
18 |         time.sleep(0.1)
19 |         with self.assertRaises(NotTrainedException) as context:
20 |             callback.on_learner_train_end(logs=None)
21 |         self.assertTrue("Stop training after the first fold" in str(context.exception))
22 | 
23 |     def test_stop_on_not_first_learner(self):
24 |         params = {
25 |             "total_time_limit": 100,
26 |             "total_time_start": time.time(),
27 |             "expected_learners_cnt": 10,
28 |         }
29 |         callback = TotalTimeConstraint(params)
30 |         callback.add_and_set_learner(learner={})
31 |         callback.on_learner_train_start(logs=None)
32 |         callback.on_learner_train_end(logs=None)
33 |         with self.assertRaises(NotTrainedException) as context:
34 |             #
35 |             # hardcoded change just for tests!
36 |             callback.total_time_start = time.time() - 600 - 100 - 1
37 |             #
38 |             callback.add_and_set_learner(learner={})
39 |             callback.on_learner_train_start(logs=None)
40 |             callback.on_learner_train_end(logs=None)
41 |         self.assertTrue("Force to stop" in str(context.exception))
42 | 
43 |     def test_dont_stop(self):
44 |         params = {
45 |             "total_time_limit": 100,
46 |             "total_time_start": time.time(),
47 |             "expected_learners_cnt": 10,
48 |         }
49 |         callback = TotalTimeConstraint(params)
50 | 
51 |         for i in range(10):
52 |             callback.add_and_set_learner(learner={})
53 |             callback.on_learner_train_start(logs=None)
54 |             callback.on_learner_train_end(logs=None)
55 | 


--------------------------------------------------------------------------------
/tests/tests_ensemble/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_ensemble/__init__.py


--------------------------------------------------------------------------------
/tests/tests_ensemble/test_save_load.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import pandas as pd
 5 | from sklearn import datasets
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class EnsembleSaveLoadTest(unittest.TestCase):
    """Tests that predictions survive a save/load round trip with an ensemble."""

    automl_dir = "EnsembleSaveLoadTest"

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_save_load(self):
        """Predictions of the trained and of the reloaded AutoML must be equal."""
        trained = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            explain_level=0,
            mode="Explain",
            train_ensemble=True,
            start_random_models=1,
        )

        # Synthetic binary classification problem with named columns.
        raw_features, target = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        column_names = [f"f_{i}" for i in range(raw_features.shape[1])]
        features = pd.DataFrame(raw_features, columns=column_names)

        trained.fit(features, target)
        predicted = trained.predict(features)

        # A fresh object pointed at the same directory loads the saved models.
        restored = AutoML(results_path=self.automl_dir)
        predicted_after_load = restored.predict(features)

        self.assertTrue((predicted == predicted_after_load).all())
46 | 


--------------------------------------------------------------------------------
/tests/tests_fairness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_fairness/__init__.py


--------------------------------------------------------------------------------
/tests/tests_fairness/test_multi_class_classification.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class FairnessInMultiClassClassificationTest(unittest.TestCase):
    """Tests fairness reporting for a multi-class classification task."""

    automl_dir = "automl_fairness_testing"

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_init(self):
        """Fit with one sensitive column and check the per-class fairness data.

        The test expects one sensitive feature name per target class
        (`sensitive__A`, `sensitive__B`, `sensitive__C`), each with a
        fairness metric available on the first trained model.
        """
        X = np.random.uniform(size=(30, 2))
        y = np.array(["A", "B", "C"] * 10)
        S = pd.DataFrame({"sensitive": ["D", "E"] * 15})

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            explain_level=0,
            train_ensemble=False,
            stack_models=False,
            validation_strategy={"validation_type": "split"},
            start_random_models=1,
        )

        automl.fit(X, y, sensitive_features=S)

        self.assertGreater(len(automl._models), 0)
        model = automl._models[0]

        expected_names = [f"sensitive__{label}" for label in ("A", "B", "C")]

        sensitive_features_names = model.get_sensitive_features_names()
        self.assertEqual(len(sensitive_features_names), 3)

        for name in expected_names:
            self.assertIn(name, sensitive_features_names)

        for name in expected_names:
            self.assertIsNotNone(model.get_fairness_metric(name))

        self.assertGreater(len(model.get_fairness_optimization()), 1)
        self.assertIsNotNone(model.get_worst_fairness())
        self.assertIsNotNone(model.get_best_fairness())
56 | 


--------------------------------------------------------------------------------
/tests/tests_fairness/test_regression.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class FairnessInRegressionTest(unittest.TestCase):
    """Tests fairness reporting for a regression task."""

    automl_dir = "automl_fairness_testing"

    def tearDown(self):
        # Remove everything the test wrote to disk.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _fit_first_model(self, sensitive, **extra_settings):
        """Fit Xgboost on random regression data and return the first model.

        Args:
            sensitive: DataFrame passed to `fit` as `sensitive_features`.
            **extra_settings: forwarded to the `AutoML` constructor.
        """
        features = np.random.uniform(size=(30, 2))
        target = np.random.randint(0, 100, size=(30,))

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            explain_level=0,
            train_ensemble=False,
            stack_models=False,
            start_random_models=1,
            **extra_settings,
        )
        automl.fit(features, target, sensitive_features=sensitive)

        self.assertGreater(len(automl._models), 0)
        return automl._models[0]

    def test_init(self):
        """One sensitive column yields one sensitive feature with fairness data."""
        sensitive = pd.DataFrame({"sensitive": ["A", "B"] * 15})
        model = self._fit_first_model(
            sensitive, validation_strategy={"validation_type": "split"}
        )

        names = model.get_sensitive_features_names()
        self.assertEqual(len(names), 1)
        self.assertIn("sensitive", names)

        self.assertIsNotNone(model.get_fairness_metric("sensitive"))
        self.assertGreater(len(model.get_fairness_optimization()), 1)
        self.assertIsNotNone(model.get_worst_fairness())
        self.assertIsNotNone(model.get_best_fairness())

    def test_two_sensitive_features(self):
        """Two sensitive columns yield two sensitive feature names."""
        sensitive = pd.DataFrame(
            {
                "sensitive_1": ["White", "Black"] * 15,
                "sensitive_2": ["Male", "Female"] * 15,
            }
        )
        model = self._fit_first_model(sensitive)

        self.assertEqual(len(model.get_sensitive_features_names()), 2)
71 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_preprocessing/__init__.py


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_datetime_transformer.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from supervised.preprocessing.datetime_transformer import DateTimeTransformer
 6 | 
 7 | 
 8 | class DateTimeTransformerTest(unittest.TestCase):
 9 |     def test_transformer(self):
10 |         d = {
11 |             "col1": [
12 |                 "2020/06/01",
13 |                 "2020/06/02",
14 |                 "2020/06/03",
15 |                 "2021/06/01",
16 |                 "2022/06/01",
17 |             ]
18 |         }
19 |         df = pd.DataFrame(data=d)
20 |         df["col1"] = pd.to_datetime(df["col1"])
21 |         df_org = df.copy()
22 | 
23 |         transf = DateTimeTransformer()
24 |         transf.fit(df, "col1")
25 |         df = transf.transform(df)
26 | 
27 |         self.assertTrue(df.shape[0] == 5)
28 |         self.assertTrue("col1" not in df.columns)
29 |         self.assertTrue("col1_Year" in df.columns)
30 | 
31 |         transf2 = DateTimeTransformer()
32 |         transf2.from_json(transf.to_json())
33 |         df2 = transf2.transform(df_org)
34 |         self.assertTrue("col1" not in df2.columns)
35 |         self.assertTrue("col1_Year" in df2.columns)
36 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_encoding_selector.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from supervised.preprocessing.encoding_selector import EncodingSelector
 6 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
 7 | 
 8 | 
 9 | class CategoricalIntegersTest(unittest.TestCase):
10 |     def test_selector(self):
11 |         d = {"col1": [f"{i}" for i in range(31)], "col2": ["a"] * 31}
12 |         df = pd.DataFrame(data=d)
13 | 
14 |         self.assertEqual(
15 |             EncodingSelector.get(df, None, "col1"),
16 |             PreprocessingCategorical.MANY_CATEGORIES,
17 |         )
18 |         self.assertEqual(
19 |             EncodingSelector.get(df, None, "col2"),
20 |             PreprocessingCategorical.FEW_CATEGORIES,
21 |         )
22 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_exclude_missing.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget
 7 | 
 8 | 
 9 | class ExcludeRowsMissingTargetTest(unittest.TestCase):
10 |     def test_transform(self):
11 |         d_test = {
12 |             "col1": [1, 1, np.nan, 3],
13 |             "col2": ["a", "a", np.nan, "a"],
14 |             "col3": [1, 1, 1, 3],
15 |             "col4": ["a", "a", "b", "c"],
16 |             "y": [np.nan, 1, np.nan, 2],
17 |         }
18 |         df_test = pd.DataFrame(data=d_test)
19 |         X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
20 |         y = df_test.loc[:, "y"]
21 | 
22 |         self.assertEqual(X.shape[0], 4)
23 |         self.assertEqual(y.shape[0], 4)
24 |         X, y, _, _ = ExcludeRowsMissingTarget.transform(X, y)
25 |         self.assertEqual(X.shape[0], 2)
26 |         self.assertEqual(y.shape[0], 2)
27 | 
28 |         self.assertEqual(y[0], 1)
29 |         self.assertEqual(y[1], 2)
30 | 
31 |     def test_transform_with_sample_weight(self):
32 |         d_test = {
33 |             "col1": [1, 1, np.nan, 3],
34 |             "col2": ["a", "a", np.nan, "a"],
35 |             "col3": [1, 1, 1, 3],
36 |             "col4": ["a", "a", "b", "c"],
37 |             "sample_weight": [1, 2, 3, 4],
38 |             "y": [np.nan, 1, np.nan, 2],
39 |         }
40 |         df_test = pd.DataFrame(data=d_test)
41 |         X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
42 |         y = df_test.loc[:, "y"]
43 |         sample_weight = df_test.loc[:, "sample_weight"]
44 | 
45 |         self.assertEqual(X.shape[0], 4)
46 |         self.assertEqual(y.shape[0], 4)
47 |         X, y, sw, _ = ExcludeRowsMissingTarget.transform(X, y, sample_weight)
48 |         self.assertEqual(X.shape[0], 2)
49 |         self.assertEqual(y.shape[0], 2)
50 |         self.assertEqual(sw.shape[0], 2)
51 | 
52 |         self.assertEqual(y[0], 1)
53 |         self.assertEqual(y[1], 2)
54 |         self.assertEqual(sw[0], 2)
55 |         self.assertEqual(sw[1], 4)
56 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_loo_encoder.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | from numpy.testing import assert_almost_equal
 6 | 
 7 | from supervised.preprocessing.loo_encoder import LooEncoder
 8 | 
 9 | # disable tests
10 | # class LabelEncoderTest(unittest.TestCase):
11 | #     def test_fit(self):
12 | #         # training data
13 | #         d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"], "y": [1, 2, 0]}
14 | #         df = pd.DataFrame(data=d)
15 | #         le = LooEncoder(cols=["col1"])
16 | #         le.fit(df[["col1", "col2"]], df["y"])
17 | 
18 | #         self.assertTrue(le.enc is not None)
19 | #         self.assertTrue(le.enc._dim == 2)
20 | #         assert_almost_equal(le.enc._mean, 1.0)
21 | #         self.assertTrue("col1" in le.enc.mapping)
22 | #         self.assertTrue("col2" not in le.enc.mapping)
23 | 
24 | #     def test_transform(self):
25 | #         # training data
26 | #         d = {"col1": ["a", "a", "c"]}
27 | #         y = [1, 1, 0]
28 | #         df = pd.DataFrame(data=d)
29 | #         # fit encoder
30 | #         le = LooEncoder(cols=["col1"])
31 | #         le.fit(df, y)
32 | #         t1 = le.transform(df)
33 | 
34 | #         # test data
35 | #         d_test = {"col1": ["c", "c", "a"]}
36 | #         df_test = pd.DataFrame(data=d_test)
37 | #         # transform
38 | #         t2 = le.transform(df_test)
39 | #         assert_almost_equal(t1["col1"][0], t2["col1"][2])
40 | #         assert_almost_equal(t1["col1"][2], t2["col1"][1])
41 | 
42 | #     def test_transform_with_new_and_missing_values(self):
43 | #         # training data
44 | #         d = {"col1": ["a", "a", "c"]}
45 | #         y = [1, 1, 1]
46 | #         df = pd.DataFrame(data=d)
47 | #         # fit encoder
48 | #         le = LooEncoder(cols=["col1"])
49 | #         le.fit(df, y)
50 | #         # test data
51 | #         d_test = {"col1": ["c", "a", "d", "f", np.nan]}
52 | #         df_test = pd.DataFrame(data=d_test)
53 | #         # transform
54 | #         t = le.transform(df_test)
55 | #         assert_almost_equal(t["col1"][2], 1)
56 | #         assert_almost_equal(t["col1"][3], 1)
57 | #         assert_almost_equal(t["col1"][4], 1)
58 | 
59 | #     def test_to_and_from_json(self):
60 | #         # training data
61 | #         d = {"col1": ["a", "a", "c"]}
62 | #         y = [1, 1, 1]
63 | #         df = pd.DataFrame(data=d)
64 | #         # fit encoder
65 | #         le = LooEncoder()
66 | #         le.fit(df, y)
67 | 
68 | #         # new encoder
69 | #         new_le = LooEncoder()
70 | #         new_le.from_json(le.to_json())
71 | 
72 | #         # test data
73 | #         d_test = {"col1": ["c", "c", "a", "e"]}
74 | #         df_test = pd.DataFrame(data=d_test)
75 | #         # transform
76 | #         t = new_le.transform(df_test)
77 | #         self.assertEqual(t["col1"][0], 1)
78 | #         self.assertEqual(t["col1"][1], 1)
79 | #         self.assertEqual(t["col1"][2], 1)
80 | #         self.assertEqual(t["col1"][3], 1)
81 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_preprocessing_utils.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
 7 | 
 8 | 
 9 | class PreprocessingUtilsTest(unittest.TestCase):
10 |     def test_get_type_numpy_number(self):
11 |         tmp = np.array([1, 2, 3])
12 |         tmp_type = PreprocessingUtils.get_type(tmp)
13 |         self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
14 | 
15 |     def test_get_type_numpy_categorical(self):
16 |         tmp = np.array(["a", "b", "c"])
17 |         tmp_type = PreprocessingUtils.get_type(tmp)
18 |         self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
19 | 
20 |     def test_get_type_pandas_bug(self):
21 |         d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
22 |         df = pd.DataFrame(data=d)
23 |         col1_type = PreprocessingUtils.get_type(df.loc[:, "col2"])
24 |         self.assertEqual(col1_type, PreprocessingUtils.CATEGORICAL)
25 | 
26 |     def test_get_type_pandas(self):
27 |         d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
28 |         df = pd.DataFrame(data=d)
29 |         col1_type = PreprocessingUtils.get_type(df["col1"])
30 |         self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
31 |         col2_type = PreprocessingUtils.get_type(df["col2"])
32 |         self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
33 | 
34 |     def test_get_stats(self):
35 |         tmp = np.array([1, np.nan, 2, 3, np.nan, np.nan])
36 |         self.assertEqual(1, PreprocessingUtils.get_min(tmp))
37 |         self.assertEqual(2, PreprocessingUtils.get_mean(tmp))
38 |         self.assertEqual(2, PreprocessingUtils.get_median(tmp))
39 |         d = {"col1": [1, 2, 1, 3, 1, np.nan], "col2": ["a", np.nan, "b", "a", "c", "a"]}
40 |         df = pd.DataFrame(data=d)
41 |         self.assertEqual(1, PreprocessingUtils.get_min(df["col1"]))
42 |         self.assertEqual(8.0 / 5.0, PreprocessingUtils.get_mean(df["col1"]))
43 |         self.assertEqual(1, PreprocessingUtils.get_median(df["col1"]))
44 | 
45 |         self.assertEqual(1, PreprocessingUtils.get_most_frequent(df["col1"]))
46 |         self.assertEqual("a", PreprocessingUtils.get_most_frequent(df["col2"]))
47 | 
48 | 
# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
51 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_scale.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | from numpy.testing import assert_almost_equal
 6 | 
 7 | from supervised.preprocessing.scale import Scale
 8 | 
 9 | 
class ScaleTest(unittest.TestCase):
    """Tests for Scale: fitting, transforming, inverting and JSON persistence."""

    def test_fit_log_and_normal(self):
        """SCALE_LOG_AND_NORMAL centers the column and survives a JSON round trip."""
        # training data
        df = pd.DataFrame(
            data={
                "col1": [12, 13, 3, 4, 5, 6, 7, 8000, 9000, 10000.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
                "col3": [12, 2, 3, 4, 5, 6, 7, 8000, 9000, 10000.0],
            }
        )

        scaler = Scale(["col1", "col3"], scale_method=Scale.SCALE_LOG_AND_NORMAL)
        scaler.fit(df)
        df = scaler.transform(df)
        first_scaled = float(df["col1"][0])

        assert_almost_equal(np.mean(df["col1"]), 0)
        # in case of wrong scaling the small values will be squeezed
        keeps_gap = df["col1"][0] + 0.01 < df["col1"][1]
        self.assertTrue(keeps_gap)

        df = scaler.inverse_transform(df)

        # Re-scale the restored data with a scaler loaded from the serialized state.
        reloaded = Scale()
        serialized = scaler.to_json()

        reloaded.from_json(serialized)
        df = reloaded.transform(df)
        assert_almost_equal(df["col1"][0], first_scaled)

    def test_fit(self):
        """Only the listed column is scaled, and inverse_transform restores it."""
        # training data
        df = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
            }
        )

        scaler = Scale(["col1"])
        scaler.fit(df)
        df = scaler.transform(df)

        assert_almost_equal(np.mean(df["col1"]), 0)
        assert_almost_equal(np.mean(df["col2"]), 25.5)

        df = scaler.inverse_transform(df)
        for position, expected in ((0, 1), (1, 2)):
            assert_almost_equal(df["col1"][position], expected)

    def test_to_and_from_json(self):
        """A scaler loaded from JSON transforms data without being fitted itself."""
        # training data
        df = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8.0, 9, 10],
                "col2": [21, 22.0, 23, 24, 25, 26, 27, 28, 29, 30],
            }
        )

        scaler = Scale(["col1"])
        scaler.fit(df)
        # do not transform
        assert_almost_equal(np.mean(df["col1"]), 5.5)
        assert_almost_equal(np.mean(df["col2"]), 25.5)

        # to and from json
        serialized = scaler.to_json()
        reloaded = Scale()
        reloaded.from_json(serialized)
        # transform with loaded scaler
        df = reloaded.transform(df)
        assert_almost_equal(np.mean(df["col1"]), 0)
        assert_almost_equal(np.mean(df["col2"]), 25.5)
80 | 


--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_text_transformer.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | from numpy.testing import assert_almost_equal
 5 | 
 6 | from supervised.preprocessing.text_transformer import TextTransformer
 7 | 
 8 | 
 9 | class TextTransformerTest(unittest.TestCase):
10 |     def test_transformer(self):
11 |         d = {
12 |             "col1": [
13 |                 "This is the first document.",
14 |                 "This document is the second document.",
15 |                 "And this is the third one.",
16 |                 None,
17 |                 "Is this the first document?",
18 |             ]
19 |         }
20 |         df = pd.DataFrame(data=d)
21 |         df_org = df.copy()
22 | 
23 |         transf = TextTransformer()
24 |         transf.fit(df, "col1")
25 |         df = transf.transform(df)
26 |         
27 |         self.assertTrue(df.shape[0] == 5)
28 |         self.assertTrue("col1" not in df.columns)
29 | 
30 |         transf2 = TextTransformer()
31 |         transf2.from_json(transf.to_json())
32 |         df2 = transf2.transform(df_org)
33 |         self.assertTrue("col1" not in df2.columns)
34 | 
35 |         assert_almost_equal(df.iloc[0, 0], df2.iloc[0, 0])
36 | 


--------------------------------------------------------------------------------
/tests/tests_tuner/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_tuner/__init__.py


--------------------------------------------------------------------------------
/tests/tests_tuner/test_hill_climbing.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from supervised.tuner.mljar_tuner import MljarTuner
 4 | 
 5 | 
 6 | class ModelMock:
 7 |     def __init__(self, name, model_type, final_loss, params):
 8 |         self.name = name
 9 |         self.model_type = model_type
10 |         self.final_loss = final_loss
11 |         self.params = params
12 | 
13 |     def get_name(self):
14 |         return self.name
15 | 
16 |     def get_type(self):
17 |         return self.model_type
18 | 
19 |     def get_final_loss(self):
20 |         return self.final_loss
21 | 
22 |     def get_train_time(self):
23 |         return 0.1
24 | 
25 | 
class TunerHillClimbingTest(unittest.TestCase):
    """Checks the numbering of models proposed by MljarTuner hill climbing."""

    def test_hill_climbing(self):
        """Every proposed model name must start with an index above all earlier ones."""
        # Two identical Random Forest mocks; only their names differ.
        models = [
            ModelMock(
                mock_name,
                "Random Forest",
                0.1,
                {
                    "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                    "preprocessing": {},
                    "validation_strategy": {},
                },
            )
            for mock_name in ("121_RandomForest", "1_RandomForest")
        ]

        tuner_params = {
            "start_random_models": 0,
            "hill_climbing_steps": 1,
            "top_models_to_improve": 2,
        }
        # NOTE(review): "Random Foresrt" looks like a typo of "Random Forest";
        # kept unchanged - confirm whether the tuner depends on this value.
        tuner = MljarTuner(
            tuner_params,
            algorithms=["Random Foresrt"],
            ml_task="binary_classification",
            eval_metric="logloss",
            validation_strategy={},
            explain_level=2,
            data_info={"columns_info": [], "target_info": []},
            golden_features=False,
            features_selection=False,
            train_ensemble=False,
            stack_models=False,
            adjust_validation=False,
            boost_on_errors=False,
            kmeans_features=False,
            mix_encoding=False,
            optuna_time_budget=None,
            optuna_init_params={},
            optuna_verbose=True,
            n_jobs=1,
            seed=12,
        )

        highest_index = 121  # largest numeric prefix among the initial mocks
        loss = 0.1
        for _ in range(5):
            for proposal in tuner.get_hill_climbing_params(models):
                # Register the proposal as trained, each time with a smaller loss.
                models.append(
                    ModelMock(proposal["name"], "Random Forest", loss, proposal)
                )
                loss *= 0.1
                numeric_prefix = int(proposal["name"].split("_")[0])
                self.assertTrue(numeric_prefix > highest_index)
                highest_index += 1
87 | 


--------------------------------------------------------------------------------
/tests/tests_tuner/test_time_controller.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import unittest
 3 | 
 4 | from numpy.testing import assert_almost_equal
 5 | 
 6 | from supervised.tuner.time_controller import TimeController
 7 | 
 8 | 
 9 | class TimeControllerTest(unittest.TestCase):
10 |     def test_to_and_from_json(self):
11 |         tc = TimeController(
12 |             start_time=time.time(),
13 |             total_time_limit=10,
14 |             model_time_limit=None,
15 |             steps=["simple_algorithms"],
16 |             algorithms=["Baseline"],
17 |         )
18 |         tc.log_time("1_Baseline", "Baseline", "simple_algorithms", 123.1)
19 | 
20 |         tc2 = TimeController.from_json(tc.to_json())
21 | 
22 |         assert_almost_equal(tc2.step_spend("simple_algorithms"), 123.1)
23 |         assert_almost_equal(tc2.model_spend("Baseline"), 123.1)
24 | 
25 |     def test_enough_time_for_stacking(self):
26 |         for t in [5, 10, 20]:
27 |             tc = TimeController(
28 |                 start_time=time.time(),
29 |                 total_time_limit=100,
30 |                 model_time_limit=None,
31 |                 steps=[
32 |                     "default_algorithms",
33 |                     "not_so_random",
34 |                     "golden_features",
35 |                     "insert_random_feature",
36 |                     "features_selection",
37 |                     "hill_climbing_1",
38 |                     "hill_climbing_3",
39 |                     "hill_climbing_5",
40 |                     "ensemble",
41 |                     "stack",
42 |                     "ensemble_stacked",
43 |                 ],
44 |                 algorithms=["Xgboost"],
45 |             )
46 |             tc.log_time("1_Xgboost", "Xgboost", "default_algorithms", t)
47 |             tc.log_time("2_Xgboost", "Xgboost", "not_so_random", t)
48 |             tc.log_time("3_Xgboost", "Xgboost", "insert_random_feature", t)
49 |             tc.log_time("4_Xgboost", "Xgboost", "features_selection", t)
50 |             tc.log_time("5_Xgboost", "Xgboost", "hill_climbing_1", t)
51 |             tc.log_time("6_Xgboost", "Xgboost", "hill_climbing_2", t)
52 |             tc.log_time("7_Xgboost", "Xgboost", "hill_climbing_3", t)
53 | 
54 |             tc._start_time = time.time() - 7 * t
55 |             assert_almost_equal(tc.already_spend(), 7 * t)
56 |             if t < 20:
57 |                 self.assertTrue(tc.enough_time("Xgboost", "stack"))
58 |             else:
59 |                 self.assertFalse(tc.enough_time("Xgboost", "stack"))
60 |             self.assertTrue(tc.enough_time("Ensemble_Stacked", "ensemble_stacked"))
61 | 


--------------------------------------------------------------------------------
/tests/tests_tuner/test_tuner.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from supervised.tuner.mljar_tuner import MljarTuner
 4 | 
 5 | 
 6 | class TunerTest(unittest.TestCase):
 7 |     def test_key_params(self):
 8 |         params1 = {
 9 |             "preprocessing": {"p1": 1, "p2": 2},
10 |             "learner": {"p1": 1, "p2": 2},
11 |             "validation_strategy": {},
12 |         }
13 |         params2 = {
14 |             "preprocessing": {"p1": 1, "p2": 2},
15 |             "learner": {"p2": 2, "p1": 1},
16 |             "validation_strategy": {},
17 |         }
18 |         key1 = MljarTuner.get_params_key(params1)
19 |         key2 = MljarTuner.get_params_key(params2)
20 |         self.assertEqual(key1, key2)
21 | 


--------------------------------------------------------------------------------
/tests/tests_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_utils/__init__.py


--------------------------------------------------------------------------------
/tests/tests_utils/test_compute_additional_metrics.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | 
 5 | from supervised.algorithms.registry import BINARY_CLASSIFICATION, REGRESSION
 6 | from supervised.utils.additional_metrics import AdditionalMetrics
 7 | 
 8 | 
 9 | class ComputeAdditionalMetricsTest(unittest.TestCase):
10 |     def test_compute(self):
11 |         target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
12 |         pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8])
13 |         info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
14 |         details = info["metric_details"]
15 |         max_metrics = info["max_metrics"]
16 |         conf = info["confusion_matrix"]
17 |         self.assertEqual(conf.iloc[0, 0], 3)
18 |         self.assertEqual(conf.iloc[1, 1], 3)
19 |         self.assertTrue(details is not None)
20 |         self.assertTrue(max_metrics is not None)
21 | 
22 |     def test_compute_f1(self):
23 |         target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
24 |         pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
25 |         info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
26 |         details = info["metric_details"]
27 |         max_metrics = info["max_metrics"]
28 |         conf = info["confusion_matrix"]
29 |         self.assertEqual(max_metrics["f1"]["score"], 1)
30 |         self.assertTrue(details is not None)
31 |         self.assertTrue(conf is not None)
32 | 
33 |     def test_compute_for_regression(self):
34 |         target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
35 |         pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
36 |         info = AdditionalMetrics.compute(target, pred, None, REGRESSION)
37 |         all_metrics = list(info["max_metrics"]["Metric"].values)
38 |         for m in ["MAE", "MSE", "RMSE", "R2"]:
39 |             self.assertTrue(m in all_metrics)
40 | 
41 |     def test_compute_constant_preds(self):
42 |         target = np.array([0, 0, 1, 1, 0, 0, 0, 0])
43 |         pred = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
44 |         info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
45 |         details = info["metric_details"]
46 |         max_metrics = info["max_metrics"]
47 |         conf = info["confusion_matrix"]
48 |         self.assertTrue(max_metrics["f1"]["score"] < 1)
49 |         self.assertTrue(max_metrics["mcc"]["score"] < 1)
50 | 


--------------------------------------------------------------------------------
/tests/tests_utils/test_importance.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | from sklearn.tree import DecisionTreeClassifier
 8 | 
 9 | from supervised.utils.importance import PermutationImportance
10 | 
11 | 
class PermutationImportanceTest(unittest.TestCase):
    """Tests for PermutationImportance.compute_and_plot."""

    def test_compute_and_plot(self):
        """Running on a tiny random dataset writes the importance CSV file."""
        rows = 20
        feature_names = [f"f{i}" for i in range(3)]
        X = pd.DataFrame(np.random.rand(rows, 3), columns=feature_names)
        y = np.random.randint(0, 2, rows)

        model = DecisionTreeClassifier(max_depth=1)
        model.fit(X, y)

        # The temporary directory is removed when the with-block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            PermutationImportance.compute_and_plot(
                model,
                X_validation=X,
                y_validation=y,
                model_file_path=tmpdir,
                learner_name="learner_test",
                metric_name=None,
                ml_task="binary_classification",
            )
            importance_csv = os.path.join(tmpdir, "learner_test_importance.csv")
            self.assertTrue(os.path.exists(importance_csv))
35 | 


--------------------------------------------------------------------------------
/tests/tests_utils/test_learning_curves.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from supervised.utils.learning_curves import LearningCurves
 5 | 
 6 | 
 7 | class LearningCurvesTest(unittest.TestCase):
 8 |     def test_plot_close(self):
 9 |         """
10 |         Test if we close plots. To avoid following warning:
11 |         RuntimeWarning: More than 20 figures have been opened.
12 |         Figures created through the pyplot interface (`matplotlib.pyplot.figure`)
13 |         are retained until explicitly closed and may consume too much memory.
14 |         """
15 |         for _ in range(
16 |             1
17 |         ):  # you can increase the range, for tests speed reason I keep it low
18 |             LearningCurves.plot_for_ensemble([3, 2, 1], "random_metrics", ".")
19 | 
20 |         os.remove(LearningCurves.output_file_name)
21 | 


--------------------------------------------------------------------------------
/tests/tests_utils/test_metric.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | from numpy.testing import assert_almost_equal
 5 | 
 6 | from supervised.utils.metric import Metric
 7 | from supervised.utils.metric import UserDefinedEvalMetric
 8 | 
 9 | 
class MetricTest(unittest.TestCase):
    """Tests for Metric and for user-defined evaluation metrics."""

    def test_create(self):
        """Logloss is small for correct predictions and large for inverted ones."""
        logloss = Metric({"name": "logloss"})

        correct = logloss(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))
        self.assertTrue(correct < 0.1)

        inverted = logloss(np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0]))
        self.assertTrue(inverted > 1.0)

    def test_metric_improvement(self):
        """Going from one wrong prediction to none counts as an improvement."""
        logloss = Metric({"name": "logloss"})
        one_error = logloss(np.array([0, 0, 1, 1]), np.array([0, 0, 0, 1]))
        no_error = logloss(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))
        self.assertTrue(logloss.improvement(one_error, no_error))

    def test_sample_weight(self):
        """Uniform sample weights give the same score as no weights at all."""
        metric_names = ["logloss", "auc", "acc", "rmse", "mse", "mae", "r2", "mape"]
        for metric_name in metric_names:
            metric = Metric({"name": metric_name})
            y_true = np.array([0, 0, 1, 1])
            y_predicted = np.array([0, 0, 0, 1])
            uniform_weight = np.array([1, 1, 1, 1])

            unweighted = metric(y_true, y_predicted)
            weighted = metric(y_true, y_predicted, uniform_weight)
            assert_almost_equal(unweighted, weighted)

    def test_r2_metric(self):
        """Perfect predictions score -1.0, i.e. the metric returns negated r2."""
        r2 = Metric({"name": "r2"})
        score = r2(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))
        self.assertEqual(score, -1.0)  # negative r2

    def test_mape_metric(self):
        """Perfect predictions give a MAPE of 0."""
        mape = Metric({"name": "mape"})
        score = mape(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))
        self.assertEqual(score, 0.0)

    def test_user_defined_metric(self):
        """A registered custom function is used under the "user_defined_metric" name."""

        def custom(x, y, sample_weight=None):
            return np.sum(x + y)

        UserDefinedEvalMetric().set_metric(custom)

        metric = Metric({"name": "user_defined_metric"})

        ones = np.array([1, 1, 1])

        # sum(ones + ones) over three elements is 6.
        self.assertEqual(metric(ones, ones), 6)
75 | 


--------------------------------------------------------------------------------
/tests/tests_utils/test_shap.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.utils.shap import PlotSHAP
 7 | 
 8 | 
 9 | class PlotSHAPTest(unittest.TestCase):
10 |     def test_get_sample_data_larger_1k(self):
11 |         """Get sample when data is larger than 1k"""
12 |         X = pd.DataFrame(np.random.uniform(size=(5763, 31)))
13 |         y = pd.Series(np.random.randint(0, 2, size=(5763,)))
14 | 
15 |         X_, y_ = PlotSHAP.get_sample(X, y)
16 | 
17 |         self.assertEqual(X_.shape[0], 1000)
18 |         self.assertEqual(y_.shape[0], 1000)
19 | 
20 |     def test_get_sample_data_smaller_1k(self):
21 |         """Get sample when data is smaller than 1k"""
22 |         SAMPLES = 100
23 |         X = pd.DataFrame(np.random.uniform(size=(SAMPLES, 31)))
24 |         y = pd.Series(np.random.randint(0, 2, size=(SAMPLES,)))
25 | 
26 |         X_, y_ = PlotSHAP.get_sample(X, y)
27 | 
28 |         self.assertEqual(X_.shape[0], SAMPLES)
29 |         self.assertEqual(y_.shape[0], SAMPLES)
30 | 


--------------------------------------------------------------------------------
/tests/tests_utils/test_subsample.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.algorithms.registry import REGRESSION
 7 | from supervised.utils.subsample import subsample
 8 | 
 9 | 
class SubsampleTest(unittest.TestCase):
    """Tests for the subsample helper."""

    def test_subsample_regression_10k(self):
        """Splitting 10000 rows with train_size=1000 yields 1000 train and 9000 test rows."""
        rows = 10000
        cols = 51
        X = np.random.rand(rows, cols)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(cols)])
        y = pd.Series(np.random.rand(rows), name="target")

        X_train, X_test, y_train, y_test = subsample(
            X, y, train_size=1000, ml_task=REGRESSION
        )

        # assertEqual compares the sizes; assertTrue(value, 1000) would treat
        # 1000 as the failure message and pass for any non-zero row count.
        self.assertEqual(X_train.shape[0], 1000)
        self.assertEqual(X_test.shape[0], 9000)
        self.assertEqual(y_train.shape[0], 1000)
        self.assertEqual(y_test.shape[0], 9000)
26 | 


--------------------------------------------------------------------------------
/tests/tests_validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mljar/mljar-supervised/c19f8540e02d462c0df3c36493ff866762dbc430/tests/tests_validation/__init__.py


--------------------------------------------------------------------------------