├── .editorconfig ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ ├── feature_request.md │ └── question.md └── workflows │ ├── CI.yml │ ├── docs.yml │ ├── mirror.yml │ ├── publish_pypi.yml │ ├── tests_macos.yml │ ├── tests_ubuntu.yml │ └── tests_windows.yml ├── .gitignore ├── .gitlab ├── .gitlab-ci.yml └── release.yml ├── .jupyter └── jupyter_notebook_config.py ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── LICENSE ├── README.md ├── _config.yml ├── check_docs.py ├── docs ├── Makefile ├── _static │ └── style.css ├── _templates │ ├── autosummary │ │ ├── class.rst │ │ └── module.rst │ ├── classtemplate.rst │ └── functiontemplate.rst ├── conf.py ├── index.rst ├── mock_docs.py ├── pages │ ├── Installation.rst │ ├── Kaggle_Kernels.rst │ ├── Others.rst │ ├── Python-API.rst │ ├── Tutorials.rst │ ├── modules │ │ ├── addons.rst │ │ ├── automl.rst │ │ ├── dataset.rst │ │ ├── image.rst │ │ ├── ml_algo.rst │ │ ├── ml_algo.torch_based.rst │ │ ├── ml_algo.tuning.rst │ │ ├── pipelines.features.rst │ │ ├── pipelines.ml.rst │ │ ├── pipelines.rst │ │ ├── pipelines.selection.rst │ │ ├── reader.rst │ │ ├── report.rst │ │ ├── tasks.losses.rst │ │ ├── tasks.rst │ │ ├── text.rst │ │ ├── transformers.rst │ │ ├── utils.rst │ │ └── validation.rst │ └── tutorials │ │ ├── Tutorial_10_relational_data_with_star_scheme.nblink │ │ ├── Tutorial_11_time_series.nblink │ │ ├── Tutorial_12_AA_Test.nblink │ │ ├── Tutorial_13_AA_Test_multigroup_split.nblink │ │ ├── Tutorial_14_AB_Test.nblink │ │ ├── Tutorial_15_Matching.nblink │ │ ├── Tutorial_16_Matching_without_replacement.nblink │ │ ├── Tutorial_17_Modeling_Limit_Distribution.nblink │ │ ├── Tutorial_18_Test_Limit_Distribution.nblink │ │ ├── Tutorial_1_basics.nblink │ │ ├── Tutorial_2_WhiteBox_AutoWoE.nblink │ │ ├── Tutorial_3_sql_data_source.nblink │ │ ├── Tutorial_4_NLP_Interpretation.nblink │ │ ├── Tutorial_5_uplift.nblink │ │ ├── Tutorial_6_custom_pipeline.nblink │ │ ├── 
Tutorial_7_ICE_and_PDP_interpretation.nblink │ │ ├── Tutorial_8_CV_preset.nblink │ │ └── Tutorial_9_neural_networks.nblink └── requirements.txt ├── examples ├── README.md ├── data │ ├── ai92_value_77.csv │ ├── avito1k_train.csv │ ├── jobs_train.csv │ ├── meal_delivery_company │ │ ├── fulfilment_center_info.csv │ │ ├── meal_info.csv │ │ └── relational_main.csv.zip │ ├── sampled_app_train.csv │ └── ts_data.csv ├── demo0.py ├── demo1.py ├── demo10.py ├── demo11.py ├── demo12.py ├── demo13.py ├── demo14.py ├── demo15.py ├── demo2.py ├── demo3.py ├── demo4.py ├── demo5.py ├── demo6.py ├── demo7.py ├── demo8.py ├── demo9.py ├── optimization │ ├── conditional_parameters.py │ ├── custom_search_space.py │ └── sequential_parameter_search.py ├── simple_tabular_classification.py └── tutorials │ ├── Tutorial_10_relational_data_with_star_scheme.ipynb │ ├── Tutorial_11_time_series.ipynb │ ├── Tutorial_12_AA_Test.ipynb │ ├── Tutorial_13_AA_Test_multigroup_split.ipynb │ ├── Tutorial_14_AB_Test.ipynb │ ├── Tutorial_15_Matching.ipynb │ ├── Tutorial_16_Matching_without_replacement.ipynb │ ├── Tutorial_17_Modeling_Limit_Distribution.ipynb │ ├── Tutorial_18_Test_Limit_Distribution.ipynb │ ├── Tutorial_1_basics.ipynb │ ├── Tutorial_2_WhiteBox_AutoWoE.ipynb │ ├── Tutorial_3_sql_data_source.ipynb │ ├── Tutorial_4_NLP_Interpretation.ipynb │ ├── Tutorial_5_uplift.ipynb │ ├── Tutorial_6_custom_pipeline.ipynb │ ├── Tutorial_7_ICE_and_PDP_interpretation.ipynb │ ├── Tutorial_8_CV_preset.ipynb │ └── Tutorial_9_neural_networks.ipynb ├── imgs ├── GENERALL2X2.jpg ├── LightAutoML_logo_big.png ├── LightAutoML_logo_small.png ├── Star_scheme_tables.png ├── TabularAutoML_model_descr.png ├── TabularUtilizedAutoML_model_descr.png ├── autoint.png ├── denselight.png ├── densenet.png ├── fttransformer.png ├── lightautoml_icon_color.png ├── lightautoml_logo_color.png ├── lime.jpg ├── node.png ├── resnet.png ├── swa.png ├── tutorial_11_case_problem_statement.png ├── tutorial_11_general_problem_statement.png ├── 
tutorial_11_history_step_params.png ├── tutorial_11_transformers_params.png ├── tutorial_1_initial_report.png ├── tutorial_1_laml_big.png ├── tutorial_1_ml_pipeline.png ├── tutorial_1_pipeline.png ├── tutorial_1_unfolded_report.png ├── tutorial_2_initial_report.png ├── tutorial_2_pipeline.png ├── tutorial_2_unfolded_report.png ├── tutorial_3_initial_report.png ├── tutorial_3_unfolded_report.png ├── tutorial_blackbox_pipeline.png ├── tutorial_whitebox_report_1.png ├── tutorial_whitebox_report_2.png ├── tutorial_whitebox_report_3.png └── tutorial_whitebox_report_4.png ├── lightautoml ├── __init__.py ├── addons │ ├── __init__.py │ ├── autots │ │ └── base.py │ ├── hypex │ │ └── __init__.py │ ├── interpretation │ │ ├── __init__.py │ │ ├── data_process.py │ │ ├── l2x.py │ │ ├── l2x_model.py │ │ ├── lime.py │ │ └── utils.py │ ├── tabular_interpretation │ │ ├── __init__.py │ │ └── sswarm.py │ ├── uplift │ │ ├── __init__.py │ │ ├── base.py │ │ ├── metalearners.py │ │ ├── metrics.py │ │ └── utils.py │ └── utilization │ │ ├── __init__.py │ │ └── utilization.py ├── automl │ ├── __init__.py │ ├── base.py │ ├── blend.py │ └── presets │ │ ├── __init__.py │ │ ├── base.py │ │ ├── image_config.yml │ │ ├── image_presets.py │ │ ├── tabular_config.yml │ │ ├── tabular_configs │ │ ├── conf_0_sel_type_0.yml │ │ ├── conf_1_sel_type_1.yml │ │ ├── conf_2_select_mode_1_no_typ.yml │ │ ├── conf_3_sel_type_1_no_inter_lgbm.yml │ │ ├── conf_4_sel_type_0_no_int.yml │ │ ├── conf_5_sel_type_1_tuning_full.yml │ │ └── conf_6_sel_type_1_tuning_full_no_int_lgbm.yml │ │ ├── tabular_presets.py │ │ ├── text_config.yml │ │ ├── text_presets.py │ │ ├── time_series_config.yml │ │ ├── utils.py │ │ ├── whitebox_config.yml │ │ └── whitebox_presets.py ├── dataset │ ├── __init__.py │ ├── base.py │ ├── np_pd_dataset.py │ ├── roles.py │ ├── seq_np_pd_dataset.py │ └── utils.py ├── image │ ├── __init__.py │ ├── image.py │ └── utils.py ├── ml_algo │ ├── __init__.py │ ├── base.py │ ├── boost_cb.py │ ├── boost_lgbm.py │ 
├── boost_xgb.py │ ├── dl_model.py │ ├── linear_sklearn.py │ ├── random_forest.py │ ├── tabnet │ │ └── utils.py │ ├── torch_based │ │ ├── __init__.py │ │ ├── autoint │ │ │ ├── autoint_utils.py │ │ │ └── ghost_norm.py │ │ ├── fttransformer │ │ │ └── fttransformer_utils.py │ │ ├── linear_model.py │ │ ├── nn_models.py │ │ └── node_nn_model.py │ ├── tuning │ │ ├── __init__.py │ │ ├── base.py │ │ ├── hyperopt.py │ │ └── optuna.py │ ├── utils.py │ └── whitebox.py ├── pipelines │ ├── __init__.py │ ├── features │ │ ├── __init__.py │ │ ├── base.py │ │ ├── generator_pipeline.py │ │ ├── image_pipeline.py │ │ ├── lgb_pipeline.py │ │ ├── linear_pipeline.py │ │ ├── text_pipeline.py │ │ ├── torch_pipeline.py │ │ └── wb_pipeline.py │ ├── ml │ │ ├── __init__.py │ │ ├── base.py │ │ ├── nested_ml_pipe.py │ │ └── whitebox_ml_pipe.py │ ├── selection │ │ ├── __init__.py │ │ ├── base.py │ │ ├── importance_based.py │ │ ├── linear_selector.py │ │ └── permutation_importance_based.py │ └── utils.py ├── reader │ ├── __init__.py │ ├── base.py │ ├── guess_roles.py │ ├── seq.py │ ├── tabular_batch_generator.py │ └── utils.py ├── report │ ├── __init__.py │ ├── lama_report_templates │ │ ├── binary_inference_section.html │ │ ├── feature_importance_section.html │ │ ├── feature_importance_utillized_section.html │ │ ├── interpretation_section.html │ │ ├── interpretation_subsection.html │ │ ├── lama_base_template.html │ │ ├── model_section.html │ │ ├── model_section_utilized.html │ │ ├── multiclass_inference_section.html │ │ ├── nlp_section.html │ │ ├── nlp_subsection.html │ │ ├── preset_section.html │ │ ├── reg_inference_section.html │ │ ├── results_section.html │ │ ├── train_set_section.html │ │ ├── train_set_section_utilized.html │ │ ├── uplift_section.html │ │ ├── uplift_subsection.html │ │ ├── utilized_data_subsections.html │ │ └── whitebox_section.html │ └── report_deco.py ├── tasks │ ├── __init__.py │ ├── base.py │ ├── common_metric.py │ ├── losses │ │ ├── __init__.py │ │ ├── base.py │ │ ├── 
cb.py │ │ ├── cb_custom.py │ │ ├── lgb.py │ │ ├── lgb_custom.py │ │ ├── sklearn.py │ │ ├── torch.py │ │ └── xgb.py │ └── utils.py ├── text │ ├── __init__.py │ ├── dl_transformers.py │ ├── dp_utils.py │ ├── embed.py │ ├── embed_dataset.py │ ├── nn_model.py │ ├── tokenizer.py │ ├── trainer.py │ ├── utils.py │ └── weighted_average_transformer.py ├── transformers │ ├── __init__.py │ ├── base.py │ ├── categorical.py │ ├── composite.py │ ├── datetime.py │ ├── decomposition.py │ ├── generator.py │ ├── groupby.py │ ├── image.py │ ├── numeric.py │ ├── seq.py │ ├── text.py │ └── utils.py ├── utils │ ├── __init__.py │ ├── installation.py │ ├── logging.py │ └── timer.py └── validation │ ├── __init__.py │ ├── base.py │ ├── np_iterators.py │ └── utils.py ├── pyproject.toml ├── scripts ├── README.md ├── exp_branch_push.py ├── experiments │ ├── run.py │ ├── run_tabular.py │ └── utils.py ├── poetry_fix.py └── run_tutorials.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── integration │ ├── integration_utils.py │ ├── test_custom_2_level_stacking.py │ ├── test_demo0.py │ ├── test_demo1.py │ ├── test_demo10.py │ ├── test_demo11.py │ ├── test_demo12.py │ ├── test_demo13.py │ ├── test_demo15.py │ ├── test_demo2.py │ ├── test_demo3.py │ ├── test_demo4.py │ ├── test_demo5.py │ ├── test_demo6.py │ ├── test_demo7.py │ └── test_demo8.py └── unit │ ├── __init__.py │ ├── test_addons │ └── __init__.py │ ├── test_automl │ ├── __init__.py │ └── test_presets │ │ ├── __init__.py │ │ ├── presets_utils.py │ │ ├── test_tabularautoml.py │ │ ├── test_tabularautoml_nn.py │ │ ├── test_tabularautoml_xgb.py │ │ ├── test_tabularnlpautoml.py │ │ ├── test_tabularutilizedautoml.py │ │ ├── test_uplift.py │ │ └── test_whiteboxpreset.py │ ├── test_dataset │ └── __init__.py │ ├── test_image │ └── __init__.py │ ├── test_ml_algo │ ├── __init__.py │ └── test_optimization │ │ └── optuna │ │ └── test_optuna_tuner.py │ ├── test_pipelines │ └── __init__.py │ ├── test_reader │ └── __init__.py │ 
├── test_report │ └── __init__.py │ ├── test_tasks │ └── __init__.py │ ├── test_text │ └── __init__.py │ ├── test_transformers │ ├── __init__.py │ └── test_numeric.py │ ├── test_utils │ ├── __init__.py │ └── test_logging.py │ └── test_validation │ └── __init__.py └── tox.ini /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug 3 | about: Create a bug report 4 | title: '' 5 | labels: bug 6 | assignees: 7 | 8 | --- 9 | 10 | ## 🐛 Bug 11 | 12 | 13 | 14 | ## Environment 15 | 1. lightautoml version: 16 | 2. python version: 17 | 3. OS: 18 | 4. pip freeze file: 19 | 20 | ### To Reproduce 21 | Steps to reproduce the behavior: 22 | 1. first step 23 | 2. second step 24 | 25 | 26 | 27 | ### Expected behavior 28 | 29 | 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | title: "[Bug]: " 4 | labels: ["bug", "triage"] 5 | assignees: 6 | - octocat 7 | body: 8 | - type: markdown 9 | attributes: 10 | value: | 11 | Thanks for taking the time to fill out this bug report! 12 | - type: input 13 | id: contact 14 | attributes: 15 | label: Contact Details 16 | description: How can we get in touch with you if we need more info? 17 | placeholder: ex. email@example.com 18 | validations: 19 | required: false 20 | - type: textarea 21 | id: what-happened 22 | attributes: 23 | label: What happened? 24 | description: Also tell us, what did you expect to happen? 25 | placeholder: Tell us what you see! 26 | value: "A bug happened!" 27 | validations: 28 | required: true 29 | - type: dropdown 30 | id: version 31 | attributes: 32 | label: Version 33 | description: What version of our software are you running? 
34 | options: 35 | - 1.0.2 (Default) 36 | - 1.0.3 (Edge) 37 | validations: 38 | required: true 39 | - type: dropdown 40 | id: browsers 41 | attributes: 42 | label: What browsers are you seeing the problem on? 43 | multiple: true 44 | options: 45 | - Firefox 46 | - Chrome 47 | - Safari 48 | - Microsoft Edge 49 | - type: textarea 50 | id: logs 51 | attributes: 52 | label: Relevant log output 53 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 54 | render: shell 55 | - type: checkboxes 56 | id: terms 57 | attributes: 58 | label: Code of Conduct 59 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://example.com) 60 | options: 61 | - label: I agree to follow this project's Code of Conduct 62 | required: true 63 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest a feature to implement 4 | title: '' 5 | labels: enhancement 6 | assignees: 7 | 8 | --- 9 | 10 | ## 🚀 Feature Request 11 | 12 | 13 | 14 | ### Motivation 15 | 16 | 17 | 18 | ### Proposal 19 | 20 | 21 | 22 | ### Alternatives 23 | 24 | 25 | 26 | ### Additional context 27 | 28 | 29 | 30 | ### Checklist 31 | - [ ] feature proposal description 32 | - [ ] motivation 33 | - [ ] additional context / proposal alternatives review 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: How to question 3 | about: Asking how-to questions 4 | title: '' 5 | labels: help wanted, question 6 | assignees: 7 | --- 8 | 9 | ## ❓ Questions and Help 10 | 11 | ### Before asking: 12 | 1. search the issues. 13 | 2. search the docs. 
14 | 15 | 16 | #### What is your question? 17 | 18 | 19 | #### Code 20 | 21 | 22 | 23 | #### What have you tried? 24 | 25 | 26 | ### Additional context 27 | 28 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # Manually triggerable in github 5 | workflow_dispatch: 6 | 7 | push: 8 | paths-ignore: 9 | - "docs/**" 10 | - "*.md" 11 | - ".github/workflows/mirror.yml" 12 | - ".gitlab/.gitlab-ci.yml" 13 | 14 | pull_request: 15 | paths-ignore: 16 | - "docs/**" 17 | - "*.md" 18 | - ".github/workflows/mirror.yml" 19 | - ".gitlab/.gitlab-ci.yml" 20 | 21 | jobs: 22 | pre-commit: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: checkout 26 | uses: actions/checkout@v4 27 | 28 | - uses: actions/setup-python@v4 29 | with: 30 | python-version: "3.9" 31 | 32 | - name: pre-commit 33 | uses: pre-commit/action@v2.0.3 34 | 35 | linux-py39-tests: 36 | needs: pre-commit 37 | runs-on: ubuntu-latest 38 | if: | 39 | ( github.event_name == 'push' ) && ( needs.pre-commit.result == 'success' ) 40 | || 41 | ( github.event_name == 'pull_request' ) && ( needs.pre-commit.result == 'success' ) 42 | || 43 | ( github.event_name == 'workflow_dispatch' ) && ( needs.pre-commit.result == 'success' ) 44 | 45 | steps: 46 | - uses: actions/checkout@v4 47 | 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | 51 | - uses: Gr1N/setup-poetry@v8 52 | with: 53 | poetry-version: 1.1.7 54 | 55 | # - name: update pip if python 3.12 56 | # run: pip install setuptools && python -m ensurepip --upgrade 57 | 58 | - name: install deps for Ubuntu 59 | run: sudo apt-get install build-essential libcairo2 libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info 60 | 61 | - name: install tox 62 | run: | 63 | python3 -m pip install --upgrade pip 64 | pip3 install tox==3.28.0 65 | pip3 install tox-gh-actions==2.12.0 66 
| 67 | - name: test with tox 68 | run: | 69 | tox 70 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | on: 4 | # At 20:59 every tuesday (23:59 MSK) 5 | schedule: 6 | - cron: 59 20 * * 2 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | jobs: 12 | codespell: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: checkout 17 | uses: actions/checkout@v4 18 | 19 | - name: codespell 20 | uses: codespell-project/actions-codespell@v2 21 | 22 | docs: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: checkout 26 | uses: actions/checkout@v4 27 | 28 | - name: setup Python 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: 3.8 32 | 33 | - uses: Gr1N/setup-poetry@v9 34 | with: 35 | poetry-version: 1.1.7 36 | 37 | - name: installation pandoc 38 | run: | 39 | wget https://github.com/jgm/pandoc/releases/download/2.14.0.3/pandoc-2.14.0.3-1-amd64.deb 40 | sudo dpkg -i pandoc-2.14.0.3-1-amd64.deb 41 | 42 | - name: poetry install 43 | run: | 44 | poetry run python scripts/poetry_fix.py -c 45 | poetry install -E all 46 | 47 | - name: make documentations 48 | run: | 49 | cd docs 50 | poetry run make html 51 | -------------------------------------------------------------------------------- /.github/workflows/mirror.yml: -------------------------------------------------------------------------------- 1 | name: Mirror repo 2 | 3 | on: 4 | push: 5 | 6 | # Manually triggerable in github 7 | workflow_dispatch: 8 | 9 | jobs: 10 | mirror: 11 | runs-on: "ubuntu-latest" 12 | steps: 13 | - name: Configure Private Key 14 | env: 15 | SSH_PRIVATE_KEY: ${{ secrets.DESITNATION_REPO_PRIVATE_KEY }} 16 | run: | 17 | mkdir -p ~/.ssh 18 | echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa 19 | chmod 600 ~/.ssh/id_rsa 20 | echo "Host *" >> ~/.ssh/config 21 | echo " StrictHostKeyChecking no" >> ~/.ssh/config 22 
| echo " CheckHostIP no" >> ~/.ssh/config 23 | echo " LogLevel ERROR" >> ~/.ssh/config 24 | echo " UserKnownHostsFile /dev/null" >> ~/.ssh/config 25 | 26 | - name: Push mirror 27 | env: 28 | SOURCE_REPO: "https://github.com/${{ github.repository }}.git" 29 | DESTINATION_REPO: "${{ secrets.DESTINATION_REPO }}" 30 | BASE_REPO: "https://github.com/${{ github.repository }}" 31 | run: | 32 | git clone --quiet "$SOURCE_REPO" && cd `basename "$BASE_REPO"` 33 | 34 | git config --global user.name "${{ github.actor }}" 35 | git config --global user.email "bot@example.com" 36 | 37 | BRANCH=${{ github.head_ref || github.ref_name }} 38 | EVENT_ACTION=${{ github.event_name }} 39 | if [[ "$branch" == "master" ]] || [[ "$branch" == "developer" ]] || [[ "$EVENT_ACTION" == "workflow_dispatch" ]] 40 | then 41 | FORCE_FLAG="--force" 42 | else 43 | FORCE_FLAG="" 44 | fi 45 | 46 | git checkout --quiet $BRANCH 47 | 48 | git remote set-url --push origin "$DESTINATION_REPO" 49 | git push $FORCE_FLAG --quiet -u origin $BRANCH 50 | -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yml: -------------------------------------------------------------------------------- 1 | name: build and publish to pypi 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | # workflow_dispatch: 8 | # inputs: 9 | # tag: 10 | # description: 'Tag' 11 | # required: true 12 | # default: 'v0.0.0' 13 | 14 | jobs: 15 | deploy: 16 | 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: JRubics/poetry-publish@v2.0 23 | with: 24 | pypi_token: ${{ secrets.LAMA_PYPI_TOKEN }} 25 | -------------------------------------------------------------------------------- /.github/workflows/tests_macos.yml: -------------------------------------------------------------------------------- 1 | name: tests_macos 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in 
github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["tests_ubuntu"] 13 | branches: [master] 14 | types: 15 | - completed 16 | 17 | jobs: 18 | macos-tests: 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | runs-on: macos-latest 21 | strategy: 22 | fail-fast: true 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Set up Python (Conda) 30 | uses: conda-incubator/setup-miniconda@v3 31 | with: 32 | auto-update-conda: true 33 | channels: conda-forge 34 | python-version: ${{ matrix.python-version }} 35 | auto-activate-base: true 36 | activate-environment: test 37 | 38 | - name: install deps for MacOS 39 | run: brew update && brew install libomp cairo pango gdk-pixbuf libffi 40 | 41 | - name: install conda dependencies 42 | run: | 43 | conda install pip numpy==1.26.4 44 | 45 | - name: install with pip 46 | run: | 47 | pip install tox==3.28.0 48 | pip install tox-gh-actions==2.12.0 49 | 50 | - name: test with tox 51 | run: | 52 | tox 53 | -------------------------------------------------------------------------------- /.github/workflows/tests_ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: tests_ubuntu 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["CI"] 13 | types: 14 | - completed 15 | 16 | jobs: 17 | ubuntu-tests: 18 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 19 | runs-on: ubuntu-latest 20 | strategy: 21 | fail-fast: true 22 | matrix: 23 | python-version: ["3.8", "3.10", "3.11", "3.12"] # "3.9" is tested in CI 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 
| - name: install deps for Ubuntu 34 | run: sudo apt-get install build-essential libcairo2 libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info 35 | 36 | - name: install tox 37 | run: | 38 | python3 -m pip install --upgrade pip 39 | pip3 install tox==3.28.0 40 | pip3 install tox-gh-actions==2.12.0 41 | 42 | - name: test with tox 43 | run: | 44 | tox 45 | -------------------------------------------------------------------------------- /.github/workflows/tests_windows.yml: -------------------------------------------------------------------------------- 1 | name: tests_windows 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["tests_ubuntu"] 13 | branches: [master] 14 | types: 15 | - completed 16 | 17 | jobs: 18 | windows-tests: 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | runs-on: windows-latest 21 | strategy: 22 | fail-fast: true 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: setup-msys2 35 | uses: msys2/setup-msys2@v2 36 | with: 37 | msystem: MINGW64 38 | update: true 39 | install: >- 40 | mingw-w64-x86_64-cairo 41 | 42 | - name: install deps for Windows 43 | run: pip3 install pycairo 44 | 45 | - name: install tox 46 | run: | 47 | python3 -m pip install --upgrade pip 48 | pip3 install tox==3.28.0 49 | pip3 install tox-gh-actions==2.12.0 50 | 51 | - name: test with tox 52 | run: | 53 | tox 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | *.csv 3 | *.png 4 | *.pickle 5 | *.html 
6 | *.ipynb 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # DS_store 17 | .DS_Store 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | tabularAutoML_model_report/ 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 109 | __pypackages__/ 110 | 111 | # Celery stuff 112 | celerybeat-schedule 113 | celerybeat.pid 114 | 115 | # SageMath parsed files 116 | *.sage.py 117 | 118 | # Environments 119 | .env 120 | .venv 121 | env/ 122 | venv/ 123 | ENV/ 124 | env.bak/ 125 | venv.bak/ 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # VSCode 135 | .vscode 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | 148 | # pytype static type analyzer 149 | .pytype/ 150 | 151 | # Cython debug symbols 152 | cython_debug/ 153 | 154 | # VSCode 155 | .vscode/ 156 | 157 | .idea/ 158 | lama_venv/ 159 | *.db 160 | 161 | temp/ 162 | 163 | poetry.lock 164 | -------------------------------------------------------------------------------- /.gitlab/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | workflow: 2 | rules: 3 | - if: $CI_PIPELINE_SOURCE == "push" 4 | when: always 5 | - when: never 6 | 7 | .job_template: &ssh_key_configuration 8 | before_script: 9 | - mkdir -p ~/.ssh 10 | - echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa 11 | - chmod 600 ~/.ssh/id_rsa 12 | - echo "Host *" >> ~/.ssh/config 13 | - echo " StrictHostKeyChecking no" >> ~/.ssh/config 14 | - echo " CheckHostIP no" >> ~/.ssh/config 15 | - echo " LogLevel ERROR" >> ~/.ssh/config 16 | - echo " UserKnownHostsFile /dev/null" >> ~/.ssh/config 17 | 18 | cache: 19 | key: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" 20 | paths: 21 | - ~/.cache/pip/ 22 | 23 | stages: 24 | - mirror 25 | 
- default 26 | - all_pythons 27 | - docs 28 | 29 | 30 | default: 31 | stage: default 32 | image: python:3.10 33 | before_script: 34 | - pip install tox 35 | script: 36 | - tox -e lint 37 | - tox -e py310 38 | 39 | 40 | all_pythons: 41 | stage: all_pythons 42 | image: python:$PYTHON_VERSION 43 | before_script: 44 | - pip install tox 45 | script: 46 | - tox -e py${PYTHON_VERSION//./} 47 | parallel: 48 | matrix: 49 | - PYTHON_VERSION: ["3.8", "3.9", "3.11", "3.12"] 50 | 51 | docs: 52 | stage: docs 53 | image: python:3.10 54 | before_script: 55 | - pip install tox 56 | script: 57 | - tox -e codespell 58 | 59 | 60 | mirror-code: 61 | <<: *ssh_key_configuration 62 | stage: mirror 63 | script: | 64 | if [[ "${CI_COMMIT_REF_NAME}" == experiment* ]] || [[ "${CI_COMMIT_REF_NAME}" == AUTOML-* ]] 65 | then 66 | : 67 | else 68 | git branch -f ${CI_COMMIT_REF_NAME} 69 | git config remote.github.url >&- || git remote add github "$DESTINATION_REPO" 70 | git push -u github "${CI_COMMIT_REF_NAME}" 71 | fi 72 | -------------------------------------------------------------------------------- /.gitlab/release.yml: -------------------------------------------------------------------------------- 1 | workflow: 2 | rules: 3 | - if: $CI_PIPELINE_SOURCE == "push" 4 | 5 | .job_template: &ssh_key_configuration 6 | before_script: 7 | # TODO: add clearml config 8 | - mkdir -p ~/.ssh 9 | - echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa 10 | - chmod 600 ~/.ssh/id_rsa 11 | - echo "Host *" >> ~/.ssh/config 12 | - echo " StrictHostKeyChecking no" >> ~/.ssh/config 13 | - echo " CheckHostIP no" >> ~/.ssh/config 14 | - echo " LogLevel ERROR" >> ~/.ssh/config 15 | - echo " UserKnownHostsFile /dev/null" >> ~/.ssh/config 16 | 17 | stages: 18 | - run_benchmark 19 | 20 | mirror-code: 21 | <<: *ssh_key_configuration 22 | stage: run_benchmark 23 | script: | 24 | sh ./experiments/run_bench_release.sh Releases 25 | -------------------------------------------------------------------------------- 
/.jupyter/jupyter_notebook_config.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # Configuration file for jupyter-notebook. 3 | 4 | # timeout of each cell 5 | c.ExecutePreprocessor.timeout = 60 * 15 6 | 7 | # Path to kernel 8 | c.ExecutePreprocessor.kernel_name = "python3" 9 | 10 | # Remove metadata 11 | c.ClearMetadataPreprocessor.enabled = True 12 | c.ClearMetadataPreprocessor.clear_cell_metadata = True 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_stages: 2 | - commit 3 | 4 | repos: 5 | - repo: https://github.com/psf/black 6 | rev: 20.8b1 7 | hooks: 8 | - id: black 9 | args: ["--config=pyproject.toml"] 10 | additional_dependencies: ["click==8.0.4"] 11 | 12 | # - repo: https://github.com/PyCQA/isort 13 | # rev: 5.12.0 14 | # hooks: 15 | # - id: isort 16 | # args: ["--settings-path pyproject.toml"] 17 | 18 | - repo: https://github.com/pre-commit/pre-commit-hooks 19 | rev: v3.4.0 20 | hooks: 21 | - id: trailing-whitespace 22 | - id: end-of-file-fixer 23 | - id: debug-statements 24 | - id: check-yaml 25 | 26 | - repo: https://github.com/PyCQA/flake8 27 | rev: 6.1.0 28 | hooks: 29 | - id: flake8 30 | additional_dependencies: [flake8-docstrings] 31 | 32 | - repo: https://github.com/myint/rstcheck 33 | rev: 3f92957478422df87bd730abde66f089cc1ee19b 34 | hooks: 35 | - id: rstcheck 36 | 37 | - repo: local 38 | hooks: 39 | - id: set-py-versions 40 | name: set python versions 41 | description: set python versions := [3.8, 3.13) to `pyproject.toml` 42 | language: python 43 | entry: python scripts/poetry_fix.py -f 44 | pass_filenames: false 45 | 46 | # - repo: https://github.com/python-jsonschema/check-jsonschema 47 | # rev: 0.18.2 48 | # hooks: 49 | # - id: check-github-workflows 50 | 51 | - repo: local 52 | hooks: 53 | - id: exp-branch-push 54 | name: 
experiment branch push 55 | description: prevent pushing 'experiment/*' branches to LAMA github 56 | stages: [push] 57 | language: python 58 | entry: python ./scripts/exp_branch_push.py 59 | pass_filenames: false 60 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.10" 13 | jobs: 14 | pre_create_environment: 15 | - asdf plugin add poetry 16 | - asdf install poetry 1.8.0 17 | - asdf global poetry 1.8.0 18 | - poetry config virtualenvs.create false 19 | - poetry run python scripts/poetry_fix.py -c 20 | post_install: 21 | - poetry install -E all 22 | 23 | # Build documentation in the docs/ directory with Sphinx 24 | sphinx: 25 | configuration: docs/conf.py 26 | 27 | # Optionally build your docs in additional formats such as PDF 28 | formats: all 29 | 30 | # Optionally set the version of Python and requirements required to build your docs 31 | python: 32 | install: 33 | - requirements: docs/requirements.txt 34 | - path: . 
35 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect 2 | -------------------------------------------------------------------------------- /check_docs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | 5 | logging.basicConfig(format="[%(asctime)s] (%(levelname)s): %(message)s", level=logging.DEBUG) 6 | 7 | logging.debug("Check that all .rst files compile to .html.") 8 | 9 | DOCS_PATH = os.path.join(os.path.dirname(__file__), "docs") 10 | RSTS_PATH = os.path.join(DOCS_PATH, "generated") 11 | HTML_PATH = os.path.join(DOCS_PATH, os.path.join("_build", "html", "generated")) 12 | 13 | if not os.path.exists(RSTS_PATH): 14 | os.makedirs(RSTS_PATH) 15 | if not os.path.exists(HTML_PATH): 16 | os.makedirs(HTML_PATH) 17 | 18 | html_filenames = [os.path.splitext(name)[0] + ".html" for name in os.listdir(RSTS_PATH) if ".rst" in name] 19 | html_filenames = sorted(html_filenames) 20 | logging.debug(".rst filenames: {}".format(html_filenames)) 21 | 22 | for fname in html_filenames: 23 | fpath = os.path.join(HTML_PATH, fname) 24 | logging.debug("Check {}".format(fname)) 25 | assert os.path.exists(fpath), "File {} doesn`t exist.".format(fpath) 26 | 27 | logging.debug("All files exists.") 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | clean: 23 | sphinx-build -M clean "$(SOURCEDIR)" "$(BUILDDIR)" 24 | sphinx-build -M clean "$(SOURCEDIR)" "imgs" 25 | sphinx-build -M clean "$(SOURCEDIR)" "pages/modules/generated/" 26 | -------------------------------------------------------------------------------- /docs/_static/style.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: none; 3 | } 4 | 5 | .rst-content code.xref { 6 | /* !important prevents the common CSS stylesheets from overriding 7 | this as on RTD they are loaded after this stylesheet */ 8 | color: #E74C3C 9 | } 10 | 11 | html.writer-html4 .rst-content dl:not(.docutils) dl:not(.field-list)>dt, html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt { 12 | border-left-color: rgb(9, 183, 14) 13 | } 14 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | 12 | .. 13 | autogenerated from source/_templates/autosummary/class.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | .. 
role:: hidden 2 | :class: hidden-section 3 | 4 | {{ name | underline }} 5 | 6 | .. automodule:: {{ fullname }} 7 | 8 | {% block classes %} 9 | {% if classes %} 10 | .. rubric:: {{ _('Classes') }} 11 | 12 | .. autosummary:: 13 | :toctree: generated 14 | :nosignatures: 15 | :template: classtemplate.rst 16 | {% for item in classes %} 17 | {{ item }} 18 | {%- endfor %} 19 | {% endif %} 20 | {% endblock %} 21 | 22 | {% block functions %} 23 | {% if functions %} 24 | .. rubric:: {{ _('Functions') }} 25 | 26 | .. autosummary:: 27 | :toctree: generated 28 | :nosignatures: 29 | :template: functiontemplate.rst 30 | {% for item in functions %} 31 | {{ item }} 32 | {%- endfor %} 33 | {% endif %} 34 | {% endblock %} 35 | 36 | 37 | {% block modules %} 38 | {% if modules %} 39 | .. rubric:: {{ _('Modules') }} 40 | 41 | .. autosummary:: 42 | :toctree: 43 | :recursive: 44 | {% for item in modules %} 45 | {{ item }} 46 | {%- endfor %} 47 | {% endif %} 48 | {% endblock %} 49 | -------------------------------------------------------------------------------- /docs/_templates/classtemplate.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline }} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | 12 | .. 13 | autogenerated from source/_templates/classtemplate.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/_templates/functiontemplate.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | {{ name | underline }} 6 | 7 | .. autofunction:: {{ fullname }} 8 | 9 | .. 
10 | autogenerated from source/_templates/functiontemplate.rst 11 | note it does not have :inherited-members: 12 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | LightAutoML documentation 2 | ========================= 3 | 4 | `LightAutoML `_ is an open-source Python library aimed at automated machine learning. 5 | It is designed to be lightweight and efficient for various tasks with tabular and text data. 6 | LightAutoML provides easy-to-use pipeline creation that enables: 7 | 8 | - Automatic hyperparameter tuning, data processing. 9 | - Automatic typing, feature selection. 10 | - Automatic time utilization. 11 | - Automatic report creation. 12 | - Easy-to-use modular scheme to create your own pipelines. 13 | 14 | 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Contents 19 | 20 | Installation Guide 21 | Tutorials 22 | Kaggle kernel examples of LightAutoML usage 23 | Courses, videos and papers 24 | Python-API 25 | 26 | 27 | 28 | Indices and Tables 29 | ================== 30 | 31 | * :ref:`genindex` 32 | -------------------------------------------------------------------------------- /docs/mock_docs.py: -------------------------------------------------------------------------------- 1 | """A one line summary of the module or program, terminated by a period. 2 | 3 | Leave one blank line. The rest of this docstring should contain an 4 | overall description of the module or program. Optionally, it may also 5 | contain a brief description of exported classes and functions and/or usage 6 | examples. 7 | 8 | Typical usage example: 9 | 10 | >>> print('something') 11 | something 12 | >>> a = MyClass('be', 'or', 'not') 13 | 14 | """ 15 | 16 | import datetime 17 | 18 | 19 | class MyClass: 20 | """Description of class. 21 | 22 | Really do nothing. 23 | 24 | Attributes: 25 | attr1 (str): Description of `attr1`. 
26 | attr2 (str): Description of `attr2`. 27 | 28 | Args: 29 | attr1: Description of `attr1`. 30 | attr2: Description of `attr2`. 31 | 32 | 33 | """ 34 | 35 | def __init__(self, attr1: str, attr2: str): 36 | self.attr1 = attr1 37 | self.attr2 = attr2 38 | date = datetime.datetime.now() 39 | print("{}.{}.{} {}:{}:{}".format(date.day, date.month, date.year, date.hour, date.minute, date.second)) 40 | 41 | 42 | # .. toctree:: 43 | # :glob: 44 | # :maxdepth: 1 45 | # :caption: Tutorials 46 | # 47 | # tutorials/tutor_1.ipynb 48 | # tutorials/tutor_2.ipynb 49 | # tutorials/tutor_3.ipynb 50 | -------------------------------------------------------------------------------- /docs/pages/Installation.rst: -------------------------------------------------------------------------------- 1 | Installation Guide 2 | ================== 3 | 4 | 5 | Basic 6 | ----- 7 | 8 | You can install the `LightAutoML` library from PyPI. 9 | 10 | .. code-block:: bash 11 | 12 | pip install lightautoml 13 | 14 | 15 | Development 16 | ----------- 17 | 18 | You can also clone the repository and install it with poetry. 19 | First, install `poetry `_. 20 | Then, 21 | 22 | .. 
code-block:: bash 23 | 24 | git clone git@github.com:AILab-MLTools/LightAutoML.git 25 | cd LightAutoML 26 | 27 | # Create virtual environment inside your project directory 28 | poetry config virtualenvs.in-project true 29 | 30 | # If you want to update dependencies, run the command: 31 | poetry lock 32 | 33 | # Installation 34 | poetry install 35 | -------------------------------------------------------------------------------- /docs/pages/Kaggle_Kernels.rst: -------------------------------------------------------------------------------- 1 | Kaggle Kernels 2 | ============== 3 | 4 | * `Tabular Playground Series April 2021 competition solution `_ 5 | * `Titanic competition solution (80% accuracy) `_ 6 | * `Titanic **12-code-lines** competition solution (78% accuracy) `_ 7 | * `House prices competition solution `_ 8 | * `Natural Language Processing with Disaster Tweets solution `_ 9 | * `Tabular Playground Series March 2021 competition solution `_ 10 | * `Tabular Playground Series February 2021 competition solution `_ 11 | * `Interpretable WhiteBox solution `_ 12 | * `Custom ML pipeline elements inside existing ones `_ 13 | * `Tabular Playground Series November 2022 competition solution with Neural Networks `_ 14 | -------------------------------------------------------------------------------- /docs/pages/Others.rst: -------------------------------------------------------------------------------- 1 | Others 2 | ====== 3 | 4 | 5 | LightAutoML crash courses 6 | ------------------------- 7 | 8 | `(Russian) AutoML course for OpenDataScience community `_ 9 | 10 | 11 | Video guides 12 | ------------ 13 | 14 | * (Russian) `LightAutoML webinar for Sberloga community `_ (`Alexander Ryzhkov `__, `Dmitry Simakov `__) 15 | * (Russian) `LightAutoML hands-on tutorial in Kaggle Kernels `_ (`Alexander Ryzhkov `__) 16 | * (English) `Automated Machine Learning with LightAutoML: theory and practice `_ (`Alexander Ryzhkov `__) 17 | * (English) `LightAutoML framework general overview, 
benchmarks and advantages for business `_ (`Alexander Ryzhkov `__) 18 | * (English) `LightAutoML practical guide - ML pipeline presets overview `_ (`Dmitry Simakov `__) 19 | 20 | 21 | Papers 22 | ------ 23 | 24 | Anton Vakhrushev, Alexander Ryzhkov, Dmitry Simakov, Rinchin Damdinov, Maxim Savchenko, Alexander Tuzhilin `"LightAutoML: AutoML Solution for a Large Financial Services Ecosystem" `_. arXiv:2109.01528, 2021. 25 | 26 | 27 | Articles about LightAutoML 28 | -------------------------- 29 | 30 | * (English) `LightAutoML vs Titanic: 80% accuracy in several lines of code (Medium) `_ 31 | * (English) `Hands-On Python Guide to LightAutoML – An Automatic ML Model Creation Framework (Analytic Indian Mag) `_ 32 | -------------------------------------------------------------------------------- /docs/pages/Python-API.rst: -------------------------------------------------------------------------------- 1 | Python-API 2 | ========== 3 | 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | :caption: Main modules 8 | 9 | modules/automl 10 | modules/addons 11 | modules/dataset 12 | modules/image 13 | modules/ml_algo 14 | modules/ml_algo.tuning 15 | modules/ml_algo.torch_based 16 | modules/pipelines 17 | modules/pipelines.selection 18 | modules/pipelines.features 19 | modules/pipelines.ml 20 | modules/reader 21 | modules/report 22 | modules/tasks 23 | modules/tasks.losses 24 | modules/text 25 | modules/transformers 26 | modules/utils 27 | modules/validation 28 | -------------------------------------------------------------------------------- /docs/pages/Tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | This section contains tutorials for both **LightAutoML** and **HypEx**, covering a wide range of use cases from basic model training to advanced hypothesis testing. 5 | 6 | LightAutoML Tutorials 7 | --------------------- 8 | 9 | 10 | .. 
toctree:: 11 | :maxdepth: 1 12 | :caption: Core Features 13 | 14 | tutorials/Tutorial_1_basics.nblink 15 | tutorials/Tutorial_2_WhiteBox_AutoWoE.nblink 16 | tutorials/Tutorial_3_sql_data_source.nblink 17 | tutorials/Tutorial_4_NLP_Interpretation.nblink 18 | 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Advanced Topics 23 | 24 | tutorials/Tutorial_5_uplift.nblink 25 | tutorials/Tutorial_6_custom_pipeline.nblink 26 | tutorials/Tutorial_7_ICE_and_PDP_interpretation.nblink 27 | tutorials/Tutorial_8_CV_preset.nblink 28 | tutorials/Tutorial_9_neural_networks.nblink 29 | tutorials/Tutorial_10_relational_data_with_star_scheme.nblink 30 | tutorials/Tutorial_11_time_series.nblink 31 | 32 | 33 | HypEx Tutorials 34 | --------------- 35 | 36 | 37 | .. toctree:: 38 | :maxdepth: 1 39 | :caption: A/B and A/A Testing 40 | 41 | tutorials/Tutorial_12_AA_Test.nblink 42 | tutorials/Tutorial_13_AA_Test_multigroup_split.nblink 43 | tutorials/Tutorial_14_AB_Test.nblink 44 | 45 | 46 | .. toctree:: 47 | :maxdepth: 1 48 | :caption: Matching 49 | 50 | tutorials/Tutorial_15_Matching.nblink 51 | tutorials/Tutorial_16_Matching_without_replacement.nblink 52 | 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Modeling and Testing Limits 57 | 58 | tutorials/Tutorial_17_Modeling_Limit_Distribution.nblink 59 | tutorials/Tutorial_18_Test_Limit_Distribution.nblink 60 | -------------------------------------------------------------------------------- /docs/pages/modules/addons.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.addons 5 | ================== 6 | 7 | Extensions of core functionality. 8 | 9 | Utilization 10 | ----------- 11 | 12 | .. currentmodule:: lightautoml.addons.utilization 13 | 14 | .. 
autosummary:: 15 | :toctree: ./generated 16 | :nosignatures: 17 | :template: classtemplate.rst 18 | 19 | ~utilization.TimeUtilization 20 | 21 | HypEx -- Hypotheses and Experiments 22 | ------------------------------------- 23 | 24 | The official HypEx documentation can be found at: 25 | 26 | `HypEx Documentation `_ 27 | 28 | For a detailed reference, visit the HypEx API documentation. 29 | -------------------------------------------------------------------------------- /docs/pages/modules/automl.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.automl 5 | ====================== 6 | 7 | The main module, which includes the AutoML class, blenders and ready-made presets. 8 | 9 | .. currentmodule:: lightautoml.automl.base 10 | 11 | .. autosummary:: 12 | :toctree: ./generated 13 | :nosignatures: 14 | :template: classtemplate.rst 15 | 16 | AutoML 17 | 18 | 19 | Presets 20 | ------- 21 | 22 | Presets for end-to-end model training for special tasks. 23 | 24 | .. currentmodule:: lightautoml.automl.presets 25 | 26 | .. autosummary:: 27 | :toctree: ./generated 28 | :nosignatures: 29 | :template: classtemplate.rst 30 | 31 | base.AutoMLPreset 32 | tabular_presets.TabularAutoML 33 | tabular_presets.TabularUtilizedAutoML 34 | .. image_presets.TabularCVAutoML 35 | text_presets.TabularNLPAutoML 36 | whitebox_presets.WhiteBoxPreset 37 | 38 | 39 | Blenders 40 | -------- 41 | 42 | .. currentmodule:: lightautoml.automl.blend 43 | 44 | .. autosummary:: 45 | :toctree: ./generated 46 | :nosignatures: 47 | :template: classtemplate.rst 48 | 49 | Blender 50 | BestModelSelector 51 | MeanBlender 52 | WeightedBlender 53 | -------------------------------------------------------------------------------- /docs/pages/modules/dataset.rst: -------------------------------------------------------------------------------- 1 | ..
role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.dataset 5 | =================== 6 | 7 | Provides an internal interface for working with data. 8 | 9 | Dataset Interfaces 10 | ------------------- 11 | 12 | .. currentmodule:: lightautoml.dataset 13 | 14 | .. autosummary:: 15 | :toctree: ./generated 16 | :nosignatures: 17 | :template: classtemplate.rst 18 | 19 | base.LAMLColumn 20 | base.LAMLDataset 21 | np_pd_dataset.NumpyDataset 22 | np_pd_dataset.PandasDataset 23 | np_pd_dataset.CSRSparseDataset 24 | 25 | Roles 26 | ----------- 27 | 28 | Role contains information about the column, which determines how it is processed. 29 | 30 | .. currentmodule:: lightautoml.dataset.roles 31 | 32 | .. autosummary:: 33 | :toctree: ./generated 34 | :nosignatures: 35 | :template: classtemplate.rst 36 | 37 | ColumnRole 38 | NumericRole 39 | CategoryRole 40 | TextRole 41 | DatetimeRole 42 | TargetRole 43 | GroupRole 44 | DropRole 45 | WeightsRole 46 | FoldsRole 47 | PathRole 48 | 49 | 50 | Utils 51 | ------------ 52 | 53 | Utilities for working with the structure of a dataset. 54 | 55 | .. currentmodule:: lightautoml.dataset.utils 56 | 57 | .. autosummary:: 58 | :toctree: ./generated 59 | :nosignatures: 60 | :template: functiontemplate.rst 61 | 62 | roles_parser 63 | get_common_concat 64 | numpy_and_pandas_concat 65 | concatenate 66 | -------------------------------------------------------------------------------- /docs/pages/modules/image.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.image 5 | ================= 6 | 7 | Provides an internal interface for working with image features. 8 | 9 | Image Feature Extractors 10 | ------------------------ 11 | 12 | Image feature extractors based on color histograms and CNN embeddings. 13 | 14 | .. currentmodule:: lightautoml.image.image 15 | 16 | .. 
autosummary:: 17 | :toctree: ./generated 18 | :nosignatures: 19 | :template: classtemplate.rst 20 | 21 | CreateImageFeatures 22 | TimmModelEmbedder 23 | 24 | 25 | PyTorch Image Datasets 26 | ------------------------ 27 | 28 | .. currentmodule:: lightautoml.image.image 29 | 30 | .. autosummary:: 31 | :toctree: ./generated 32 | :nosignatures: 33 | :template: classtemplate.rst 34 | 35 | ImageTimmDataset 36 | DeepTimmImageEmbedder 37 | 38 | 39 | Utils 40 | --------- 41 | 42 | .. currentmodule:: lightautoml.image.utils 43 | 44 | .. autosummary:: 45 | :toctree: ./generated 46 | :nosignatures: 47 | :template: functiontemplate.rst 48 | 49 | pil_loader 50 | -------------------------------------------------------------------------------- /docs/pages/modules/ml_algo.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.ml_algo 5 | =================== 6 | 7 | Models used for machine learning pipelines. 8 | 9 | Base Classes 10 | ------------------------ 11 | 12 | .. currentmodule:: lightautoml.ml_algo.base 13 | 14 | .. autosummary:: 15 | :toctree: ./generated 16 | :nosignatures: 17 | :template: classtemplate.rst 18 | 19 | MLAlgo 20 | TabularMLAlgo 21 | 22 | 23 | Linear Models 24 | ------------------------- 25 | 26 | .. currentmodule:: lightautoml.ml_algo 27 | 28 | .. autosummary:: 29 | :toctree: ./generated 30 | :nosignatures: 31 | :template: classtemplate.rst 32 | 33 | ~linear_sklearn.LinearLBFGS 34 | ~linear_sklearn.LinearL1CD 35 | ~dl_model.TorchModel 36 | 37 | Boosted Trees 38 | ------------------------- 39 | 40 | .. currentmodule:: lightautoml.ml_algo 41 | 42 | .. autosummary:: 43 | :toctree: ./generated 44 | :nosignatures: 45 | :template: classtemplate.rst 46 | 47 | ~boost_lgbm.BoostLGBM 48 | ~boost_cb.BoostCB 49 | 50 | 51 | Neural Networks 52 | ------------------------- 53 | 54 | .. currentmodule:: lightautoml.ml_algo.torch_based 55 | 56 | .. 
autosummary:: 57 | :toctree: ./generated 58 | :nosignatures: 59 | :template: classtemplate.rst 60 | 61 | ~nn_models.MLP 62 | ~nn_models.DenseLightModel 63 | ~nn_models.DenseModel 64 | ~nn_models.ResNetModel 65 | ~nn_models.SNN 66 | 67 | 68 | WhiteBox 69 | ------------------------- 70 | 71 | .. currentmodule:: lightautoml.ml_algo 72 | 73 | .. autosummary:: 74 | :toctree: ./generated 75 | :nosignatures: 76 | :template: classtemplate.rst 77 | 78 | ~whitebox.WbMLAlgo 79 | -------------------------------------------------------------------------------- /docs/pages/modules/ml_algo.torch_based.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.ml_algo 5 | =================== 6 | 7 | Torch utils. 8 | 9 | Pooling Strategies 10 | ------------------------------ 11 | 12 | .. currentmodule:: lightautoml.ml_algo.torch_based.nn_models 13 | 14 | .. autosummary:: 15 | :toctree: ./generated 16 | :nosignatures: 17 | :template: classtemplate.rst 18 | 19 | SequenceAbstractPooler 20 | SequenceClsPooler 21 | SequenceMaxPooler 22 | SequenceSumPooler 23 | SequenceAvgPooler 24 | SequenceIndentityPooler 25 | -------------------------------------------------------------------------------- /docs/pages/modules/ml_algo.tuning.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | lightautoml.ml_algo.tuning 5 | ========================== 6 | 7 | Bunch of classes for hyperparameters tuning. 8 | 9 | Base Classes 10 | ------------------------ 11 | 12 | .. currentmodule:: lightautoml.ml_algo.tuning.base 13 | 14 | .. autosummary:: 15 | :toctree: ./generated 16 | :nosignatures: 17 | :template: classtemplate.rst 18 | 19 | ParamsTuner 20 | DefaultTuner 21 | 22 | 23 | Tuning with Optuna 24 | ------------------ 25 | 26 | .. currentmodule:: lightautoml.ml_algo.tuning.optuna 27 | 28 | .. 
autosummary:: 29 | :toctree: ./generated 30 | :nosignatures: 31 | :template: classtemplate.rst 32 | 33 | OptunaTuner 34 | DLOptunaTuner 35 | -------------------------------------------------------------------------------- /docs/pages/modules/pipelines.features.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.pipelines.features 6 | ============================== 7 | 8 | Pipelines for features generation. 9 | 10 | Base Classes 11 | ----------------- 12 | 13 | .. currentmodule:: lightautoml.pipelines.features.base 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | FeaturesPipeline 21 | EmptyFeaturePipeline 22 | TabularDataFeatures 23 | 24 | 25 | 26 | Feature Pipelines for Boosting Models 27 | ----------------------------------------- 28 | 29 | .. currentmodule:: lightautoml.pipelines.features.lgb_pipeline 30 | 31 | .. autosummary:: 32 | :toctree: ./generated 33 | :nosignatures: 34 | :template: classtemplate.rst 35 | 36 | LGBSimpleFeatures 37 | LGBAdvancedPipeline 38 | 39 | 40 | Feature Pipelines for Linear Models 41 | ----------------------------------- 42 | 43 | .. currentmodule:: lightautoml.pipelines.features.linear_pipeline 44 | 45 | .. autosummary:: 46 | :toctree: ./generated 47 | :nosignatures: 48 | :template: classtemplate.rst 49 | 50 | LinearFeatures 51 | 52 | Feature Pipelines for WhiteBox 53 | ------------------------------ 54 | 55 | .. currentmodule:: lightautoml.pipelines.features.wb_pipeline 56 | 57 | .. autosummary:: 58 | :toctree: ./generated 59 | :nosignatures: 60 | :template: classtemplate.rst 61 | 62 | WBFeatures 63 | 64 | 65 | Image Feature Pipelines 66 | ---------------------------------- 67 | 68 | .. currentmodule:: lightautoml.pipelines.features.image_pipeline 69 | 70 | .. 
autosummary:: 71 | :toctree: ./generated 72 | :nosignatures: 73 | :template: classtemplate.rst 74 | 75 | ImageDataFeatures 76 | ImageSimpleFeatures 77 | ImageAutoFeatures 78 | 79 | 80 | Text Feature Pipelines 81 | ------------------------------ 82 | 83 | .. currentmodule:: lightautoml.pipelines.features.text_pipeline 84 | 85 | .. autosummary:: 86 | :toctree: ./generated 87 | :nosignatures: 88 | :template: classtemplate.rst 89 | 90 | NLPDataFeatures 91 | TextAutoFeatures 92 | NLPTFiDFFeatures 93 | TextBertFeatures 94 | 95 | 96 | Feature Pipelines for Neural Networks Models 97 | ------------------------------------------------------ 98 | 99 | .. currentmodule:: lightautoml.pipelines.features.torch_pipeline 100 | 101 | .. autosummary:: 102 | :toctree: ./generated 103 | :nosignatures: 104 | :template: classtemplate.rst 105 | 106 | TorchSimpleFeatures 107 | -------------------------------------------------------------------------------- /docs/pages/modules/pipelines.ml.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.pipelines.ml 6 | ============================== 7 | 8 | Pipelines that merge together single model training steps. 9 | 10 | Base Classes 11 | ----------------- 12 | 13 | .. currentmodule:: lightautoml.pipelines.ml.base 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | MLPipeline 21 | 22 | 23 | Pipeline for Nested Cross-Validation 24 | ------------------------------------ 25 | 26 | .. currentmodule:: lightautoml.pipelines.ml.nested_ml_pipe 27 | 28 | .. autosummary:: 29 | :toctree: ./generated 30 | :nosignatures: 31 | :template: classtemplate.rst 32 | 33 | NestedTabularMLAlgo 34 | NestedTabularMLPipeline 35 | 36 | Pipeline for WhiteBox 37 | --------------------- 38 | 39 | .. currentmodule:: lightautoml.pipelines.ml.whitebox_ml_pipe 40 | 41 | .. 
autosummary:: 42 | :toctree: ./generated 43 | :nosignatures: 44 | :template: classtemplate.rst 45 | 46 | WBPipeline 47 | -------------------------------------------------------------------------------- /docs/pages/modules/pipelines.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.pipelines 6 | ===================== 7 | 8 | Pipelines for solving different tasks. 9 | 10 | Utils 11 | ------- 12 | 13 | .. currentmodule:: lightautoml.pipelines.utils 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: functiontemplate.rst 19 | 20 | map_pipeline_names 21 | get_columns_by_role 22 | -------------------------------------------------------------------------------- /docs/pages/modules/pipelines.selection.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.pipelines.selection 6 | =============================== 7 | 8 | Feature selection module for ML pipelines. 9 | 10 | Base Classes 11 | ----------------- 12 | 13 | .. currentmodule:: lightautoml.pipelines.selection.base 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | ImportanceEstimator 21 | SelectionPipeline 22 | 23 | Importance Based Selectors 24 | -------------------------- 25 | 26 | .. currentmodule:: lightautoml.pipelines.selection 27 | 28 | .. autosummary:: 29 | :toctree: ./generated 30 | :nosignatures: 31 | :template: classtemplate.rst 32 | 33 | ~importance_based.ModelBasedImportanceEstimator 34 | ~importance_based.ImportanceCutoffSelector 35 | ~permutation_importance_based.NpPermutationImportanceEstimator 36 | ~permutation_importance_based.NpIterativeFeatureSelector 37 | 38 | Other Selectors 39 | ---------------------- 40 | 41 | .. currentmodule:: lightautoml.pipelines.selection 42 | 43 | .. 
autosummary:: 44 | :toctree: ./generated 45 | :nosignatures: 46 | :template: classtemplate.rst 47 | 48 | ~linear_selector.HighCorrRemoval 49 | -------------------------------------------------------------------------------- /docs/pages/modules/reader.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.reader 6 | ===================== 7 | 8 | Utils for reading, training and analysing data. 9 | 10 | Readers 11 | ------------- 12 | 13 | .. currentmodule:: lightautoml.reader.base 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | Reader 21 | PandasToPandasReader 22 | 23 | 24 | Tabular Batch Generators 25 | ----------------------------- 26 | 27 | Batch Handler Classes 28 | ^^^^^^^^^^^^^^^^^^^^^ 29 | 30 | .. currentmodule:: lightautoml.reader.tabular_batch_generator 31 | 32 | .. autosummary:: 33 | :toctree: ./generated 34 | :nosignatures: 35 | :template: classtemplate.rst 36 | 37 | Batch 38 | FileBatch 39 | BatchGenerator 40 | DfBatchGenerator 41 | FileBatchGenerator 42 | 43 | Data Read Functions 44 | ^^^^^^^^^^^^^^^^^^^ 45 | 46 | .. currentmodule:: lightautoml.reader.tabular_batch_generator 47 | 48 | .. autosummary:: 49 | :toctree: ./generated 50 | :nosignatures: 51 | :template: functiontemplate.rst 52 | 53 | read_batch 54 | read_data 55 | -------------------------------------------------------------------------------- /docs/pages/modules/report.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.report 6 | ================== 7 | 8 | Report generators and templates. 9 | 10 | .. currentmodule:: lightautoml.report.report_deco 11 | 12 | .. 
autosummary:: 13 | :toctree: ./generated 14 | :nosignatures: 15 | :template: classtemplate.rst 16 | 17 | ReportDeco 18 | ReportDecoWhitebox 19 | -------------------------------------------------------------------------------- /docs/pages/modules/tasks.losses.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.tasks.losses 6 | ============================== 7 | 8 | Wrappers of loss and metric functions for different machine learning algorithms. 9 | 10 | Base Classes 11 | ------------ 12 | 13 | .. currentmodule:: lightautoml.tasks.losses.base 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | MetricFunc 21 | Loss 22 | 23 | 24 | Wrappers for LightGBM 25 | --------------------- 26 | 27 | Classes 28 | ^^^^^^^ 29 | 30 | .. currentmodule:: lightautoml.tasks.losses 31 | 32 | .. autosummary:: 33 | :toctree: ./generated 34 | :nosignatures: 35 | :template: classtemplate.rst 36 | 37 | ~lgb.LGBFunc 38 | ~lgb.LGBLoss 39 | 40 | Functions 41 | ^^^^^^^^^ 42 | 43 | .. currentmodule:: lightautoml.tasks.losses 44 | 45 | .. autosummary:: 46 | :toctree: ./generated 47 | :nosignatures: 48 | :template: functiontemplate.rst 49 | 50 | ~lgb_custom.softmax_ax1 51 | ~lgb_custom.lgb_f1_loss_multiclass 52 | 53 | 54 | 55 | Wrappers for CatBoost 56 | --------------------- 57 | 58 | Classes 59 | ^^^^^^^ 60 | 61 | .. currentmodule:: lightautoml.tasks.losses 62 | 63 | .. autosummary:: 64 | :toctree: ./generated 65 | :nosignatures: 66 | :template: classtemplate.rst 67 | 68 | ~cb.CBLoss 69 | ~cb_custom.CBCustomMetric 70 | ~cb_custom.CBRegressionMetric 71 | ~cb_custom.CBClassificationMetric 72 | ~cb_custom.CBMulticlassMetric 73 | 74 | 75 | Functions 76 | ^^^^^^^^^ 77 | 78 | .. currentmodule:: lightautoml.tasks.losses 79 | 80 | .. 
autosummary:: 81 | :toctree: ./generated 82 | :nosignatures: 83 | :template: functiontemplate.rst 84 | 85 | ~cb.cb_str_loss_wrapper 86 | 87 | 88 | Wrappers for Sklearn 89 | --------------------- 90 | 91 | Classes 92 | ^^^^^^^ 93 | 94 | .. currentmodule:: lightautoml.tasks.losses 95 | 96 | .. autosummary:: 97 | :toctree: ./generated 98 | :nosignatures: 99 | :template: classtemplate.rst 100 | 101 | ~sklearn.SKLoss 102 | 103 | 104 | Wrappers for Torch 105 | --------------------- 106 | 107 | Classes 108 | ^^^^^^^ 109 | 110 | .. currentmodule:: lightautoml.tasks.losses 111 | 112 | .. autosummary:: 113 | :toctree: ./generated 114 | :nosignatures: 115 | :template: classtemplate.rst 116 | 117 | ~torch.TorchLossWrapper 118 | ~torch.TORCHLoss 119 | 120 | 121 | Functions 122 | ^^^^^^^^^ 123 | 124 | .. currentmodule:: lightautoml.tasks.losses 125 | 126 | .. autosummary:: 127 | :toctree: ./generated 128 | :nosignatures: 129 | :template: functiontemplate.rst 130 | 131 | ~torch.torch_rmsle 132 | ~torch.torch_quantile 133 | ~torch.torch_fair 134 | ~torch.torch_huber 135 | ~torch.torch_f1 136 | ~torch.torch_mape 137 | -------------------------------------------------------------------------------- /docs/pages/modules/tasks.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.tasks 6 | ============================== 7 | 8 | 9 | Task Class 10 | ---------- 11 | 12 | .. currentmodule:: lightautoml.tasks.base 13 | 14 | .. autosummary:: 15 | :toctree: ./generated 16 | :nosignatures: 17 | :template: classtemplate.rst 18 | 19 | Task 20 | 21 | 22 | 23 | 24 | Common Metrics 25 | ----------------------- 26 | 27 | Classes 28 | ^^^^^^^^^^^ 29 | 30 | .. currentmodule:: lightautoml.tasks.common_metric 31 | 32 | .. 
autosummary:: 33 | :toctree: ./generated 34 | :nosignatures: 35 | :template: classtemplate.rst 36 | 37 | F1Factory 38 | BestClassBinaryWrapper 39 | BestClassMulticlassWrapper 40 | 41 | 42 | Functions 43 | ^^^^^^^^^^ 44 | 45 | .. currentmodule:: lightautoml.tasks.common_metric 46 | 47 | .. autosummary:: 48 | :toctree: ./generated 49 | :nosignatures: 50 | :template: functiontemplate.rst 51 | 52 | mean_quantile_error 53 | mean_huber_error 54 | mean_fair_error 55 | mean_absolute_percentage_error 56 | roc_auc_ovr 57 | rmsle 58 | auc_mu 59 | -------------------------------------------------------------------------------- /docs/pages/modules/text.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.text 6 | ============================== 7 | 8 | Provides an internal interface for working with text features. 9 | 10 | Sentence Embedders 11 | ------------------------------ 12 | 13 | .. currentmodule:: lightautoml.text 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | ~dl_transformers.DLTransformer 21 | ~dl_transformers.BOREP 22 | ~dl_transformers.RandomLSTM 23 | ~dl_transformers.BertEmbedder 24 | ~weighted_average_transformer.WeightedAverageTransformer 25 | 26 | 27 | Torch Datasets for Text 28 | ------------------------------ 29 | 30 | .. currentmodule:: lightautoml.text 31 | 32 | .. autosummary:: 33 | :toctree: ./generated 34 | :nosignatures: 35 | :template: classtemplate.rst 36 | 37 | ~embed_dataset.BertDataset 38 | ~embed_dataset.EmbedDataset 39 | 40 | 41 | Tokenizers 42 | ------------------------------ 43 | 44 | .. currentmodule:: lightautoml.text 45 | 46 | .. 
autosummary:: 47 | :toctree: ./generated 48 | :nosignatures: 49 | :template: classtemplate.rst 50 | 51 | ~tokenizer.BaseTokenizer 52 | ~tokenizer.SimpleRuTokenizer 53 | ~tokenizer.SimpleEnTokenizer 54 | 55 | 56 | Utils 57 | ------------------------------ 58 | 59 | .. currentmodule:: lightautoml.text 60 | 61 | .. autosummary:: 62 | :toctree: ./generated 63 | :nosignatures: 64 | :template: functiontemplate.rst 65 | 66 | ~utils.seed_everything 67 | ~utils.parse_devices 68 | ~utils.custom_collate 69 | ~utils.single_text_hash 70 | ~utils.get_textarr_hash 71 | -------------------------------------------------------------------------------- /docs/pages/modules/transformers.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.transformers 6 | ============================== 7 | 8 | Basic feature generation steps and helper utils. 9 | 10 | Base Classes 11 | ------------------------------ 12 | 13 | .. currentmodule:: lightautoml.transformers.base 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | LAMLTransformer 21 | SequentialTransformer 22 | UnionTransformer 23 | ColumnsSelector 24 | ColumnwiseUnion 25 | BestOfTransformers 26 | ConvertDataset 27 | ChangeRoles 28 | 29 | 30 | Numeric 31 | ------------------------------ 32 | 33 | .. currentmodule:: lightautoml.transformers.numeric 34 | 35 | .. autosummary:: 36 | :toctree: ./generated 37 | :nosignatures: 38 | :template: classtemplate.rst 39 | 40 | NaNFlags 41 | FillnaMedian 42 | FillnaMean 43 | FillInf 44 | LogOdds 45 | StandardScaler 46 | QuantileBinning 47 | QuantileTransformer 48 | 49 | 50 | Categorical 51 | ------------------------------ 52 | 53 | .. currentmodule:: lightautoml.transformers.categorical 54 | 55 | .. 
autosummary:: 56 | :toctree: ./generated 57 | :nosignatures: 58 | :template: classtemplate.rst 59 | 60 | LabelEncoder 61 | OHEEncoder 62 | FreqEncoder 63 | OrdinalEncoder 64 | TargetEncoder 65 | MultiClassTargetEncoder 66 | CatIntersectstions 67 | 68 | 69 | Datetime 70 | ------------------------------ 71 | 72 | .. currentmodule:: lightautoml.transformers.datetime 73 | 74 | .. autosummary:: 75 | :toctree: ./generated 76 | :nosignatures: 77 | :template: classtemplate.rst 78 | 79 | TimeToNum 80 | BaseDiff 81 | DateSeasons 82 | 83 | 84 | Decompositions 85 | ------------------------------ 86 | 87 | .. currentmodule:: lightautoml.transformers.decomposition 88 | 89 | .. autosummary:: 90 | :toctree: ./generated 91 | :nosignatures: 92 | :template: classtemplate.rst 93 | 94 | PCATransformer 95 | SVDTransformer 96 | 97 | 98 | Text 99 | ------------------------------ 100 | 101 | .. currentmodule:: lightautoml.transformers.text 102 | 103 | .. autosummary:: 104 | :toctree: ./generated 105 | :nosignatures: 106 | :template: classtemplate.rst 107 | 108 | TunableTransformer 109 | TfidfTextTransformer 110 | TokenizerTransformer 111 | OneToOneTransformer 112 | ConcatTextTransformer 113 | AutoNLPWrap 114 | 115 | 116 | Image 117 | ------------------------------ 118 | 119 | .. currentmodule:: lightautoml.transformers.image 120 | 121 | .. autosummary:: 122 | :toctree: ./generated 123 | :nosignatures: 124 | :template: classtemplate.rst 125 | 126 | ImageFeaturesTransformer 127 | AutoCVWrap 128 | -------------------------------------------------------------------------------- /docs/pages/modules/utils.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.utils 6 | ============================== 7 | 8 | Common util tools. 9 | 10 | Timer 11 | ------------------------------ 12 | 13 | 14 | .. currentmodule:: lightautoml.utils.timer 15 | 16 | .. 
autosummary:: 17 | :toctree: ./generated 18 | :nosignatures: 19 | :template: classtemplate.rst 20 | 21 | Timer 22 | PipelineTimer 23 | TaskTimer 24 | -------------------------------------------------------------------------------- /docs/pages/modules/validation.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | lightautoml.validation 6 | ============================== 7 | 8 | The module provide classes and functions for model validation. 9 | 10 | Iterators 11 | ------------------------------ 12 | 13 | .. currentmodule:: lightautoml.validation 14 | 15 | .. autosummary:: 16 | :toctree: ./generated 17 | :nosignatures: 18 | :template: classtemplate.rst 19 | 20 | ~base.TrainValidIterator 21 | ~base.DummyIterator 22 | ~base.HoldoutIterator 23 | ~base.CustomIterator 24 | ~np_iterators.FoldsIterator 25 | ~np_iterators.TimeSeriesIterator 26 | 27 | 28 | Iterators Getters and Utils 29 | ------------------------------ 30 | 31 | 32 | .. currentmodule:: lightautoml.validation 33 | 34 | .. 
autosummary:: 35 | :toctree: ./generated 36 | :nosignatures: 37 | :template: functiontemplate.rst 38 | 39 | ~utils.create_validation_iterator 40 | ~np_iterators.get_numpy_iterator 41 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_10_relational_data_with_star_scheme.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_10_relational_data_with_star_scheme.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_11_time_series.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_11_time_series.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_12_AA_Test.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_12_AA_Test.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_13_AA_Test_multigroup_split.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_13_AA_Test_multigroup_split.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_14_AB_Test.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_14_AB_Test.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | 
-------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_15_Matching.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_15_Matching.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_16_Matching_without_replacement.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_16_Matching_without_replacement.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_17_Modeling_Limit_Distribution.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_17_Modeling_Limit_Distribution.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_18_Test_Limit_Distribution.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_18_Test_Limit_Distribution.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_1_basics.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_1_basics.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_2_WhiteBox_AutoWoE.nblink: 
-------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_2_WhiteBox_AutoWoE.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_3_sql_data_source.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_3_sql_data_source.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_4_NLP_Interpretation.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_4_NLP_Interpretation.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_5_uplift.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_5_uplift.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_6_custom_pipeline.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_6_custom_pipeline.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_7_ICE_and_PDP_interpretation.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_7_ICE_and_PDP_interpretation.ipynb", 3 | "extra-media": [ 4 | 
"../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_8_CV_preset.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_8_CV_preset.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/pages/tutorials/Tutorial_9_neural_networks.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../../examples/tutorials/Tutorial_9_neural_networks.ipynb", 3 | "extra-media": [ 4 | "../../../imgs" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | ipykernel 2 | nbsphinx 3 | nbsphinx-link 4 | sphinx-autodoc-typehints 5 | pandoc 6 | jupyter 7 | prompt-toolkit<3.0.0,!=3.0.1,>=2.0.0 8 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | ## Competitions 3 | | Place         | Competition   | Description | Solution | 4 | | ------ |:------------- | --------- | --------- | 5 | | 1st | [2024 AutoML Grand Prix](https://www.kaggle.com/automl-grand-prix) | Team "LightAutoML testers" | [1 stage](https://www.kaggle.com/competitions/playground-series-s4e5/discussion/500700), [3 stage](https://www.kaggle.com/competitions/playground-series-s4e7/discussion/516860), [4 stage](https://www.kaggle.com/competitions/playground-series-s4e8/discussion/523732), [5 stage](https://www.kaggle.com/competitions/playground-series-s4e9/discussion/531884) | 6 | 7 | 8 | ## Code snippets 9 | 1. `demo0.py` - building ML pipeline from blocks and fit + predict the pipeline itself. 10 | 2. 
`demo1.py` - several ML pipelines creation (using importances based cutoff feature selector) to build 2 level stacking using AutoML class 11 | 3. `demo2.py` - several ML pipelines creation (using iteartive feature selection algorithm) to build 2 level stacking using AutoML class 12 | 4. `demo3.py` - several ML pipelines creation (using combination of cutoff and iterative FS algos) to build 2 level stacking using AutoML class 13 | 5. `demo4.py` - creation of classification and regression tasks for AutoML with loss and evaluation metric setup 14 | 6. `demo5.py` - 2 level stacking using AutoML class with different algos on first level including LGBM, Linear and LinearL1 15 | 7. `demo6.py` - AutoML with nested CV usage 16 | 8. `demo7.py` - AutoML preset usage for tabular datasets (predefined structure of AutoML pipeline and simple interface for users without building from blocks) 17 | 9. `demo8.py` - creation pipelines from blocks to build AutoML, solving multiclass classification task 18 | 10. `demo9.py` - AutoML time utilization preset usage for tabular datasets (predefined structure of AutoML pipeline and simple interface for users without building from blocks) 19 | 11. `demo10.py` - creation pipelines from blocks (including CatBoost) to build AutoML, solving multiclass classification task 20 | 12. `demo11.py` - AutoML NLP preset usage for tabular datasets with text columns 21 | 13. `demo12.py` - AutoML tabular preset usage with custom validation scheme and multiprocessed inference 22 | 14. `demo13.py` - AutoML TS preset usage with lag and diff transformers' parameters selection 23 | 15. 
`demo14.py` - Groupby features (using TabularAutoML preset and custom pipeline) 24 | -------------------------------------------------------------------------------- /examples/data/meal_delivery_company/fulfilment_center_info.csv: -------------------------------------------------------------------------------- 1 | center_id,city_code,region_code,center_type,op_area 2 | 11,679,56,TYPE_A,3.7 3 | 13,590,56,TYPE_B,6.7 4 | 124,590,56,TYPE_C,4 5 | 66,648,34,TYPE_A,4.1 6 | 94,632,34,TYPE_C,3.6 7 | 64,553,77,TYPE_A,4.4 8 | 129,593,77,TYPE_A,3.9 9 | 139,693,34,TYPE_C,2.8 10 | 88,526,34,TYPE_A,4.1 11 | 143,562,77,TYPE_B,3.8 12 | 101,699,85,TYPE_C,2.8 13 | 86,699,85,TYPE_C,4 14 | 32,526,34,TYPE_A,3.8 15 | 149,478,77,TYPE_A,2.4 16 | 152,576,34,TYPE_B,4 17 | 92,526,34,TYPE_C,2.9 18 | 27,713,85,TYPE_A,4.5 19 | 14,654,56,TYPE_C,2.7 20 | 26,515,77,TYPE_C,3 21 | 104,647,56,TYPE_A,4.5 22 | 77,676,34,TYPE_A,3.8 23 | 23,698,23,TYPE_A,3.4 24 | 97,628,77,TYPE_A,4.6 25 | 146,526,34,TYPE_B,5 26 | 113,680,77,TYPE_C,4 27 | 145,620,77,TYPE_A,3.9 28 | 80,604,56,TYPE_C,5.1 29 | 55,647,56,TYPE_C,2 30 | 186,649,34,TYPE_A,3.4 31 | 99,596,71,TYPE_A,4.5 32 | 91,590,56,TYPE_C,0.9 33 | 20,522,56,TYPE_A,4 34 | 106,675,34,TYPE_A,4 35 | 81,526,34,TYPE_A,4 36 | 73,576,34,TYPE_A,4 37 | 29,526,34,TYPE_C,4 38 | 43,590,56,TYPE_A,5.1 39 | 102,593,77,TYPE_A,2.8 40 | 61,473,77,TYPE_A,4.5 41 | 50,556,77,TYPE_A,4.8 42 | 83,659,77,TYPE_A,5.3 43 | 57,541,77,TYPE_C,2.8 44 | 126,577,56,TYPE_A,2.7 45 | 177,683,56,TYPE_A,3.4 46 | 67,638,56,TYPE_B,7 47 | 174,700,56,TYPE_A,7 48 | 59,456,56,TYPE_A,4.2 49 | 58,695,77,TYPE_C,3.8 50 | 65,602,34,TYPE_A,4.8 51 | 39,526,34,TYPE_C,3.8 52 | 132,522,56,TYPE_A,3.9 53 | 89,703,56,TYPE_A,4.8 54 | 162,526,34,TYPE_C,2 55 | 75,651,77,TYPE_B,4.7 56 | 72,638,56,TYPE_C,3.9 57 | 41,590,56,TYPE_C,1.9 58 | 10,590,56,TYPE_B,6.3 59 | 110,485,77,TYPE_A,3.8 60 | 52,685,56,TYPE_B,5.6 61 | 93,461,34,TYPE_A,3.9 62 | 74,702,35,TYPE_A,2.8 63 | 34,615,34,TYPE_B,4.2 64 | 137,590,56,TYPE_A,4.4 65 | 
153,590,56,TYPE_A,3.9 66 | 24,614,85,TYPE_B,3.6 67 | 109,599,56,TYPE_A,3.6 68 | 108,579,56,TYPE_B,4.4 69 | 36,517,56,TYPE_B,4.4 70 | 157,609,93,TYPE_A,4.1 71 | 17,517,56,TYPE_A,3.2 72 | 161,658,34,TYPE_B,3.9 73 | 42,561,77,TYPE_B,3.9 74 | 53,590,56,TYPE_A,3.8 75 | 30,604,56,TYPE_A,3.5 76 | 76,614,85,TYPE_A,3 77 | 68,676,34,TYPE_B,4.1 78 | 51,638,56,TYPE_A,7 79 | -------------------------------------------------------------------------------- /examples/data/meal_delivery_company/meal_info.csv: -------------------------------------------------------------------------------- 1 | meal_id,category,cuisine 2 | 1885,Beverages,Thai 3 | 1993,Beverages,Thai 4 | 2539,Beverages,Thai 5 | 1248,Beverages,Indian 6 | 2631,Beverages,Indian 7 | 1311,Extras,Thai 8 | 1062,Beverages,Italian 9 | 1778,Beverages,Italian 10 | 1803,Extras,Thai 11 | 1198,Extras,Thai 12 | 2707,Beverages,Italian 13 | 1847,Soup,Thai 14 | 1438,Soup,Thai 15 | 2494,Soup,Thai 16 | 2760,Other Snacks,Thai 17 | 2490,Salad,Italian 18 | 1109,Rice Bowl,Indian 19 | 2290,Rice Bowl,Indian 20 | 1525,Other Snacks,Thai 21 | 2704,Other Snacks,Thai 22 | 1878,Starters,Thai 23 | 2640,Starters,Thai 24 | 2577,Starters,Thai 25 | 1754,Sandwich,Italian 26 | 1971,Sandwich,Italian 27 | 2306,Pasta,Italian 28 | 2139,Beverages,Indian 29 | 2826,Sandwich,Italian 30 | 2664,Salad,Italian 31 | 2569,Salad,Italian 32 | 1230,Beverages,Continental 33 | 1207,Beverages,Continental 34 | 2322,Beverages,Continental 35 | 2492,Desert,Indian 36 | 1216,Pasta,Italian 37 | 1727,Rice Bowl,Indian 38 | 1902,Biryani,Indian 39 | 1247,Biryani,Indian 40 | 2304,Desert,Indian 41 | 1543,Desert,Indian 42 | 1770,Biryani,Indian 43 | 2126,Pasta,Italian 44 | 1558,Pizza,Continental 45 | 2581,Pizza,Continental 46 | 1962,Pizza,Continental 47 | 1571,Fish,Continental 48 | 2956,Fish,Continental 49 | 2104,Fish,Continental 50 | 2444,Seafood,Continental 51 | 2867,Seafood,Continental 52 | 1445,Seafood,Continental 53 | 
-------------------------------------------------------------------------------- /examples/data/meal_delivery_company/relational_main.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/examples/data/meal_delivery_company/relational_main.csv.zip -------------------------------------------------------------------------------- /examples/demo11.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import shutil 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from sklearn.metrics import mean_squared_error 10 | from sklearn.model_selection import train_test_split 11 | 12 | from lightautoml.automl.presets.text_presets import TabularNLPAutoML 13 | from lightautoml.tasks import Task 14 | 15 | 16 | np.random.seed(42) 17 | 18 | data = pd.read_csv("./data/avito1k_train.csv") 19 | 20 | train, test = train_test_split(data, test_size=500, random_state=42) 21 | 22 | roles = { 23 | "target": "deal_probability", 24 | "group": ["user_id"], 25 | "text": ["description", "title", "param_1", "param_2", "param_3"], 26 | } 27 | 28 | task = Task("reg") 29 | 30 | automl = TabularNLPAutoML(task=task, timeout=600) 31 | oof_pred = automl.fit_predict(train, roles=roles) 32 | test_pred = automl.predict(test) 33 | not_nan = np.any(~np.isnan(oof_pred.data), axis=1) 34 | 35 | print("Check scores...") 36 | print("OOF score: {}".format(mean_squared_error(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0]))) 37 | print("TEST score: {}".format(mean_squared_error(test[roles["target"]].values, test_pred.data[:, 0]))) 38 | 39 | shutil.rmtree("./models", ignore_errors=True) 40 | -------------------------------------------------------------------------------- /examples/demo12.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.metrics import roc_auc_score 8 | from sklearn.model_selection import train_test_split 9 | 10 | from lightautoml.automl.presets.tabular_presets import TabularAutoML 11 | from lightautoml.tasks import Task 12 | from lightautoml.validation.np_iterators import TimeSeriesIterator 13 | 14 | 15 | ################################ 16 | # Features: 17 | # - working with np.arrays 18 | # - working with file 19 | # - custom time series split 20 | # - parallel/batch inference 21 | ################################ 22 | 23 | 24 | np.random.seed(42) 25 | 26 | data = pd.read_csv("./data/sampled_app_train.csv") 27 | 28 | data["BIRTH_DATE"] = (np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str) 29 | data["EMP_DATE"] = ( 30 | np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]")) 31 | ).astype(str) 32 | 33 | data["report_dt"] = np.datetime64("2018-01-01") 34 | 35 | data["constant"] = 1 36 | data["allnan"] = np.nan 37 | 38 | data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) 39 | 40 | train, test = train_test_split(data, test_size=2000, random_state=42) 41 | # create time series iterator that is passed as cv_func 42 | cv_iter = TimeSeriesIterator(train["EMP_DATE"].astype(np.datetime64), n_splits=5, sorted_kfold=False) 43 | 44 | # train dataset may be passed as dict of np.ndarray 45 | train = { 46 | "data": train[["AMT_CREDIT", "AMT_ANNUITY"]].values, 47 | "target": train["TARGET"].values, 48 | } 49 | 50 | task = Task( 51 | "binary", 52 | ) 53 | 54 | automl = TabularAutoML( 55 | task=task, 56 | timeout=200, 57 | ) 58 | oof_pred = automl.fit_predict(train, train_features=["AMT_CREDIT", "AMT_ANNUITY"], cv_iter=cv_iter) 59 | # prediction can be made on file by 60 | test.to_csv("temp_test_data.csv", index=False) 61 | test_pred = automl.predict("temp_test_data.csv", 
batch_size=100, n_jobs=4) 62 | 63 | print("Check scores...") 64 | oof_prediction = oof_pred.data[:, 0] 65 | not_empty = np.logical_not(np.isnan(oof_prediction)) 66 | 67 | print(f'OOF score: {roc_auc_score(train["target"][not_empty], oof_prediction[not_empty])}') 68 | print(f'TEST score: {roc_auc_score(test["TARGET"].values, test_pred.data[:, 0])}') 69 | -------------------------------------------------------------------------------- /examples/demo13.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.metrics import mean_absolute_error 5 | 6 | from lightautoml.addons.autots.base import AutoTS 7 | from lightautoml.tasks import Task 8 | 9 | 10 | np.random.seed(42) 11 | 12 | data = pd.read_csv("data/ai92_value_77.csv") 13 | horizon = 30 14 | 15 | train = data[:-horizon] 16 | test = data[-horizon:] 17 | 18 | roles = {"target": "value", "datetime": "date"} 19 | 20 | seq_params = { 21 | "seq0": { 22 | "case": "next_values", 23 | "params": {"n_target": horizon, "history": np.maximum(7, horizon), "step": 1, "test_last": True}, 24 | }, 25 | } 26 | 27 | # True (then set default values) / False; int, list or np.array 28 | # default: lag_features=30, diff_features=7 29 | transformers_params = { 30 | "lag_features": [0, 1, 2, 3, 5, 10], 31 | "lag_time_features": [0, 1, 2], 32 | "diff_features": [0, 1, 3, 4], 33 | } 34 | 35 | task = Task("multi:reg", greater_is_better=False, metric="mae", loss="mae") 36 | 37 | automl = AutoTS( 38 | task, 39 | seq_params=seq_params, 40 | trend_params={ 41 | "trend": False, 42 | }, 43 | transformers_params=transformers_params, 44 | ) 45 | train_pred, _ = automl.fit_predict(train, roles, verbose=4) 46 | forecast, _ = automl.predict(train) 47 | 48 | print("Check scores...") 49 | print("TEST score: {}".format(mean_absolute_error(test[roles["target"]].values, forecast.data))) 50 | 
-------------------------------------------------------------------------------- /examples/demo14.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from lightautoml.addons.hypex import Matcher 4 | 5 | 6 | df = pd.read_csv("data/sampled_matching.csv").drop(["Unnamed: 0"], axis=1) 7 | 8 | print(df.shape) 9 | print(df.columns) 10 | 11 | target = "created_variable" 12 | treatment = "is_tb_pilot" 13 | 14 | 15 | matcher = Matcher(df, target, treatment, is_feature_select=False, quality_check=True) 16 | 17 | matcher.estimate() 18 | 19 | print(matcher.matcher.ATE) 20 | print(matcher.matcher.quality_dict) 21 | -------------------------------------------------------------------------------- /examples/demo6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | """AutoML with nested CV usage.""" 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from sklearn.metrics import roc_auc_score 11 | from sklearn.model_selection import train_test_split 12 | 13 | from lightautoml.automl.presets.tabular_presets import TabularAutoML 14 | from lightautoml.dataset.roles import DatetimeRole 15 | from lightautoml.tasks import Task 16 | 17 | 18 | np.random.seed(42) 19 | 20 | data = pd.read_csv("./data/sampled_app_train.csv") 21 | 22 | data["BIRTH_DATE"] = (np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str) 23 | data["EMP_DATE"] = ( 24 | np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]")) 25 | ).astype(str) 26 | 27 | data["report_dt"] = np.datetime64("2018-01-01") 28 | 29 | data["constant"] = 1 30 | data["allnan"] = np.nan 31 | 32 | data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) 33 | 34 | train, test = train_test_split(data, test_size=2000, random_state=42) 35 | 36 | roles = { 37 | "target": "TARGET", 38 | 
DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt", 39 | } 40 | 41 | task = Task( 42 | "binary", 43 | ) 44 | 45 | automl = TabularAutoML( 46 | task=task, 47 | timeout=600, 48 | general_params={ 49 | "use_algos": [ 50 | [ 51 | "linear_l2", 52 | "lgb", 53 | ], 54 | ["linear_l2", "lgb"], 55 | ], 56 | "nested_cv": True, 57 | "skip_conn": True, 58 | }, 59 | nested_cv_params={"cv": 5, "n_folds": None}, 60 | ) 61 | 62 | oof_pred = automl.fit_predict(train, roles=roles) 63 | test_pred = automl.predict(test) 64 | 65 | not_nan = np.any(~np.isnan(oof_pred.data), axis=1) 66 | 67 | print(f"OOF score: {roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])}") 68 | print(f"TEST score: {roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])}") 69 | -------------------------------------------------------------------------------- /examples/demo7.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.metrics import roc_auc_score 8 | from sklearn.model_selection import train_test_split 9 | 10 | from lightautoml.automl.presets.tabular_presets import TabularAutoML 11 | from lightautoml.dataset.roles import DatetimeRole 12 | from lightautoml.tasks import Task 13 | 14 | 15 | np.random.seed(42) 16 | 17 | data = pd.read_csv("./data/sampled_app_train.csv") 18 | 19 | data["BIRTH_DATE"] = (np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str) 20 | data["EMP_DATE"] = ( 21 | np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]")) 22 | ).astype(str) 23 | 24 | data["report_dt"] = np.datetime64("2018-01-01") 25 | 26 | data["constant"] = 1 27 | data["allnan"] = np.nan 28 | 29 | data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) 30 | 31 | train, test = train_test_split(data, 
test_size=2000, random_state=42) 32 | 33 | roles = { 34 | "target": "TARGET", 35 | DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt", 36 | } 37 | 38 | task = Task( 39 | "binary", 40 | ) 41 | 42 | automl = TabularAutoML( 43 | task=task, 44 | timeout=3600, 45 | ) 46 | oof_pred = automl.fit_predict(train, roles=roles) 47 | test_pred = automl.predict(test) 48 | 49 | not_nan = np.any(~np.isnan(oof_pred.data), axis=1) 50 | 51 | print("Check scores...") 52 | print("OOF score: {}".format(roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0]))) 53 | print("TEST score: {}".format(roc_auc_score(test[roles["target"]].values, test_pred.data[:, 0]))) 54 | -------------------------------------------------------------------------------- /examples/demo9.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """AutoML time utilization preset usage for tabular datasets. 5 | 6 | Predefined structure of AutoML pipeline and simple interface for users without building from blocks. 
7 | 8 | """ 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from sklearn.metrics import roc_auc_score 14 | from sklearn.model_selection import train_test_split 15 | 16 | from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML 17 | from lightautoml.dataset.roles import DatetimeRole 18 | from lightautoml.tasks import Task 19 | 20 | 21 | np.random.seed(42) 22 | 23 | data = pd.read_csv("./data/sampled_app_train.csv") 24 | 25 | data["BIRTH_DATE"] = (np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str) 26 | data["EMP_DATE"] = ( 27 | np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]")) 28 | ).astype(str) 29 | 30 | data["report_dt"] = np.datetime64("2018-01-01") 31 | 32 | data["constant"] = 1 33 | data["allnan"] = np.nan 34 | 35 | data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) 36 | 37 | train, test = train_test_split(data, test_size=2000, random_state=42) 38 | 39 | roles = { 40 | "target": "TARGET", 41 | DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt", 42 | } 43 | 44 | task = Task("binary") 45 | 46 | automl = TabularUtilizedAutoML( 47 | task=task, 48 | timeout=600, 49 | ) 50 | oof_pred = automl.fit_predict(train, roles=roles) 51 | test_pred = automl.predict(test) 52 | 53 | # use only not nan 54 | not_nan = np.any(~np.isnan(oof_pred.data), axis=1) 55 | 56 | print(f"OOF score: {roc_auc_score(train['TARGET'].values[not_nan], oof_pred.data[not_nan])}") 57 | print(f"TEST score: {roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])}") 58 | -------------------------------------------------------------------------------- /examples/optimization/conditional_parameters.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | """Simple example for conditional parameters with OptunaTuner.""" 4 | 5 | import copy 6 | 7 | import optuna 
def sample(estimated_n_trials: int, trial: "optuna.trial.Trial", suggested_params: dict) -> dict:
    """Sample conditional hyperparameters for one Optuna trial.

    The range of ``min_sum_hessian_in_leaf`` depends on the value sampled
    for ``feature_fraction``.

    Args:
        estimated_n_trials: Upper estimate of the number of trials (part of
            the ``optimization_search_space`` callback signature; unused here).
        trial: Current Optuna trial used to suggest parameter values.
        suggested_params: Default parameters proposed by the model; shallow-copied,
            never mutated.

    Returns:
        Dict with the sampled parameter values merged over the defaults.
    """
    trial_values = copy.copy(suggested_params)
    trial_values["feature_fraction"] = trial.suggest_float("feature_fraction", low=0.5, high=1.0)

    if trial_values["feature_fraction"] > 0.7:
        trial_values["min_sum_hessian_in_leaf"] = trial.suggest_float(
            "min_sum_hessian_in_leaf", low=0.5, high=1, log=True
        )
    else:
        # BUGFIX: a log-scaled distribution requires low > 0 — Optuna raises
        # ValueError for suggest_float(..., low=0, ..., log=True). Use a tiny
        # positive lower bound instead of 0.
        trial_values["min_sum_hessian_in_leaf"] = trial.suggest_float(
            "min_sum_hessian_in_leaf", low=1e-8, high=0.5, log=True
        )

    return trial_values
# -*- encoding: utf-8 -*-

"""Simple example for binary classification on tabular data."""

import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularAutoML

# BUGFIX: `Distribution` and `SearchSpace` no longer exist in
# lightautoml.ml_algo.tuning.base (the module now exposes Choice/Uniform/Normal),
# so the old imports raised ImportError. Use `Uniform` with `log=True` instead
# of SearchSpace(Distribution.LOGUNIFORM, ...).
from lightautoml.ml_algo.tuning.base import Uniform
from lightautoml.tasks import Task


# load the dataset and carve out a stratified hold-out part
data = pd.read_csv("./data/sampled_app_train.csv")
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data["TARGET"], random_state=42)

# run automl with custom search spaces for the LightGBM models
automl = TabularAutoML(
    task=Task("binary"),
    lgb_params={
        "optimization_search_space": {
            "feature_fraction": Uniform(low=0.5, high=1.0),
            "min_sum_hessian_in_leaf": Uniform(low=1e-3, high=10.0, log=True),
        }
    },
)
oof_predictions = automl.fit_predict(train_data, roles={"target": "TARGET", "drop": ["SK_ID_CURR"]})
te_pred = automl.predict(test_data)

# calculate ROC-AUC on out-of-fold and hold-out predictions
print(f"Score for out-of-fold predictions: {roc_auc_score(train_data['TARGET'].values, oof_predictions.data[:, 0])}")
print(f"Score for hold-out: {roc_auc_score(test_data['TARGET'].values, te_pred.data[:, 0])}")
def sample(estimated_n_trials: int, trial: "optuna.trial.Trial", suggested_params: dict):
    """Yield parameter sets for a sequential sweep over ``feature_fraction``.

    Walks ``feature_fraction`` over {0.0, 0.1, ..., 0.9} and lets Optuna
    suggest ``min_sum_hessian_in_leaf`` at every grid point.

    Args:
        estimated_n_trials: Upper estimate of the number of trials (part of
            the ``optimization_search_space`` callback signature; unused here).
        trial: Current Optuna trial used to suggest parameter values.
        suggested_params: Default parameters proposed by the model; never mutated.

    Yields:
        Independent dict with parameter values for each grid point.
    """
    for step in range(10):
        # BUGFIX: yield a fresh copy per grid point. Previously a single dict
        # was copied once, mutated and re-yielded, so any consumer that kept
        # references to earlier yields silently saw only the last values.
        trial_values = copy.copy(suggested_params)
        trial_values["feature_fraction"] = step / 10
        trial_values["min_sum_hessian_in_leaf"] = trial.suggest_float("min_sum_hessian_in_leaf", low=0.5, high=1)
        yield trial_values
20 | oof_predictions = automl.fit_predict(train_data, roles={"target": "TARGET", "drop": ["SK_ID_CURR"]}) 21 | te_pred = automl.predict(test_data) 22 | 23 | # calculate scores 24 | print(f"Score for out-of-fold predictions: {roc_auc_score(train_data['TARGET'].values, oof_predictions.data[:, 0])}") 25 | print(f"Score for hold-out: {roc_auc_score(test_data['TARGET'].values, te_pred.data[:, 0])}") 26 | -------------------------------------------------------------------------------- /imgs/GENERALL2X2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/GENERALL2X2.jpg -------------------------------------------------------------------------------- /imgs/LightAutoML_logo_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/LightAutoML_logo_big.png -------------------------------------------------------------------------------- /imgs/LightAutoML_logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/LightAutoML_logo_small.png -------------------------------------------------------------------------------- /imgs/Star_scheme_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/Star_scheme_tables.png -------------------------------------------------------------------------------- /imgs/TabularAutoML_model_descr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/TabularAutoML_model_descr.png 
-------------------------------------------------------------------------------- /imgs/TabularUtilizedAutoML_model_descr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/TabularUtilizedAutoML_model_descr.png -------------------------------------------------------------------------------- /imgs/autoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/autoint.png -------------------------------------------------------------------------------- /imgs/denselight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/denselight.png -------------------------------------------------------------------------------- /imgs/densenet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/densenet.png -------------------------------------------------------------------------------- /imgs/fttransformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/fttransformer.png -------------------------------------------------------------------------------- /imgs/lightautoml_icon_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/lightautoml_icon_color.png -------------------------------------------------------------------------------- /imgs/lightautoml_logo_color.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/lightautoml_logo_color.png -------------------------------------------------------------------------------- /imgs/lime.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/lime.jpg -------------------------------------------------------------------------------- /imgs/node.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/node.png -------------------------------------------------------------------------------- /imgs/resnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/resnet.png -------------------------------------------------------------------------------- /imgs/swa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/swa.png -------------------------------------------------------------------------------- /imgs/tutorial_11_case_problem_statement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_11_case_problem_statement.png -------------------------------------------------------------------------------- /imgs/tutorial_11_general_problem_statement.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_11_general_problem_statement.png -------------------------------------------------------------------------------- /imgs/tutorial_11_history_step_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_11_history_step_params.png -------------------------------------------------------------------------------- /imgs/tutorial_11_transformers_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_11_transformers_params.png -------------------------------------------------------------------------------- /imgs/tutorial_1_initial_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_1_initial_report.png -------------------------------------------------------------------------------- /imgs/tutorial_1_laml_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_1_laml_big.png -------------------------------------------------------------------------------- /imgs/tutorial_1_ml_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_1_ml_pipeline.png -------------------------------------------------------------------------------- /imgs/tutorial_1_pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_1_pipeline.png -------------------------------------------------------------------------------- /imgs/tutorial_1_unfolded_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_1_unfolded_report.png -------------------------------------------------------------------------------- /imgs/tutorial_2_initial_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_2_initial_report.png -------------------------------------------------------------------------------- /imgs/tutorial_2_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_2_pipeline.png -------------------------------------------------------------------------------- /imgs/tutorial_2_unfolded_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_2_unfolded_report.png -------------------------------------------------------------------------------- /imgs/tutorial_3_initial_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/imgs/tutorial_3_initial_report.png -------------------------------------------------------------------------------- /imgs/tutorial_3_unfolded_report.png: -------------------------------------------------------------------------------- 
"""Package initialization: logging setup, public sub-modules and version."""

import logging
import os
import sys


_root_logger = logging.getLogger()
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.WARNING)

# When the application has already configured the root logger, let our records
# propagate up to it. Otherwise attach a stdout handler and keep records local.
if not _root_logger.hasHandlers():
    _stdout_handler = logging.StreamHandler(sys.stdout)
    _logger.addHandler(_stdout_handler)
    _logger.propagate = False

__all__ = [
    "automl",
    "dataset",
    "ml_algo",
    "pipelines",
    "image",
    "reader",
    "transformers",
    "validation",
    "text",
    "tasks",
    "utils",
    "addons",
    "report",
]

# Resolve the installed version from package metadata, except when building docs.
if os.getenv("DOCUMENTATION_ENV") is None:
    try:
        # stdlib since Python 3.8
        import importlib.metadata as importlib_metadata
    except ModuleNotFoundError:
        # backport package for older interpreters
        import importlib_metadata

    __version__ = importlib_metadata.version(__name__)
"""Model interpretation tools for text models."""

from .l2x import L2XTextExplainer
from .lime import LimeTextExplainer


# BUGFIX: "SSWARM" was listed in __all__ but is neither defined nor imported in
# this module (it lives in lightautoml.addons.tabular_interpretation), so
# `from lightautoml.addons.interpretation import *` raised AttributeError.
__all__ = ["LimeTextExplainer", "L2XTextExplainer"]
-------------------------------------------------------------------------------- 1 | """The main module, which includes the AutoML class, blenders and ready-made presets.""" 2 | 3 | __all__ = ["base", "presets", "blend"] 4 | -------------------------------------------------------------------------------- /lightautoml/automl/presets/__init__.py: -------------------------------------------------------------------------------- 1 | """Presets for end-to-end model training for special tasks.""" 2 | 3 | __all__ = [ 4 | "base", 5 | "tabular_presets", 6 | "image_presets", 7 | "text_presets", 8 | "whitebox_presets", 9 | ] 10 | -------------------------------------------------------------------------------- /lightautoml/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides an internal interface for working with data.""" 2 | 3 | __all__ = ["base", "roles", "np_pd_dataset", "utils"] 4 | -------------------------------------------------------------------------------- /lightautoml/image/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides an internal interface for working with image features.""" 2 | 3 | 4 | __all__ = ["image"] 5 | -------------------------------------------------------------------------------- /lightautoml/image/utils.py: -------------------------------------------------------------------------------- 1 | """Image utils.""" 2 | 3 | from PIL import Image 4 | 5 | 6 | def pil_loader(path: str) -> Image: 7 | """Load image from paths. 8 | 9 | Args: 10 | path: Image path. 11 | 12 | Returns: 13 | Loaded PIL Image in rgb. 
"""Module for Ghost Batch Norm and variations.

Ghost Batch Norm: https://arxiv.org/pdf/1705.08741.pdf

"""

from math import ceil
from typing import Union

import torch
from torch import Tensor
from torch import nn


class GhostNorm(nn.Module):
    """Ghost Normalization.

    Applies ``inner_norm`` independently to "ghost" chunks of the batch.

    https://arxiv.org/pdf/1705.08741.pdf

    Args:
        inner_norm : torch.nn.Module (initialized)
            examples: `nn.BatchNorm1d`, `nn.LayerNorm`
        virtual_batch_size : int
        device : string or torch.device, optional
            default is "cpu"
    """

    def __init__(
        self,
        inner_norm: nn.Module,
        virtual_batch_size: int,
        device: Union[str, torch.device] = "cpu",
    ):
        super().__init__()
        self.virtual_batch_size = virtual_batch_size
        self.inner_norm = inner_norm
        self.to(device)

    def forward(self, x: Tensor) -> Tensor:
        """Transform the input tensor.

        Args:
            x : torch.Tensor

        Returns:
            torch.Tensor

        """
        # Number of virtual batches; torch.chunk may yield a smaller last chunk.
        chunk_size = int(ceil(x.shape[0] / self.virtual_batch_size))
        chunk_norm = [self.inner_norm(chunk) for chunk in x.chunk(chunk_size, dim=0)]
        return torch.cat(chunk_norm, dim=0)


class GhostBatchNorm(GhostNorm):
    """Ghost Normalization, using BatchNorm1d as inner normalization.

    https://arxiv.org/pdf/1705.08741.pdf

    Args:
        num_features : int
        virtual_batch_size : int, optional
            default is 64
        momentum : float, optional
            default is 0.1
        device : string or torch.device, optional
            default is "cpu"
    """

    def __init__(
        self,
        num_features: int,
        virtual_batch_size: int = 64,
        momentum: float = 0.1,
        device: Union[str, torch.device] = "cpu",
    ):
        super().__init__(
            inner_norm=nn.BatchNorm1d(num_features, momentum=momentum),
            virtual_batch_size=virtual_batch_size,
            # BUGFIX: `device` was accepted but never forwarded, so the module
            # always stayed on the parent default ("cpu") regardless of the
            # argument.
            device=device,
        )
class Normal(DistributionBase):
    """Normal (Gaussian-shaped) search-space distribution.

    Stores the raw search bounds; interpretation (sampling) is left to the
    concrete tuner backend.

    Args:
        low: Lower bound of the search range.
        high: Upper bound of the search range.
        q: Optional discretization step; ``None`` means continuous.
        log: Whether the range is treated on a logarithmic scale.
    """

    def __init__(self, low, high, q=None, log=False) -> None:
        self.low = low
        self.high = high
        self.q = q
        self.log = log
class DefaultTuner(ParamsTuner):
    """Trivial tuner: keep the algorithm's default parameters untouched."""

    _name: str = "DefaultTuner"

    def fit(
        self,
        ml_algo: "MLAlgo",
        train_valid_iterator: Optional[TrainValidIterator] = None,
    ) -> Tuple[None, None]:
        """Store the algorithm's own defaults as the "best" parameters.

        Args:
            ml_algo: Algorithm that is tuned.
            train_valid_iterator: Passed through so data-dependent defaults
                can be inferred.

        Returns:
            Tuple (None, None).
        """
        defaults = ml_algo.init_params_on_input(train_valid_iterator=train_valid_iterator)
        self._best_params = defaults
        return None, None
class WBFeatures(FeaturesPipeline, TabularDataFeatures):
    """Simple WhiteBox pipeline.

    Only date-derived features are generated here; everything else is
    handled inside WhiteBox itself.

    """

    def create_pipeline(self, train: PandasDataset) -> LAMLTransformer:
        """Create pipeline for WhiteBox.

        Args:
            train: Dataset with train features.

        Returns:
            Transformer.

        """
        # categorical and numeric columns are passed through unchanged
        passthrough = get_columns_by_role(train, "Category") + get_columns_by_role(train, "Numeric")

        candidates = [
            self.get_datetime_diffs(train),
            self.get_datetime_seasons(train, NumericRole(np.float32)),
            ColumnsSelector(passthrough),
        ]

        # drop transformers that could not be built (no suitable columns)
        return UnionTransformer([t for t in candidates if t is not None])
class ModelBasedImportanceEstimator(ImportanceEstimator):
    """Base class for performing feature selection using model feature importances."""

    def fit(
        self,
        train_valid: Optional[TrainValidIterator] = None,
        ml_algo: Optional[ImportanceEstimatedAlgo] = None,
        preds: Optional[LAMLDataset] = None,
    ):
        """Find the importances of features.

        Args:
            train_valid: dataset iterator. Not used here.
            ml_algo: ML algorithm used for importance estimation.
                Must already be fitted - its feature scores are taken as-is.
            preds: predicted target values. Not used here.

        """
        # The estimator computes nothing itself: it simply reads the feature
        # scores off the fitted model, so the model argument is mandatory.
        assert (
            ml_algo is not None
        ), "ModelBasedImportanceEstimator: raw importances are None and no MLAlgo to calculate them."
        self.raw_importances = ml_algo.get_features_score()
class ImportanceCutoffSelector(SelectionPipeline):
    """Selector based on importance threshold.

    It is important that data which passed to ``.fit``
    should be ok to fit `ml_algo` or preprocessing pipeline should be defined.

    Args:
        feature_pipeline: Composition of feature transforms.
        ml_algo: Tuple (MlAlgo, ParamsTuner).
        imp_estimator: Feature importance estimator.
        fit_on_holdout: If use the holdout iterator.
        cutoff: Threshold to cut-off features.

    """

    def __init__(
        self,
        feature_pipeline: Optional[FeaturesPipeline],
        ml_algo: MLAlgo,
        imp_estimator: ImportanceEstimator,
        fit_on_holdout: bool = True,
        cutoff: float = 0.0,
    ):
        super().__init__(feature_pipeline, ml_algo, imp_estimator, fit_on_holdout)
        self.cutoff = cutoff

    def perform_selection(self, train_valid: Optional[TrainValidIterator] = None):
        """Keep the features whose mapped importance exceeds ``cutoff``.

        Args:
            train_valid: Not used.

        """
        raw_scores = self.imp_estimator.get_features_score()
        self.map_raw_feature_importances(raw_scores)

        importances = self.mapped_importances
        chosen = importances.index.values[importances.values > self.cutoff]
        # Guard against dropping everything: keep at least the top feature.
        if len(chosen) == 0:
            chosen = importances.index.values[:1]
        self._selected_features = list(chosen)
class HighCorrRemoval(SelectionPipeline):
    """Selector to remove highly correlated features.

    Del totally correlated feats to speedup L1 regression models.
    For sparse data cosine will be used.
    It's not exact, but ok for remove very high correlations.

    Args:
        corr_co: Similarity threshold.
        subsample: Number (int) of samples, or frac (float) from full dataset.
        random_state: Random seed for subsample.
        **kwargs: Additional parameters. Used for initialization of parent class.

    """

    def __init__(self, corr_co: float = 0.98, subsample: Union[int, float] = 100000, random_state: int = 42, **kwargs):
        super().__init__(**kwargs)
        self.corr_co = corr_co
        self.subsample = subsample
        self.random_state = random_state

    def perform_selection(self, train_valid: Optional[TrainValidIterator]):
        """Select features to save in dataset during selection.

        Method is used to perform selection based on features correlation.
        Should save ``_selected_features`` attribute in the end of working.

        Args:
            train_valid: Classic cv-iterator.

        """
        train = train_valid.train.data
        target = train_valid.train.target

        # A single feature has nothing to be correlated with - keep it.
        if train.shape[1] == 1:
            self._selected_features = train_valid.features
            return

        # Optionally subsample rows to bound the cost of the similarity matrix.
        # subsample < 1 is treated as a fraction, otherwise as an absolute count.
        if self.subsample != 1 and self.subsample < train.shape[0]:
            if self.subsample < 1:
                subsample = int(train.shape[0] * self.subsample)
            else:
                subsample = int(self.subsample)

            idx = np.random.RandomState(self.random_state + 1).permutation(train.shape[0])[:subsample]
            train, target = train[idx], target[idx]

        # correlation or cosine
        if type(train) is np.ndarray:
            corr = np.corrcoef(train, rowvar=False)

        else:
            # Sparse input: cosine similarity as a cheap correlation proxy.
            xtx = train.T * train
            norm = sp_linalg.norm(train, axis=0)
            corr = np.array(xtx / (norm[:, np.newaxis] * norm[np.newaxis, :]))
            del xtx

        # Strict upper triangle (k=1) of |corr| > threshold: each True at
        # (row, col) flags a highly similar feature pair with row < col.
        sl = np.triu(np.abs(corr) > self.corr_co, k=1)
        grid_x, grid_y = np.meshgrid(np.arange(sl.shape[0]), np.arange(sl.shape[0]))

        removed = set()

        # For each flagged pair keep one member and drop the other.
        # NOTE(review): with default 'xy' meshgrid indexing, x is the column
        # index and y the row index, so this keeps the later feature of each
        # pair - confirm that is the intended tie-break.
        for x, y in zip(grid_x[sl], grid_y[sl]):
            if x not in removed:
                removed.add(y)

        # NaN on the diagonal of corrcoef marks a constant column - drop those too.
        const = np.arange(corr.shape[0])[np.isnan(np.diagonal(corr))]
        for i in const:
            removed.add(i)

        self._selected_features = [x for (n, x) in enumerate(train_valid.features) if n not in removed]
def map_pipeline_names(input_names: Sequence[str], output_names: Sequence[str]) -> List[Optional[str]]:
    """Pipelines create name in the way 'prefix__feature_name'.

    Multiple pipelines will create names
    in the way 'prefix1__prefix2__feature_name'.
    This function maps initial features names to outputs.
    Result may be not exact in some rare cases,
    but it's ok for real pipelines.

    Args:
        input_names: Initial feature names.
        output_names: Output feature names.

    Returns:
        Mapping between feature names.

    """
    # TODO: Add assert here
    known = set(input_names)
    mapped: List[Optional[str]] = [None] * len(output_names)

    for pos, out_name in enumerate(output_names):
        parts = out_name.split("__")
        # Strip leading prefixes one at a time until a known source name remains.
        for start in range(len(parts)):
            candidate = "__".join(parts[start:])
            if candidate in known:
                mapped[pos] = candidate
                break

    assert None not in mapped, "Can not infer names. For feature selection purposes use simple pipeline (one-to-one)"

    return mapped
def get_columns_by_role(dataset: LAMLDataset, role_name: str, **kwargs: Any) -> List[str]:
    """Search for columns with specific role and attributes when building pipeline.

    Args:
        dataset: Dataset to search.
        role_name: Name of features role.
        **kwargs: Specific parameters values to search.
            Example: search for categories with OHE processing only.

    Returns:
        List of str features names.

    """
    matched: List[str] = []
    inv_roles = dataset.inverse_roles

    for role in inv_roles:
        if role.name != role_name:
            continue
        # Role matches by name - now check every requested attribute.
        ok = True
        for attr_name, required in kwargs.items():
            try:
                actual = getattr(role, attr_name)
            except AttributeError:
                # A role without the attribute cannot satisfy the filter.
                ok = False
                break
            if actual != required:
                ok = False
                break
        if ok:
            matched.extend(inv_roles[role])

    return sorted(matched)
def set_sklearn_folds(
    task: Task,
    target: np.ndarray,
    cv: Union[Callable, int] = 5,
    random_state: int = 42,
    group: Optional[np.ndarray] = None,
) -> Optional[np.ndarray]:
    """Determines the cross-validation splitting strategy.

    Args:
        task: If `'binary'` or `'multiclass'` used stratified cv.
        target: Target values.
        cv: Specifies number of folds.
        random_state: Determines random number generation.
        group: For group k-folding.

    Returns:
        Array with fold indices, or None when ``cv`` is not an int.

    """
    # Only an integer fold count is handled here; any other cv spec
    # is left for the caller to interpret.
    if type(cv) is not int:
        return None

    if group is not None:
        splitter = GroupKFold(cv).split(group, group, group)
    elif task.name in ["binary", "multiclass"]:
        splitter = StratifiedKFold(cv, random_state=random_state, shuffle=True).split(target, target)
    else:
        splitter = KFold(cv, random_state=random_state, shuffle=True).split(target, target)

    # Encode each sample with the index of the fold it validates in.
    folds = np.zeros(target.shape[0], dtype=np.int32)
    for fold_idx, (_, valid_idx) in enumerate(splitter):
        folds[valid_idx] = fold_idx

    return folds
  • 2 |

    {{ title }}

    3 |
      4 |
    • 5 |

      ROC curve

      6 |
        7 |

        AUC valid plot full

        8 |
      9 |
    • 10 |
    • 11 |

      ROC-PR curve

      12 |
        13 |

        PR valid plot full

        14 |
      15 |
    • 16 |
    • 17 |

      Pie F1 metric

      18 |
        19 |

        Pie F1 metric

        20 |
      21 |
    • 22 |
    • 23 |

      Distribution of object predictions by bins

      24 |
        25 |

        preds_distribution_by_bins

        26 |

        distribution_of_logits

        27 |
      28 |
    • 29 |
    • 30 |

      Distribution of Logits by bins

      31 |
        32 | {{ sample_bins_table }} 33 |
      34 |
    • 35 |
    36 |
  • 37 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/feature_importance_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Feature importance

    3 |
      4 | {% if feature_importance is not none %} 5 |

      Feature importance calculation method: {{fi_method}}.

      6 |

      feature_importance

      7 | {% else %} 8 |

      No feature importance provided for original features.

      9 | {% endif %} 10 |
    11 |
  • 12 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/feature_importance_utillized_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Feature importance

    3 |
      4 | {% if feature_importance is not none %} 5 |

      Feature importance calculation method: {{fi_method}}.

      6 |

      feature_importance

      7 | {% else %} 8 |

      No feature importance provided for original features or used presets do not provide feature importance.

      9 |

      Try increasing the timeout to use more presets.

      10 | {% endif %} 11 |
    12 |
  • 13 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/interpretation_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    PDP interpretation

    3 |
      4 | {% if interpretation_top is not none %} 5 | {% for section in interpretation_top %} 6 | {{ section }} 7 | {% endfor %} 8 | {% else %} 9 |

      Pass valid_data to build PDP interpretation section.

      10 | {% endif %} 11 |
    12 |
  • 13 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/interpretation_subsection.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ feature_name }}

    3 |
      4 |

      feature_interpretation_plot

      5 |
    6 |
  • 7 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/lama_base_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LAMA report 8 | 9 | 85 | 89 | 90 | 91 |
    92 |

    LAMA report

    93 |
    94 |
      95 | {% for section in sections %} 96 | {{ section }} 97 | {% endfor %} 98 |
    99 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/model_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Model overview

    3 |
      4 |
    • 5 |

      Model parameters

      6 |
        7 |

        The following model is loaded: {{ model_name }}.

        8 |

        Model parameters are described below:

        9 | 10 | {{ model_parameters }} 11 |
      12 |
    • 13 |
    • 14 |

      Summary results

      15 |
        16 | {% if model_summary is not none %} 17 |

        Results for data samples:

        18 | 19 | {{ model_summary }} 20 | {% else %} 21 |

        Use fit_predict() for training model.

        22 | {% endif %} 23 |
      24 |
    • 25 |
    26 |
  • 27 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/model_section_utilized.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Model overview

    3 |
      4 |
    • 5 |

      Summary results

      6 |
        7 | {% if model_summary is not none %} 8 |

        Results for data samples:

        9 | 10 | {{ model_summary }} 11 | {% else %} 12 |

        Use fit_predict() for training model.

        13 | {% endif %} 14 |
      15 |
    • 16 |
    • 17 |

      Prediction formula

      18 |
        19 | {% if pred_formula is not none %} 20 | {{ pred_formula }} 21 | {% else %} 22 |

        Use fit_predict() for training model.

        23 | {% endif %} 24 |
      25 |
    • 26 |
    • 27 |

      Model parameters

      28 |
        29 |

        The following model is loaded: {{ model_name }}.

        30 | 31 | {% if model_presets is not none %} 32 |

        Parameters of applied presets are described below:

        33 | {% for preset in model_presets %} 34 | {{ preset }} 35 | {% endfor %} 36 | {% else %} 37 |

        Use fit_predict() for training model. 38 | Description of applied presets will appear in this section.

        39 | {% endif %} 40 |
      41 |
    • 42 |
    43 |
  • 44 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/multiclass_inference_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ title }}

    3 |
      4 |
    • 5 |

      Metrics for separate classes

      6 |
        7 | {{classification_report}} 8 |
      9 |
    • 10 |
    • 11 |

      Confusion matrix

      12 |
        13 | Confusion matrix is normalized over the true (rows) conditions. 14 |

        confusion_matrix

        15 |
      16 |
    • 17 |
    18 |
  • 19 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/nlp_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    NLP section

    3 |
      4 | {% for section in nlp_subsections %} 5 | {{ section }} 6 | {% endfor %} 7 |
    8 |
  • 9 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/nlp_subsection.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ title }}

    3 |
      4 |

      char_len_histogram

      5 |

      tokens_len_histogram

      6 |
    7 |
  • 8 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/preset_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ preset_name }}

    3 |
      4 | {{ model_parameters }} 5 |
    6 |
  • 7 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/reg_inference_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ title }}

    3 |
      4 |
    • 5 |

      Distribution of targets

      6 |
        7 |

        target_distribution

        8 |
      9 |
    • 10 |
    • 11 |

      Predictions vs labels difference histogram

      12 |
        13 |

        error_hist

        14 |
      15 |
    • 16 |
    • 17 |

      Predictions vs labels scatter plot

      18 |
        19 |

        scatter_plot

        20 |
      21 |
    • 22 |
    23 |
  • 24 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/results_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Detailed model results

    3 |
      4 | {% for section in model_results %} 5 | {{ section }} 6 | {% endfor %} 7 |
    8 |
  • 9 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/train_set_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Data overview

    3 |
      4 |
    • 5 |

      Train data summary

      6 |
        7 | {{ train_data_overview }} 8 |
      9 |
    • 10 |
    • 11 |

      Train data details

      12 |
        13 |
      • 14 |

        Numerical features

        15 |
          16 | {% if numerical_features_table is not none %} 17 | {{ numerical_features_table }} 18 | {% else %} 19 |

          No numerical features.

          20 | {% endif %} 21 |
        22 |
      • 23 |
      • 24 |

        Categorical features

        25 |
          26 | {% if categorical_features_table is not none %} 27 | {{ categorical_features_table }} 28 | {% else %} 29 |

          No categorical features.

          30 | {% endif %} 31 |
        32 |
      • 33 |
      • 34 |

        Datetime features

        35 |
          36 | {% if datetime_features_table is not none %} 37 | {{ datetime_features_table }} 38 | {% else %} 39 |

          No datetime features.

          40 | {% endif %} 41 |
        42 |
      • 43 |
      • 44 |

        Textual features

        45 |
          46 | {% if text_features_table is not none %} 47 | {{ text_features_table }} 48 | {% else %} 49 |

          No textual features.

          50 | {% endif %} 51 |
        52 |
      • 53 |
      • 54 |

        Dropped features

        55 |
          56 | {% if dropped_features_table is not none %} 57 |

          Some features were excluded from the training set.

          58 |

          Except {{ target }} variable which was used as target, also there were excluded variables, in which NaN rate exceeds max_nan_rate = {{ max_nan_rate }}. In addition to this, the variables with max_constant_rate > {{ max_constant_rate }} excluded:

          59 | {{ dropped_features_table }} 60 | {% else %} 61 |

          No dropped features.

          62 | {% endif %} 63 | 64 |
        65 |
      • 66 |
      67 |
    • 68 |
    69 |
  • 70 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/train_set_section_utilized.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Data overview

    3 |
      4 |
    • 5 |

      Roles table

      6 |
        7 |

        Each reader has its own preprocessing configuration and type guessing, thus 8 | each reader recognises feature roles differently. 9 |

        10 |

        Table below summarizes the guessed feature roles for all applied presets.

        11 |

        "N" - numerical, "C" - categorical, "D" - datetime, "T" - textual, "-" - dropped.

        12 | {{ roles_table }} 13 |
      14 |
    • 15 | 16 |
    • 17 |

      Precise description

      18 |
        19 |

        Below is the precise description of training data for each preset.

        20 | {% for section in data_sections %} 21 | {{ section }} 22 | {% endfor %} 23 |
      24 |
    • 25 |
    26 |
  • 27 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/uplift_section.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    Uplift performance

    3 |
      4 | {% for section in uplift_results %} 5 | {{ section }} 6 | {% endfor %} 7 |
    8 |
  • 9 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/uplift_subsection.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ title }}

    3 |
      4 |
    • 5 |

      Test sample summary

      6 |
        7 | {{ test_data_overview }} 8 |
      9 |
    • 10 |
    • 11 |

      Uplift curves

      12 |
        13 |

        Uplift curve

        14 |
      15 |
    • 16 |
    • 17 |

      Uplift distribution KDE

      18 |
        19 |

        preds_distribution_by_bins

        20 |
      21 |
    • 22 |
    • 23 |

      Uplift distribution by bins

      24 |
        25 | {{ uplift_bins_table }} 26 |
      27 |
    • 28 |
    29 |
  • 30 | -------------------------------------------------------------------------------- /lightautoml/report/lama_report_templates/utilized_data_subsections.html: -------------------------------------------------------------------------------- 1 |
  • 2 |

    {{ preset_name }}

    3 |
      4 |
    • 5 |
      Train data summary
      6 |
        7 | {{ train_data_overview }} 8 |
      9 |
    • 10 |
    • 11 |
      Train data details
      12 |
        13 |
      • 14 |
        Numerical features
        15 |
          16 | {% if numerical_features_table is not none %} 17 | {{ numerical_features_table }} 18 | {% else %} 19 |

          No numerical features.

          20 | {% endif %} 21 |
        22 |
      • 23 |
      • 24 |
        Categorical features
        25 |
          26 | {% if categorical_features_table is not none %} 27 | {{ categorical_features_table }} 28 | {% else %} 29 |

          No categorical features.

          30 | {% endif %} 31 |
        32 |
      • 33 |
      • 34 |
        Datetime features
        35 |
          36 | {% if datetime_features_table is not none %} 37 | {{ datetime_features_table }} 38 | {% else %} 39 |

          No datetime features.

          40 | {% endif %} 41 |
        42 |
      • 43 |
      • 44 |
        Textual features
        45 |
          46 | {% if text_features_table is not none %} 47 | {{ text_features_table }} 48 | {% else %} 49 |

          No textual features.

          50 | {% endif %} 51 |
        52 |
      • 53 |
      • 54 |
        Dropped features
        55 |
          56 | {% if dropped_features_table is not none %} 57 |

          Some features were excluded from the training set.

          58 |

          Except {{ target }} variable which was used as target, also there were excluded variables, in which NaN rate exceeds max_nan_rate = {{ max_nan_rate }}. In addition to this, the variables with max_constant_rate > {{ max_constant_rate }} excluded:

          59 | {{ dropped_features_table }} 60 | {% else %} 61 |

          No dropped features.

          62 | {% endif %} 63 | 64 |
        65 |
      • 66 |
      67 |
    • 68 |
    69 |
  • 70 | -------------------------------------------------------------------------------- /lightautoml/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """Define the task to solve its loss, metric.""" 2 | 3 | from .base import Task 4 | 5 | 6 | __all__ = ["losses", "base", "common_metric", "utils", "Task"] 7 | -------------------------------------------------------------------------------- /lightautoml/tasks/losses/__init__.py: -------------------------------------------------------------------------------- 1 | """Set of loss functions for different machine learning algorithms.""" 2 | 3 | from .base import _valid_str_metric_names 4 | from .cb import CBLoss 5 | from .lgb import LGBLoss 6 | from .xgb import XGBLoss 7 | from .sklearn import SKLoss 8 | from .torch import TORCHLoss 9 | from .torch import TorchLossWrapper 10 | 11 | 12 | __all__ = [ 13 | "XGBLoss", 14 | "LGBLoss", 15 | "TORCHLoss", 16 | "SKLoss", 17 | "CBLoss", 18 | "_valid_str_metric_names", 19 | "TorchLossWrapper", 20 | ] 21 | -------------------------------------------------------------------------------- /lightautoml/tasks/losses/lgb_custom.py: -------------------------------------------------------------------------------- 1 | """Custom metrics and loss functions for LightGBM.""" 2 | 3 | from typing import Tuple 4 | 5 | import lightgbm as lgb 6 | import numpy as np 7 | 8 | from scipy.special import softmax 9 | 10 | 11 | def softmax_ax1(x: np.ndarray) -> np.ndarray: 12 | """Softmax columnwise. 13 | 14 | Args: 15 | x: input. 16 | 17 | Returns: 18 | softmax values. 19 | 20 | """ 21 | return softmax(x, axis=1) 22 | 23 | 24 | def lgb_f1_loss_multiclass( 25 | preds: np.ndarray, train_data: lgb.Dataset, clip: float = 1e-5 26 | ) -> Tuple[np.ndarray, np.ndarray]: 27 | """Custom loss for optimizing f1. 28 | 29 | Args: 30 | preds: Predctions. 31 | train_data: Dataset in LightGBM format. 32 | clip: Clump constant. 33 | 34 | Returns: 35 | Gradient, hessian. 
def lgb_f1_loss_multiclass(
    preds: np.ndarray, train_data: lgb.Dataset, clip: float = 1e-5
) -> Tuple[np.ndarray, np.ndarray]:
    """Custom loss for optimizing f1.

    Args:
        preds: Predictions, flattened in Fortran (column-major) order
            as LightGBM supplies them for multiclass objectives.
        train_data: Dataset in LightGBM format.
        clip: Clipping constant for probabilities.

    Returns:
        Gradient, hessian (flattened back to Fortran order).

    """
    y_true = train_data.get_label().astype(np.int32)
    # Restore the (n_samples, n_classes) matrix from the flat F-order array.
    preds = preds.reshape((y_true.shape[0], -1), order="F")
    # softmax
    preds = np.clip(softmax_ax1(preds), clip, 1 - clip)
    # make ohe
    y_ohe = np.zeros_like(preds)
    np.add.at(y_ohe, (np.arange(y_true.shape[0]), y_true), 1)
    # grad
    grad = (preds - y_ohe) * preds
    # hess
    # NOTE(review): the (2*preds - y_ohe) factor is clipped away from zero for
    # stability, so this hessian is an approximation rather than the exact
    # second derivative - confirm intent.
    hess = (1 - preds) * preds * np.clip((2 * preds - y_ohe), 1e-3, np.inf)
    # reshape back preds
    return grad.reshape((-1,), order="F"), hess.reshape((-1,), order="F")
class SKLoss(Loss):
    """Loss used for scikit-learn.

    Args:
        loss: One of default loss function.
            Valid are: 'logloss', 'mse', 'mae', 'crossentropy', 'rmsle'.
        loss_params: Additional loss parameters.
        fw_func: Forward transformation.
            Used for transformation of target and item weights.
        bw_func: backward transformation.
            Used for predict values transformation.

    """

    def __init__(
        self,
        loss: str,
        loss_params: Optional[Dict] = None,
        fw_func: Optional[Callable] = None,
        bw_func: Optional[Callable] = None,
    ):
        assert loss in [
            "logloss",
            "mse",
            "mae",
            "crossentropy",
            "rmsle",
        ], "Not supported in sklearn in general case."
        # mse/rmsle are the regression losses; everything else is classification.
        self.flg_regressor = loss in ["mse", "rmsle"]

        if loss in _sk_loss_mapping:
            # rmsle maps to mse on transformed targets, so the mapping also
            # overrides the forward/backward transformations.
            self.loss, fw_func, bw_func = _sk_loss_mapping[loss]
        else:
            self.loss = loss
        # set forward and backward transformations
        if fw_func is not None:
            self._fw_func = fw_func
        if bw_func is not None:
            self._bw_func = bw_func

        self.loss_params = loss_params

    def set_callback_metric(
        self,
        metric: Union[str, Callable],
        greater_is_better: Optional[bool] = None,
        metric_params: Optional[Dict] = None,
        task_name: Optional[str] = None,
    ):
        """Callback metric setter.

        Uses default callback of parent class `Loss`.

        Args:
            metric: Callback metric.
            greater_is_better: Whether or not higher value is better.
            metric_params: Additional metric parameters.
            task_name: Name of task.

        """
        # Some losses force a specific callback metric (e.g. rmsle -> mse),
        # overriding whatever the caller requested.
        if self.loss in _sk_force_metric:
            metric, greater_is_better, metric_params = _sk_force_metric[self.loss]
            # NOTE(review): `info2` looks like a project-specific logging level -
            # confirm it is registered before this is called.
            logger.info2("For sklearn {0} callback metric switched to {1}".format(self.loss, metric))

        super().set_callback_metric(metric, greater_is_better, metric_params, task_name)
13 | 14 | Returns: 15 | ```True``` if grater is better. 16 | 17 | Raises: 18 | AssertionError: If there is no way to order the predictions. 19 | 20 | """ 21 | label = np.array([0, 1]) 22 | pred = np.array([0.1, 0.9]) 23 | 24 | g_val = metric(label, pred) 25 | b_val = metric(label, pred[::-1]) 26 | 27 | assert g_val != b_val, "Cannot infer greater is better from metric." " Should be set manually." 28 | 29 | return g_val > b_val 30 | 31 | 32 | def infer_gib_multiclass(metric: Callable) -> bool: 33 | """Infer greater is better from metric. 34 | 35 | Args: 36 | metric: Metric function. It must take two arguments y_true, y_pred. 37 | 38 | Returns: 39 | ```True``` if grater is better. 40 | 41 | Raises: 42 | AssertionError: If there is no way to order the predictions. 43 | 44 | """ 45 | label = np.array([0, 1, 2]) 46 | pred = np.array([[0.9, 0.05, 0.05], [0.05, 0.9, 0.05], [0.05, 0.05, 0.9]]) 47 | 48 | g_val = metric(label, pred) 49 | b_val = metric(label, pred[::-1]) 50 | 51 | assert g_val != b_val, "Cannot infer greater is better from metric. " "Should be set manually." 
class BertDataset:
    """Dataset class with transformers tokenization.

    Prepares transformer inputs: each sentence is tokenized, padded and
    truncated to ``max_length`` and returned as a dict of numpy arrays.

    Args:
        sentences: List of tokenized sentences.
        max_length: Max sentence length.
        model_name: Name of transformer model.
        **kwargs: Other.

    """

    def __init__(self, sentences: Sequence[str], max_length: int, model_name: str, **kwargs: Any):
        self.sentences = sentences
        self.max_length = max_length
        # NOTE(review): the slow tokenizer is selected explicitly (use_fast=False).
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    def __getitem__(self, idx: int) -> Dict[str, np.ndarray]:
        text = self.sentences[idx]
        halves = text.split("[SEP]")
        # A "[SEP]"-joined pair is encoded as two segments, otherwise as one.
        segments = tuple(halves) if len(halves) == 2 else (text,)
        encoded = self.tokenizer.encode_plus(
            *segments,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        return {name: np.array(field) for name, field in encoded.items()}

    def __len__(self) -> int:
        return len(self.sentences)
class EmbedDataset:
    """Dataset class for extracting word embeddings.

    Transforms a list of tokens into a fixed-size matrix of embeddings plus
    the number of tokens that were actually embedded.

    Args:
        sentences: List of tokenized sentences.
        embedding_model: word2vec, fasstext, etc.
            Should have dict interface {<word>: <embedding>}.
        max_length: Max sentence length.
        embed_size: Size of embedding.
        **kwargs: Not used.

    """

    def __init__(self, sentences: Sequence[str], embedding_model: Dict, max_length: int, embed_size: int, **kwargs):
        self.sentences = sentences
        self.embedding_model = embedding_model
        self.max_length = max_length
        self.embed_size = embed_size

    def __getitem__(self, idx: int) -> Dict[str, Union[Sequence, int]]:
        matrix = np.zeros((self.max_length, self.embed_size))
        n_embedded = 0
        for token in self.sentences[idx]:
            # Out-of-vocabulary tokens are silently skipped.
            if token not in self.embedding_model:
                continue
            matrix[n_embedded, :] = self.embedding_model[token]
            n_embedded += 1
            if n_embedded >= self.max_length:
                break
        # A zero length would break downstream pooling, so report at least 1.
        return {"text": matrix, "length": max(n_embedded, 1)}

    def __len__(self) -> int:
        return len(self.sentences)
def __validate_extra_deps(extra_section: str, error: bool = False) -> None:
    """Check if extra dependencies is installed.

    Args:
        extra_section: Name of extra dependencies
        error: How to process error

    """
    # In the documentation build environment missing extras are tolerated.
    skip_missing = os.environ.get("DOCUMENTATION_ENV", False)

    metadata = distribution("lightautoml").metadata
    extra_marker = 'extra == "{}"'.format(extra_section)
    # Collect package names declared under the requested extra section.
    required = [
        value.split(";")[0].split()[0]
        for key, value in metadata.items()
        if key == "Requires-Dist" and extra_marker in value
    ]

    for requirement in required:
        package_name: str = requirement.split()[0]
        try:
            distribution(package_name)
        except PackageNotFoundError as exc:
            # Always warn; escalate to an exception only when requested.
            logger.warning(
                "'%s' extra dependency package '%s' isn't installed. "
                "Look at README.md in repo 'LightAutoML' for installation instructions.",
                extra_section,
                package_name,
            )

            if not skip_missing and error:
                raise exc
def create_validation_iterator(
    train: LAMLDataset,
    valid: Optional[LAMLDataset] = None,
    n_folds: Optional[int] = None,
    cv_iter: Optional[Callable] = None,
) -> TrainValidIterator:
    """Creates train-validation iterator.

    Dispatch rules:
        - ``PandasDataset`` / ``NumpyDataset`` / ``CSRSparseDataset`` (exact type
          match, not ``isinstance``) are routed to
          :func:`~lightautoml.validation.np_iterators.get_numpy_iterator`;
        - any other dataset type with ``valid`` given gets a holdout-iterator;
        - otherwise a dummy iterator is returned.

    Args:
        train: Dataset to train.
        valid: Optional dataset for validate.
        n_folds: Maximum number of folds to iterate. If ``None`` - iterate through all folds.
        cv_iter: Takes dataset as input and return an iterator of indexes of train/valid for train dataset.

    Returns:
        New iterator.

    """
    common_np_types = (PandasDataset, NumpyDataset, CSRSparseDataset)

    # Exact type membership check, as in the original list-based test;
    # subclasses fall through to the generic branches below.
    if type(train) in common_np_types:
        return get_numpy_iterator(cast(NpDataset, train), cast(NpDataset, valid), n_folds, cv_iter)

    if valid is not None:
        return HoldoutIterator(train, valid)

    return DummyIterator(train)
class Timer:
    """Simple wall-clock timer usable as a context manager.

    Attributes set here:
        start: timestamp captured on ``__enter__`` (0 until entered),
        stop: timestamp captured on ``__exit__`` (0 while still running).
    """

    @staticmethod
    def _zero():
        # Stand-in clock used when the timer is disabled: time never advances,
        # so every measurement comes out as 0.
        return 0

    def __init__(self, clock=time.time, enabled=True):
        # clock: zero-argument callable returning the current time in seconds.
        # enabled: when False, substitute the constant-zero clock (no-op timer).
        self.start = 0
        self.stop = 0
        self._time = clock if enabled else Timer._zero
        self._tick = 0

    def __enter__(self):
        # Start timing and reset the tick reference point.
        self.start = self._tick = self._time()
        return self

    def __exit__(self, *args):
        # Stop timing; afterwards `tick` returns -1 and `duration` is frozen.
        self.stop = self._tick = self._time()

    @property
    def tick(self):
        """Seconds elapsed since the previous tick (or since start); -1 once stopped."""
        if self.stop > 0:
            return -1
        now = self._time()
        tick = now - self._tick
        self._tick = now
        return tick

    @property
    def duration(self):
        """Get duration in seconds (total if stopped, elapsed-so-far otherwise)."""
        if self.stop > 0:
            return self.stop - self.start
        return self._time() - self.start
def _set_version(py_version: Optional[int] = None) -> None:
    """Rewrite the ``python = "..."`` dependency line of pyproject.toml in place.

    Args:
        py_version: Minor version of python 3 to pin (a key of ``PYTHON_DEPS``),
            or ``None`` to restore the permissive ``ALL_PYTHON_DEPS`` range.
    """
    # fileinput with inplace=1 redirects stdout into the file being edited,
    # so every line must be written back - modified or not.
    for line in fileinput.input(PYPROJECT_TOML.name, inplace=1):
        if re.search(PYTHON_DEPS_PATTERN, line):
            if py_version is None:
                version = ALL_PYTHON_DEPS
            else:
                version = PYTHON_DEPS[py_version]
            line = 'python = "{}"\n'.format(version)

        sys.stdout.write(line)
lightautoml/addons/interpretation/*:D100,D101,D102,D200,D205,D212,D415 18 | lightautoml/addons/utilization/*:D102 19 | lightautoml/report/report_deco.py:D101,D102,D103,D205,D212,D412,D415 20 | lightautoml/text/sentence_pooling.py:D102 21 | lightautoml/utils/*:D101,D102,D103 22 | lightautoml/addons/hypex/tests/*:D103,D100 23 | lightautoml/addons/hypex/ABTesting/ab_tester.py:D100,D101,D102,D103 24 | lightautoml/addons/hypex/utils/tutorial_data_creation.py:D100 25 | docs/*:D100,D103 26 | examples/*:D100,D103 27 | tests/*:D100,D101,D102,D103 28 | check_docs.py:D100 29 | exclude = 30 | .git 31 | __pycache__ 32 | setup.py 33 | build 34 | dist 35 | releases 36 | .venv 37 | .tox 38 | .mypy_cache 39 | .pytest_cache 40 | .vscode 41 | .github 42 | 43 | 44 | [rstcheck] 45 | ignore_directives=one,two,three 46 | ignore_roles=src,RFC 47 | ignore_messages=(Duplicate implicit target name|Unknown directive type "autoclass".|No directive entry for "autoclass" in module "docutils.parsers.rst.languages.en".|Unknown directive type "automodule".|Unknown directive type "autofunction".|No directive entry for "autofunction" in module "docutils.parsers.rst.languages.en".|No directive entry for "automodule" in module "docutils.parsers.rst.languages.en".) 
def load_and_test_automl(filename, task, score, pred, data, target_name):
    """Reload a joblib-dumped automl and check it reproduces prior results.

    Args:
        filename: Path to the joblib dump of a fitted automl instance.
        task: Task object; ``task.name`` selects the metric
            ("binary" -> ROC-AUC, "multiclass" -> log-loss, "reg" -> MSE).
        score: Metric value obtained before serialization.
        pred: Predictions obtained before serialization.
        data: Test dataframe to predict on.
        target_name: Name of the target column in ``data``.

    Raises:
        ValueError: If ``task.name`` is not a supported task type.
        AssertionError: If the reloaded model's score or predictions diverge.
    """
    automl = load(filename)

    test_pred_joblib = automl.predict(data)

    if task.name == "binary":
        score_new = roc_auc_score(data[target_name].values, test_pred_joblib.data[:, 0])
    elif task.name == "multiclass":
        score_new = log_loss(data[target_name].map(automl.reader.targets_mapping), test_pred_joblib.data)
    elif task.name == "reg":
        score_new = mean_squared_error(data[target_name].values, test_pred_joblib.data[:, 0])
    else:
        # Previously an unknown task fell through and crashed with a NameError
        # on the unbound `score_new`; fail with an explicit message instead.
        raise ValueError("Unsupported task type: {!r}".format(task.name))

    # Score must match up to rounding; predictions must match elementwise.
    np.testing.assert_almost_equal(score, score_new, decimal=3)
    np.testing.assert_allclose(pred.data[:, 0], test_pred_joblib.data[:, 0])
roles.items(): 29 | if (key == "target") or isinstance(key, TargetRole): 30 | return value 31 | -------------------------------------------------------------------------------- /tests/integration/test_demo10.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | 6 | from sklearn.metrics import log_loss 7 | 8 | from lightautoml.automl.base import AutoML 9 | from lightautoml.automl.blend import WeightedBlender 10 | from lightautoml.ml_algo.boost_cb import BoostCB 11 | from lightautoml.ml_algo.linear_sklearn import LinearLBFGS 12 | from lightautoml.ml_algo.tuning.optuna import OptunaTuner 13 | from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline 14 | from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures 15 | from lightautoml.pipelines.features.linear_pipeline import LinearFeatures 16 | from lightautoml.pipelines.ml.base import MLPipeline 17 | from lightautoml.pipelines.selection.importance_based import ( 18 | ImportanceCutoffSelector, 19 | ModelBasedImportanceEstimator, 20 | ) 21 | from lightautoml.reader.base import PandasToPandasReader 22 | from lightautoml.utils.timer import PipelineTimer 23 | 24 | # demo of timer, blender and multiclass 25 | 26 | np.random.seed(42) 27 | 28 | 29 | def test_some_pipeline(sampled_app_train_test, multiclass_task): 30 | 31 | train, test = sampled_app_train_test 32 | 33 | timer = PipelineTimer(600, mode=2) 34 | 35 | timer_gbm = timer.get_task_timer("gbm") 36 | feat_sel_0 = LGBSimpleFeatures() 37 | mod_sel_0 = BoostCB(timer=timer_gbm) 38 | imp_sel_0 = ModelBasedImportanceEstimator() 39 | selector_0 = ImportanceCutoffSelector( 40 | feat_sel_0, 41 | mod_sel_0, 42 | imp_sel_0, 43 | cutoff=0, 44 | ) 45 | 46 | feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0) 47 | timer_gbm_0 = timer.get_task_timer("gbm") 48 | timer_gbm_1 = timer.get_task_timer("gbm") 49 | 50 | gbm_0 = 
def test_tabularnlp(avito1k_train_test, avito1k_roles, regression_task):
    """End-to-end check of TabularNLPAutoML on a regression task.

    Fits the NLP preset on the avito1k sample (fixtures presumably defined
    in conftest - TODO confirm) and asserts both out-of-fold and holdout
    MSE stay below 0.2.
    """
    train, test = avito1k_train_test

    roles = avito1k_roles

    task = regression_task

    automl = TabularNLPAutoML(task=task, timeout=600)
    oof_pred = automl.fit_predict(train, roles=roles)
    test_pred = automl.predict(test)
    # Rows with no out-of-fold prediction (all-NaN) are excluded from scoring.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)
    target = roles["target"]

    oof_score = mean_squared_error(train[target].values[not_nan], oof_pred.data[not_nan][:, 0])

    assert oof_score < 0.2

    test_score = mean_squared_error(test[target].values, test_pred.data[:, 0])
    assert test_score < 0.2

    # Clean up model artifacts the NLP preset writes to disk.
    shutil.rmtree("./models", ignore_errors=True)
def test_autots(ai92_value_77_train_test):
    """End-to-end check of the AutoTS addon on a univariate series.

    Trains on the ai92 series and asserts the forecast MAE over the holdout
    horizon stays under the (temporarily loose) threshold.
    """

    train, test, horizon = ai92_value_77_train_test
    roles = {"target": "value", "datetime": "date"}

    # One sequence config: predict `horizon` next values from a history
    # window of at least 7 points.
    seq_params = {
        "seq0": {
            "case": "next_values",
            "params": {"n_target": horizon, "history": np.maximum(7, horizon), "step": 1, "test_last": True},
        },
    }

    # True (then set default values) / False; int, list or np.array
    # default: lag_features=30, diff_features=7
    transformers_params = {
        "lag_features": [0, 1, 2, 3, 5, 10],
        "lag_time_features": [0, 1, 2],
        "diff_features": [0, 1, 3, 4],
    }

    task = Task("multi:reg", greater_is_better=False, metric="mae", loss="mae")

    reader_params = {
        "seq_params": seq_params,
        "transformers_params": transformers_params,
    }
    automl = AutoTS(
        task,
        reader_params=reader_params,
        time_series_trend_params={
            "trend": False,  # disable trend modeling for this series
        },
    )
    automl.fit_predict(train, roles, verbose=4)
    forecast, _ = automl.predict(train)

    test_score = mean_absolute_error(test[roles["target"]].values, forecast)
    assert test_score < 22e4  # TODO: 2e5
def test_tabularautoml_2lvl(sampled_app_train_test, binary_task):
    """Check the TabularAutoML preset with two levels and nested CV.

    Builds a 2-level stack (linear_l2 + lgb on both levels, with skip
    connections) and asserts OOF / holdout ROC-AUC thresholds.
    """

    train, test = sampled_app_train_test

    roles = {
        "target": "TARGET",
        # report_dt is the base date; no seasonal/base features generated from it.
        DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt",
    }

    task = binary_task

    automl = TabularAutoML(
        task=task,
        timeout=600,
        general_params={
            "use_algos": [
                [
                    "linear_l2",
                    "lgb",
                ],
                ["linear_l2", "lgb"],
            ],
            "nested_cv": True,
            "skip_conn": True,
        },
        nested_cv_params={"cv": 5, "n_folds": None},
        debug=True,
    )

    oof_pred = automl.fit_predict(train, roles=roles, verbose=5)
    test_pred = automl.predict(test)

    # Rows with no out-of-fold prediction (all-NaN) are excluded from scoring.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    oof_score = roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0])
    assert oof_score > 0.75

    test_score = roc_auc_score(test[roles["target"]].values, test_pred.data[:, 0])
    assert test_score > 0.7
def test_classic_tabularautoml(sampled_app_train_test, binary_task):
    """Smoke-test the default TabularAutoML preset on the binary sample.

    Fits with default settings and asserts both out-of-fold and holdout
    ROC-AUC exceed 0.7.
    """

    train, test = sampled_app_train_test

    roles = {
        "target": "TARGET",
        # report_dt is the base date; no seasonal/base features generated from it.
        DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt",
    }

    task = binary_task

    automl = TabularAutoML(
        task=task,
        timeout=3600,
        debug=True,
    )
    oof_pred = automl.fit_predict(train, roles=roles, verbose=5)
    test_pred = automl.predict(test)

    # Rows with no out-of-fold prediction (all-NaN) are excluded from scoring.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    oof_score = roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0])
    assert oof_score > 0.7

    test_score = roc_auc_score(test[roles["target"]].values, test_pred.data[:, 0])
    assert test_score > 0.7
def test_lgbm_linear_pipeline(sampled_app_train_test, multiclass_task):
    """Hand-assembled AutoML: LGBM + linear pipelines, weighted blend, multiclass.

    Demonstrates the timer / blender machinery: an importance-based selector,
    a tuned + untuned LGBM pipeline and a linear pipeline on one level,
    blended and scored with log-loss.
    """

    # demo of timer, blender and multiclass
    np.random.seed(42)
    train, test = sampled_app_train_test
    timer = PipelineTimer(600, mode=2)

    # Feature selector: cutoff on model-based importances from a plain LGBM.
    timer_gbm = timer.get_task_timer("gbm")
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(
        feat_sel_0,
        mod_sel_0,
        imp_sel_0,
        cutoff=0,
    )

    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, output_categories=True, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer("gbm")
    timer_gbm_1 = timer.get_task_timer("gbm")

    gbm_0 = BoostLGBM(timer=timer_gbm_0)
    gbm_1 = BoostLGBM(timer=timer_gbm_1)

    # First LGBM is Optuna-tuned, second runs with defaults.
    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    gbm_lvl0 = MLPipeline(
        [(gbm_0, tuner_0), gbm_1],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None,
    )

    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto")

    timer_reg = timer.get_task_timer("reg")
    reg_0 = LinearLBFGS(timer=timer_reg)

    reg_lvl0 = MLPipeline([reg_0], pre_selection=None, features_pipeline=feats_reg_0, post_selection=None)

    # Permissive reader settings: keep all columns regardless of NaN/constant rate.
    reader = PandasToPandasReader(
        multiclass_task,
        samples=None,
        max_nan_rate=1,
        max_constant_rate=1,
        advanced_roles=True,
        drop_score_co=-1,
        n_jobs=1,
    )

    blender = WeightedBlender()

    automl = AutoML(
        reader=reader,
        levels=[[gbm_lvl0, reg_lvl0]],
        timer=timer,
        blender=blender,
        debug=True,
        skip_conn=False,
    )
    oof_pred = automl.fit_predict(train, roles={"target": "TARGET"}, verbose=5)
    test_pred = automl.predict(test)

    # Rows with no out-of-fold prediction (all-NaN) are excluded from scoring.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    oof_score = log_loss(train["TARGET"].values[not_nan], oof_pred.data[not_nan, :])
    assert oof_score < 1

    test_score = log_loss(test["TARGET"].values, test_pred.data)
    assert test_score < 1
def get_target_name(roles):
    """Extract the target column name from a roles mapping.

    Args:
        roles: Mapping from role (the string ``"target"`` or a role instance)
            to column name.

    Returns:
        The column mapped to the target role, or ``None`` when absent.
    """
    return next(
        (column for role, column in roles.items() if role == "target" or isinstance(role, TargetRole)),
        None,
    )
class TestTabularAutoML:
    """Default TabularAutoML preset checked on OOF / holdout ROC-AUC."""

    def test_fit_predict(self, sampled_app_train_test, sampled_app_roles, binary_task):
        """Fit the preset, score both splits, then verify pickling round-trip."""
        # load and prepare data
        train, test = sampled_app_train_test

        # run automl
        automl = TabularAutoML(task=binary_task)
        oof_predictions = automl.fit_predict(train, roles=sampled_app_roles, verbose=10)
        ho_predictions = automl.predict(test)

        # calculate scores
        target_name = get_target_name(sampled_app_roles)
        oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0])
        ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0])

        # checks
        assert oof_score > 0.73
        assert ho_score > 0.72

        # Serialized model must reproduce the holdout score after reload.
        check_pickling(automl, ho_score, binary_task, test, target_name)
calculate scores 27 | target_name = get_target_name(sampled_app_roles) 28 | oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0]) 29 | ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0]) 30 | 31 | # checks 32 | assert oof_score > 0.58 33 | assert ho_score > 0.58 34 | 35 | check_pickling(automl, ho_score, binary_task, test, target_name) 36 | -------------------------------------------------------------------------------- /tests/unit/test_automl/test_presets/test_tabularautoml_xgb.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import roc_auc_score 2 | 3 | from lightautoml.automl.presets.tabular_presets import TabularAutoML 4 | from tests.unit.test_automl.test_presets.presets_utils import check_pickling 5 | from tests.unit.test_automl.test_presets.presets_utils import get_target_name 6 | 7 | 8 | class TestTabularAutoMLXGB: 9 | def test_fit_predict(self, sampled_app_train_test, sampled_app_roles, binary_task): 10 | # load and prepare data 11 | train, test = sampled_app_train_test 12 | 13 | # run automl 14 | automl = TabularAutoML(task=binary_task, general_params={"use_algos": [["xgb"]]}) 15 | oof_predictions = automl.fit_predict(train, roles=sampled_app_roles, verbose=10) 16 | ho_predictions = automl.predict(test) 17 | 18 | # calculate scores 19 | target_name = get_target_name(sampled_app_roles) 20 | oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0]) 21 | ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0]) 22 | 23 | # checks 24 | assert oof_score > 0.69 25 | assert ho_score > 0.69 26 | 27 | check_pickling(automl, ho_score, binary_task, test, target_name) 28 | -------------------------------------------------------------------------------- /tests/unit/test_automl/test_presets/test_tabularnlpautoml.py: -------------------------------------------------------------------------------- 1 | import numpy as 
np 2 | 3 | from sklearn.metrics import mean_squared_error 4 | 5 | from lightautoml.automl.presets.text_presets import TabularNLPAutoML 6 | from tests.unit.test_automl.test_presets.presets_utils import check_pickling 7 | from tests.unit.test_automl.test_presets.presets_utils import get_target_name 8 | 9 | 10 | class TestTabularNLPAutoML: 11 | def test_fit_predict(self, avito1k_train_test, avito1k_roles, regression_task): 12 | # load and prepare data 13 | train, test = avito1k_train_test 14 | 15 | # run automl 16 | automl = TabularNLPAutoML(task=regression_task, timeout=600) 17 | oof_pred = automl.fit_predict(train, roles=avito1k_roles, verbose=10) 18 | test_pred = automl.predict(test) 19 | not_nan = np.any(~np.isnan(oof_pred.data), axis=1) 20 | 21 | target_name = get_target_name(avito1k_roles) 22 | oof_score = mean_squared_error(train[target_name].values[not_nan], oof_pred.data[not_nan][:, 0]) 23 | ho_score = mean_squared_error(test[target_name].values, test_pred.data[:, 0]) 24 | 25 | # checks 26 | assert oof_score < 0.7 27 | assert ho_score < 0.7 28 | 29 | check_pickling(automl, ho_score, regression_task, test, target_name) 30 | -------------------------------------------------------------------------------- /tests/unit/test_automl/test_presets/test_tabularutilizedautoml.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import roc_auc_score 2 | 3 | from lightautoml.automl.presets.tabular_presets import TabularAutoML 4 | from tests.unit.test_automl.test_presets.presets_utils import check_pickling 5 | from tests.unit.test_automl.test_presets.presets_utils import get_target_name 6 | 7 | 8 | class TabularUtilizedAutoML: 9 | def test_fit_predict(self, sampled_app_train_test, sampled_app_roles, binary_task): 10 | # load and prepare data 11 | train, test = sampled_app_train_test 12 | 13 | # run automl 14 | automl = TabularAutoML(task=binary_task) 15 | oof_predictions = automl.fit_predict(train, 
roles=sampled_app_roles, verbose=10) 16 | ho_predictions = automl.predict(test) 17 | 18 | # calculate scores 19 | target_name = get_target_name(sampled_app_roles) 20 | oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0]) 21 | ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0]) 22 | 23 | # checks 24 | assert oof_score > 0.73 25 | assert ho_score > 0.72 26 | 27 | check_pickling(automl, ho_score, binary_task, test, target_name) 28 | -------------------------------------------------------------------------------- /tests/unit/test_automl/test_presets/test_uplift.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import roc_auc_score 2 | 3 | # from tests.unit.test_automl.test_presets.presets_utils import check_pickling 4 | 5 | import copy 6 | from lightautoml.addons.uplift.base import AutoUplift 7 | from lightautoml.addons.uplift.metrics import ( 8 | calculate_min_max_uplift_auc, 9 | calculate_uplift_auc, 10 | ) 11 | 12 | 13 | class TestAutoUpliftPreset: 14 | def test_fit_predict(self, uplift_data_train_test, sampled_app_roles, binary_task): 15 | # load and prepare data 16 | train, test, test_target, test_treatment = uplift_data_train_test 17 | 18 | # run automl 19 | autouplift = AutoUplift( 20 | binary_task, 21 | metric="adj_qini", 22 | has_report=True, 23 | test_size=0.2, 24 | timeout=200, 25 | cpu_limit=1, 26 | # gpu_ids=["0"] 27 | # timeout_metalearner=5 28 | ) 29 | 30 | uplift_data_roles = copy.deepcopy(sampled_app_roles) 31 | uplift_data_roles["treatment"] = "CODE_GENDER" 32 | 33 | autouplift.fit(train, uplift_data_roles, verbose=1) 34 | 35 | best_metalearner = autouplift.create_best_metalearner( 36 | update_metalearner_params={"timeout": None}, update_baselearner_params={"timeout": 30} 37 | ) 38 | best_metalearner.fit(train, uplift_data_roles) 39 | _ = best_metalearner.predict(test) 40 | 41 | uplift_pred, treatment_pred, control_pred = 
best_metalearner.predict(test) 42 | uplift_pred = uplift_pred.ravel() 43 | 44 | # calculate scores 45 | roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1]) 46 | roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0]) 47 | 48 | uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False) 49 | uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True) 50 | auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment) 51 | 52 | print("--- Check scores ---") 53 | print('OOF scores "ROC_AUC":') 54 | print("\tTreatment = {:.5f}".format(roc_auc_treatment)) 55 | print("\tControl = {:.5f}".format(roc_auc_control)) 56 | print('Uplift score of test group (default="adj_qini"):') 57 | print("\tBaseline = {:.5f}".format(auc_base)) 58 | print("\tAlgo (Normed) = {:.5f} ({:.5f})".format(uplift_auc_algo, uplift_auc_algo_normed)) 59 | print("\tPerfect = {:.5f}".format(auc_perfect)) 60 | 61 | # Uplift score of test group (default="adj_qini"): 62 | # Baseline = 0.01340 63 | # Algo (Normed) = 0.03012 (0.20648) 64 | # Perfect = 0.09438 65 | 66 | # checks 67 | assert roc_auc_treatment > 0.68 # 0.69535 68 | assert roc_auc_control > 0.71 # 0.73022 69 | 70 | # check_pickling(autouplift, ho_score, binary_task, test, target_name) 71 | -------------------------------------------------------------------------------- /tests/unit/test_automl/test_presets/test_whiteboxpreset.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import roc_auc_score 2 | 3 | from lightautoml.automl.presets.whitebox_presets import WhiteBoxPreset 4 | from tests.unit.test_automl.test_presets.presets_utils import check_pickling 5 | from tests.unit.test_automl.test_presets.presets_utils import get_target_name 6 | 7 | 8 | class TestWhiteBoxPreset: 9 | def test_fit_predict(self, 
jobs_train_test, jobs_roles, binary_task): 10 | # load and prepare data 11 | train, test = jobs_train_test 12 | 13 | # run automl 14 | automl = WhiteBoxPreset(binary_task) 15 | oof_predictions = automl.fit_predict(train.reset_index(drop=True), roles=jobs_roles, verbose=10) 16 | ho_predictions = automl.predict(test) 17 | 18 | # calculate scores 19 | target_name = get_target_name(jobs_roles) 20 | oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0]) 21 | ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0]) 22 | 23 | # checks 24 | assert oof_score > 0.75 25 | assert ho_score > 0.75 26 | 27 | check_pickling(automl, ho_score, binary_task, test, target_name) 28 | -------------------------------------------------------------------------------- /tests/unit/test_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_dataset/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_image/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_ml_algo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_ml_algo/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_ml_algo/test_optimization/optuna/test_optuna_tuner.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import pytest 
4 | 5 | from lightautoml.ml_algo.boost_lgbm import BoostLGBM 6 | from lightautoml.ml_algo.tuning.base import Normal 7 | from lightautoml.ml_algo.tuning.base import Uniform 8 | from lightautoml.ml_algo.tuning.optuna import OptunaTuner 9 | 10 | 11 | # from lightautoml.dataset.np_pd_dataset import PandasDataset 12 | # from lightautoml.dataset.utils import roles_parser 13 | # from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures 14 | # from lightautoml.validation.np_iterators import FoldsIterator 15 | 16 | 17 | # @pytest.mark.parametrize( 18 | # "sampled_app_train_test", 19 | # [ 20 | # (1000), 21 | # ], 22 | # indirect=["sampled_app_train_test"], 23 | # ) 24 | # def test_params_values_ranges( 25 | # sampled_app_train_test, 26 | # sampled_app_roles, 27 | # binary_task, 28 | # ): 29 | 30 | # train, _ = sampled_app_train_test 31 | 32 | # features_pipeline = LGBSimpleFeatures() 33 | # iterator = FoldsIterator( 34 | # PandasDataset( 35 | # data=train, 36 | # roles=roles_parser(sampled_app_roles), 37 | # task=binary_task, 38 | # ) 39 | # ) 40 | 41 | # iterator = iterator.apply_feature_pipeline(features_pipeline) 42 | 43 | # model = BoostLGBM( 44 | # default_params={"num_trees": 1, "random_state": 42}, 45 | # freeze_defaults=True, 46 | # optimization_search_space={ 47 | # "feature_fraction": SearchSpace(Distribution.UNIFORM, low=0.5, high=1.0), 48 | # "min_sum_hessian_in_leaf": SearchSpace(Distribution.CHOICE, choices=[0.5, 0.8]), 49 | # }, 50 | # ) 51 | 52 | # params_tuner = OptunaTuner(n_trials=10, timeout=300) 53 | # params_tuner.fit( 54 | # ml_algo=model, 55 | # train_valid_iterator=iterator, 56 | # ) 57 | 58 | # # check that the hyperparameters values are in the defined search space 59 | # for trial in params_tuner.study.get_trials(): 60 | # assert (trial.params["feature_fraction"] >= 0) and (trial.params["feature_fraction"] <= 1) 61 | # assert trial.params["min_sum_hessian_in_leaf"] in [0.5, 0.8] 62 | 63 | # # check time, n_trials 64 | 65 | # # 
check best params 66 | # assert (params_tuner.best_params["feature_fraction"] == 0.7993292420985183) and ( 67 | # params_tuner.best_params["min_sum_hessian_in_leaf"] == 0.5 68 | # ) 69 | 70 | 71 | def test_invalid_distributions(): 72 | iterator_mock = mock.MagicMock() 73 | 74 | model = BoostLGBM( 75 | default_params={"num_trees": 1, "random_state": 42}, 76 | freeze_defaults=True, 77 | optimization_search_space={ 78 | "feature_fraction": Uniform(low=0.5, high=1.0), 79 | "min_sum_hessian_in_leaf": Normal(low=1, high=2), # distribution is not supported by Optuna 80 | }, 81 | ) 82 | 83 | params_tuner = OptunaTuner(n_trials=10, timeout=300) 84 | 85 | with pytest.raises(Exception): 86 | params_tuner.fit( 87 | ml_algo=model, 88 | train_valid_iterator=iterator_mock, 89 | ) 90 | -------------------------------------------------------------------------------- /tests/unit/test_pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_pipelines/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_reader/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_report/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_report/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_tasks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_tasks/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_text/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_transformers/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_transformers/test_numeric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lightautoml.transformers.numeric import FillnaMean 4 | from lightautoml.transformers.numeric import FillnaMedian 5 | from lightautoml.transformers.numeric import QuantileTransformer 6 | 7 | 8 | def test_fillnamean(lamldataset_with_na): 9 | transformer = FillnaMean() 10 | output = transformer.fit_transform(lamldataset_with_na) 11 | 12 | assert output.data[:, 0].mean() == 4 13 | assert output.data[:, 1].mean() == 5 14 | assert output.data[:, 2].mean() == 0 15 | 16 | 17 | def test_fillnamedian(lamldataset_with_na): 18 | transformer = FillnaMedian() 19 | output = transformer.fit_transform(lamldataset_with_na) 20 | 21 | assert output.data[:, 0].mean() == 4 22 | assert output.data[:, 1].mean() == 5 23 | assert output.data[:, 2].mean() == 0 24 | 25 | 26 | def test_quantiletransformer(lamldataset_30_2): 27 | transformer = QuantileTransformer(noise=None) 28 | output = transformer.fit_transform(lamldataset_30_2) 29 | 30 | # 
raise(Exception(output.data)) 31 | np.testing.assert_allclose( 32 | output.data, 33 | np.array( 34 | [ 35 | [-5.19933758, -5.19933758], 36 | [-1.47640435, -1.48183072], 37 | [-1.42177828, -1.44872465], 38 | [-1.24067307, -1.25262296], 39 | [-1.02813514, -1.06089913], 40 | [-0.87314381, -0.95310275], 41 | [-0.86592145, -0.86396215], 42 | [-0.62097828, -0.60156557], 43 | [-0.50478792, -0.5339135], 44 | [-0.50373715, -0.48136567], 45 | [-0.39911771, -0.32828215], 46 | [-0.36893762, -0.30284499], 47 | [-0.18779519, -0.24328491], 48 | [-0.12682175, -0.13728361], 49 | [-0.01319139, 0.02800416], 50 | [0.01861645, 0.04895583], 51 | [0.13783602, 0.13759678], 52 | [0.1464553, 0.23178871], 53 | [0.2576737, 0.38408782], 54 | [0.35208669, 0.41496716], 55 | [0.59203696, 0.44743911], 56 | [0.6766203, 0.46734882], 57 | [0.75052545, 0.50159806], 58 | [0.77324891, 0.80971082], 59 | [0.95569552, 0.86776588], 60 | [1.18959458, 1.09445074], 61 | [1.22743552, 1.26058256], 62 | [1.43696861, 1.27169708], 63 | [1.55529186, 1.77127928], 64 | [5.19933758, 5.19933758], 65 | ] 66 | ), 67 | atol=1e-5, 68 | rtol=1e-5, 69 | ) 70 | -------------------------------------------------------------------------------- /tests/unit/test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_utils/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_utils/test_logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | 6 | import pytest 7 | 8 | from lightautoml.automl.presets.tabular_presets import TabularAutoML 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "sampled_app_train_test, verbose, log_file", 13 | [ 14 | (1000, 0, "log_file.log"), 15 | # (10, 'log_file.log'), 16 | ], 17 | 
17 |     indirect=["sampled_app_train_test"],
18 | )
19 | def test_logging(
20 |     capsys,
21 |     tmp_path,
22 |     sampled_app_train_test,
23 |     sampled_app_roles,
24 |     binary_task,
25 |     verbose,
26 |     log_file,
27 | ):
28 |     train, _ = sampled_app_train_test
29 |
30 |     if log_file:
31 |         log_file = os.path.join(tmp_path, "log_file.log")
32 |
33 |     automl = TabularAutoML(
34 |         task=binary_task,
35 |         tuning_params={"max_tuning_iter": 3, "max_tuning_time": 30},
36 |         lgb_params={"default_params": {"num_trees": 5}},
37 |     )
38 |
39 |     automl.fit_predict(
40 |         train,
41 |         roles=sampled_app_roles,
42 |         verbose=verbose,
43 |         log_file=log_file,
44 |     )
45 |
46 |     sys_out, sys_err = capsys.readouterr()
47 |
48 |     if log_file:
49 |         assert os.path.exists(log_file)
50 |
51 |     if verbose == 0:
52 |         assert sys_out == ""
53 |         assert sys_err == ""
54 |
55 |     # If log_file contains exact same that in stdout at max verbose value
56 |     # if (verbose >= 4) and (log_file is not None):
57 |     #     sys_out_lines = sys_out.split('\n')
58 |     #     with open(log_file) as f:
59 |     #         for line_file, line_stdout in zip(f, sys_out_lines):
60 |     #             # remove message prefixes and compare
61 |     #             assert re.split(r'^(?:[^\t\r\n]+\t){5}([01])(?:\t|$)', line_file) == re.split(r'\s(.*)', line_stdout)
62 |
63 |
64 | # def test_logging_verbose_switching():
65 | # def test_logging_custom_pipeline():
66 |
-------------------------------------------------------------------------------- /tests/unit/test_validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/b53830f84dc1ceec0112c7905be950304fafaa9f/tests/unit/test_validation/__init__.py
-------------------------------------------------------------------------------- /tox.ini: --------------------------------------------------------------------------------
[tox]
min_version = 3.28.0
isolated_build = True
envlist =
    py{38, 39, 310, 311, 312},
    lint,
    docs,
    ; typing,
    ; build,
    codespell

; [tox:.package]
# note tox will use the same python version as under what tox is installed to package
# so unless this is python 3 you can require a given python version for the packaging
# environment via the basepython key
; basepython = python3
; NOTE(review): basepython is commented out above because its section header
; "[tox:.package]" is commented — left active, the key leaked into the
; preceding [tox] section where it has no meaning.

[gh-actions]
python =
    3.8: py38
    3.9: py39
    3.10: py310
    3.11: py311
    3.12: py312

[gh-actions:env]
PLATFORM =
    ubuntu-latest: linux
    macos-latest: macos
    windows-latest: windows

[testenv]
skip_install = true
allowlist_externals = make
deps =
    -e .[all]
    pytest >= 6.2.5
    jupyter
commands = pytest {posargs} -v --basetemp="{envtmpdir}" --log-level=DEBUG

[testenv:lint]
skip_install = true
deps =
    pre-commit == 2.15.0
commands =
    pre-commit install
    pre-commit run --all-files

[testenv:docs]
requires = python >= 3.8
changedir = docs
deps =
    sphinx == 5.3.0 # extras = ["autdoc", "autosummary", "intersphinx", "napoleon", "viewcode"]
    sphinx-autodoc-typehints >=1.19.5
    sphinx-rtd-theme >=1.1.1
    nbsphinx == 0.8.10
    nbsphinx-link == 1.3.0
    doc8 == 0.10.1
    rstcheck == 3.3.1
    pandoc == 2.0.1
    ipython >=3.8
commands =
    make clean html
    python ../check_docs.py

; [testenv:typing]
; description = run type checks
; deps =
;     mypy >= 0.991
; commands =
;     mypy {posargs:lightautoml tests}

; [testenv:build]
; skip_install = true
; deps =
;     poetry >= 1.1.7
; commands =
;     poetry run python scripts/poetry_fix.py -f
;     poetry build

[testenv:codespell]
skip_install = true
deps =
    codespell
commands =
    codespell --skip="docs,_build,imgs"

# example:
# tox -e exp -- --dataset_project=Datasets_with_metadata --tags=binary openml
# tox -e exp -- --dataset_project=Datasets_with_metadata --dataset=CIFAR_10_openml --queue=gpu_queue
# tox -e exp -- --dataset_project=Datasets_with_metadata --tags=multiclass --queue=gpu_queue --n_datasets=5 --name=mlp --min_num_obs=100000
# Notion: args [--tags=binary openml] means tag is binary OR tag is openml
[testenv:exp]
deps =
    clearml
    pandas
    numpy
commands =
    python scripts/experiments/run.py {posargs}

[testenv:exp_bonus]
deps =
    -e .[all]
    clearml
commands =
    python scripts/experiments/run.py {posargs}