├── .gitignore ├── HR_REPORT_1 ├── AUC_test_plot_full.png ├── Model_weights.png ├── autowoe_report.html ├── binned_stats_target.png ├── binned_stats_test.png ├── binned_stats_train.png ├── binned_test_posneg.png ├── binned_test_total.png ├── binned_train_posneg.png ├── binned_train_total.png ├── city_development_index_backlash_plot.png ├── city_development_index_roc_auc.png ├── city_development_index_woe.png ├── city_development_index_woe_bars.png ├── company_size_backlash_plot.png ├── company_size_roc_auc.png ├── company_size_woe.png ├── company_size_woe_bars.png ├── company_type_backlash_plot.png ├── company_type_roc_auc.png ├── company_type_woe.png ├── company_type_woe_bars.png ├── corr_heatmap.png ├── education_level_backlash_plot.png ├── education_level_roc_auc.png ├── education_level_woe.png ├── education_level_woe_bars.png ├── enrolled_university_backlash_plot.png ├── enrolled_university_roc_auc.png ├── enrolled_university_woe.png ├── enrolled_university_woe_bars.png ├── experience_backlash_plot.png ├── experience_roc_auc.png ├── experience_woe.png ├── experience_woe_bars.png ├── shap.js ├── test_enc_ginis.png └── train_enc_ginis.png ├── HR_REPORT_2 ├── AUC_test_plot_full.png ├── Model_weights.png ├── autowoe_report.html ├── binned_stats_target.png ├── binned_stats_test.png ├── binned_stats_train.png ├── binned_test_posneg.png ├── binned_test_total.png ├── binned_train_posneg.png ├── binned_train_total.png ├── city_backlash_plot.png ├── city_development_index_backlash_plot.png ├── city_development_index_roc_auc.png ├── city_development_index_woe.png ├── city_development_index_woe_bars.png ├── city_roc_auc.png ├── city_woe.png ├── city_woe_bars.png ├── company_size_backlash_plot.png ├── company_size_roc_auc.png ├── company_size_woe.png ├── company_size_woe_bars.png ├── company_type_backlash_plot.png ├── company_type_roc_auc.png ├── company_type_woe.png ├── company_type_woe_bars.png ├── corr_heatmap.png ├── education_level_backlash_plot.png ├── 
education_level_roc_auc.png ├── education_level_woe.png ├── education_level_woe_bars.png ├── enrolled_university_backlash_plot.png ├── enrolled_university_roc_auc.png ├── enrolled_university_woe.png ├── enrolled_university_woe_bars.png ├── experience_backlash_plot.png ├── experience_roc_auc.png ├── experience_woe.png ├── experience_woe_bars.png ├── shap.js ├── test_enc_ginis.png └── train_enc_ginis.png ├── LICENSE ├── LightAutoML demo (Blackbox).ipynb ├── LightAutoML demo (NLP).ipynb ├── LightAutoML demo (Whitebox).ipynb ├── NLP_REPORT ├── BankName_char_len_hist.png ├── BankName_tokens_len_hist.png ├── Message_char_len_hist.png ├── Message_tokens_len_hist.png ├── concat_char_len_hist.png ├── concat_tokens_len_hist.png ├── report_nlp.html ├── test_distribution_of_logits_1.png ├── test_pie_f1_metric_1.png ├── test_pr_curve_1.png ├── test_preds_distribution_by_bins_1.png ├── test_roc_curve_1.png ├── valid_distribution_of_logits.png ├── valid_pie_f1_metric.png ├── valid_pr_curve.png ├── valid_preds_distribution_by_bins.png └── valid_roc_curve.png ├── README.md ├── example_data ├── jobs_train.csv └── nlp_data.csv ├── imgs ├── tutorial_NLP_image_1.jpg ├── tutorial_NLP_image_2.jpg ├── tutorial_blackbox_pipeline.png ├── tutorial_blackbox_report_1.png ├── tutorial_blackbox_report_2.png ├── tutorial_blackbox_report_3.png ├── tutorial_whitebox_report_1.png ├── tutorial_whitebox_report_2.png ├── tutorial_whitebox_report_3.png └── tutorial_whitebox_report_4.png └── tabularAutoML_model_report ├── lama_interactive_report.html ├── test_distribution_of_logits_1.png ├── test_pie_f1_metric_1.png ├── test_pr_curve_1.png ├── test_preds_distribution_by_bins_1.png ├── test_roc_curve_1.png ├── valid_distribution_of_logits.png ├── valid_pie_f1_metric.png ├── valid_pr_curve.png ├── valid_preds_distribution_by_bins.png └── valid_roc_curve.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | 
__pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /HR_REPORT_1/AUC_test_plot_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/AUC_test_plot_full.png -------------------------------------------------------------------------------- /HR_REPORT_1/Model_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/Model_weights.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_stats_target.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_stats_target.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_stats_test.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_stats_test.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_stats_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_stats_train.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_test_posneg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_test_posneg.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_test_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_test_total.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_train_posneg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_train_posneg.png -------------------------------------------------------------------------------- /HR_REPORT_1/binned_train_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_train_total.png 
-------------------------------------------------------------------------------- /HR_REPORT_1/city_development_index_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_1/city_development_index_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_1/city_development_index_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_woe.png -------------------------------------------------------------------------------- /HR_REPORT_1/city_development_index_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_size_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_backlash_plot.png -------------------------------------------------------------------------------- 
/HR_REPORT_1/company_size_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_size_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_woe.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_size_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_type_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_type_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_type_woe.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_woe.png -------------------------------------------------------------------------------- /HR_REPORT_1/company_type_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_1/corr_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/corr_heatmap.png -------------------------------------------------------------------------------- /HR_REPORT_1/education_level_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_1/education_level_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_1/education_level_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_woe.png 
-------------------------------------------------------------------------------- /HR_REPORT_1/education_level_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_1/enrolled_university_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_1/enrolled_university_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_1/enrolled_university_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_woe.png -------------------------------------------------------------------------------- /HR_REPORT_1/enrolled_university_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_1/experience_backlash_plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_1/experience_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_1/experience_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_woe.png -------------------------------------------------------------------------------- /HR_REPORT_1/experience_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_1/test_enc_ginis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/test_enc_ginis.png -------------------------------------------------------------------------------- /HR_REPORT_1/train_enc_ginis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/train_enc_ginis.png 
-------------------------------------------------------------------------------- /HR_REPORT_2/AUC_test_plot_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/AUC_test_plot_full.png -------------------------------------------------------------------------------- /HR_REPORT_2/Model_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/Model_weights.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_stats_target.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_stats_target.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_stats_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_stats_test.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_stats_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_stats_train.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_test_posneg.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_test_posneg.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_test_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_test_total.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_train_posneg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_train_posneg.png -------------------------------------------------------------------------------- /HR_REPORT_2/binned_train_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_train_total.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_development_index_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_backlash_plot.png 
-------------------------------------------------------------------------------- /HR_REPORT_2/city_development_index_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_development_index_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_development_index_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/city_woe_bars.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_size_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_size_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_size_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_size_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_type_backlash_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_type_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_type_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/company_type_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/corr_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/corr_heatmap.png -------------------------------------------------------------------------------- /HR_REPORT_2/education_level_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_backlash_plot.png 
-------------------------------------------------------------------------------- /HR_REPORT_2/education_level_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/education_level_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/education_level_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/enrolled_university_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_2/enrolled_university_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/enrolled_university_woe.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/enrolled_university_woe_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/experience_backlash_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_backlash_plot.png -------------------------------------------------------------------------------- /HR_REPORT_2/experience_roc_auc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_roc_auc.png -------------------------------------------------------------------------------- /HR_REPORT_2/experience_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_woe.png -------------------------------------------------------------------------------- /HR_REPORT_2/experience_woe_bars.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_woe_bars.png -------------------------------------------------------------------------------- /HR_REPORT_2/test_enc_ginis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/test_enc_ginis.png -------------------------------------------------------------------------------- /HR_REPORT_2/train_enc_ginis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/train_enc_ginis.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LightAutoML demo (NLP).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AutoML на текстовых данных" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![NLP](./imgs/tutorial_NLP_image_1.jpg)\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Чуть больше про стратегии получения представлений текстов на основе представлений слов:\n", 22 | "\n", 23 | "![NLP2](./imgs/tutorial_NLP_image_2.jpg)\n", 24 | "\n", 25 | "Про методы случайных алгоритмов можно подробнее прочитать в [статье](https://arxiv.org/abs/1901.10444) \"No Training Required: Exploring Random Encoders for Sentence Classification\".\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# 
Импорты" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import pandas as pd\n", 42 | "import numpy as np\n", 43 | "import pickle\n", 44 | "\n", 45 | "from sklearn.metrics import roc_auc_score\n", 46 | "from sklearn.model_selection import train_test_split\n", 47 | "\n", 48 | "from lightautoml.automl.presets.text_presets import TabularNLPAutoML\n", 49 | "from lightautoml.tasks import Task\n", 50 | "from lightautoml.addons.interpretation import LimeTextExplainer\n", 51 | "from lightautoml.report import ReportDecoNLP\n", 52 | "\n", 53 | "# Выключим предупреждения от HuggingFace\n", 54 | "import transformers\n", 55 | "transformers.logging.set_verbosity(50)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Чтение данных" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "CPU times: user 183 ms, sys: 39.5 ms, total: 222 ms\n", 75 | "Wall time: 221 ms\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "%%time\n", 81 | "df = pd.read_csv(\"./example_data/nlp_data.csv\")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "(13842, 6)\n" 94 | ] 95 | }, 96 | { 97 | "data": { 98 | "text/html": [ 99 | "
\n", 100 | "\n", 113 | "\n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | "
BankNameMessageViewsNumIsGoodMessageRecognizedWER
11474Альфа-БанкЯ клиент банка с 2007 года, зарплатный клиент ...1422FalseЯ клиент банка с две тысячи седьмого года зарп...60.000000
3955Альфа-Банк07.04 в 20-15 по Ульяновскому времени я зашла ...2016FalseСедьмого апреля в двадцать пятнадцать По Ульян...31.818182
3081Банк ОткрытиеУжасный сервис. Заказал кредитную карту по акц...2232FalseУжасной Сервис заказал кредитную карту по акци...68.750000
12107Почта БанкДобрый вечер.  21.01.2020, я обратилась в отде...1139FalseДобрый вечер двадцать первого января две тысяч...55.555556
10494Русский СтандартБанк второй месяц подряд еженедельно названива...1609FalseВторой месяц подряд еженедельно и предлагает к...39.393939
\n", 173 | "
" 174 | ], 175 | "text/plain": [ 176 | " BankName Message \\\n", 177 | "11474 Альфа-Банк Я клиент банка с 2007 года, зарплатный клиент ... \n", 178 | "3955 Альфа-Банк 07.04 в 20-15 по Ульяновскому времени я зашла ... \n", 179 | "3081 Банк Открытие Ужасный сервис. Заказал кредитную карту по акц... \n", 180 | "12107 Почта Банк Добрый вечер.  21.01.2020, я обратилась в отде... \n", 181 | "10494 Русский Стандарт Банк второй месяц подряд еженедельно названива... \n", 182 | "\n", 183 | " ViewsNum IsGood MessageRecognized \\\n", 184 | "11474 1422 False Я клиент банка с две тысячи седьмого года зарп... \n", 185 | "3955 2016 False Седьмого апреля в двадцать пятнадцать По Ульян... \n", 186 | "3081 2232 False Ужасной Сервис заказал кредитную карту по акци... \n", 187 | "12107 1139 False Добрый вечер двадцать первого января две тысяч... \n", 188 | "10494 1609 False Второй месяц подряд еженедельно и предлагает к... \n", 189 | "\n", 190 | " WER \n", 191 | "11474 60.000000 \n", 192 | "3955 31.818182 \n", 193 | "3081 68.750000 \n", 194 | "12107 55.555556 \n", 195 | "10494 39.393939 " 196 | ] 197 | }, 198 | "execution_count": 3, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "print(df.shape)\n", 205 | "df.sample(5, random_state=0)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "# Разбиение на обучающую и контрольные выборки" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 4, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "train, test = train_test_split(df, test_size=3_000, random_state=42, stratify=df.IsGood)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "# Скачиваем эмбеддинги для русского языка" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 5, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": 
"stream", 239 | "text": [ 240 | "--2021-06-14 23:58:12-- https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar\n", 241 | "Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9\n", 242 | "Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.\n", 243 | "HTTP request sent, awaiting response... 200 OK\n", 244 | "Length: 53012480 (51M) [application/x-tar]\n", 245 | "Saving to: ‘navec_hudlit_v1_12B_500K_300d_100q.tar.3’\n", 246 | "\n", 247 | "navec_hudlit_v1_12B 100%[===================>] 50.56M 11.4MB/s in 4.5s \n", 248 | "\n", 249 | "2021-06-14 23:58:16 (11.3 MB/s) - ‘navec_hudlit_v1_12B_500K_300d_100q.tar.3’ saved [53012480/53012480]\n", 250 | "\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 6, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "from navec import Navec\n", 265 | "path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'\n", 266 | "navec = Navec.load(path)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "# Обучение AutoML или День Сурка" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "## День 1. 
Стандартные параметры, ЦПУ" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "scrolled": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "roles = {'target': 'IsGood',\n", 292 | " 'text': ['BankName', 'Message'],\n", 293 | " 'drop': ['MessageRecognized', 'WER']}\n", 294 | "\n", 295 | "task = Task('binary')\n", 296 | "\n", 297 | "automl = TabularNLPAutoML(task = task, \n", 298 | " timeout = 3600,\n", 299 | " gpu_ids = None,\n", 300 | " text_params = {'lang': 'ru'},\n", 301 | " verbose=2)\n", 302 | "\n", 303 | "oof_pred = automl.fit_predict(train, roles=roles) \n", 304 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 8, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "AUC OOF score: 0.8326628448807263\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "Feature concated__BankName__Message transformed\n", 334 | "AUC TEST score: 0.8397933778340239\n", 335 | "CPU times: user 7.46 s, sys: 1.74 s, total: 9.2 s\n", 336 | "Wall time: 11.5 s\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "%%time \n", 342 | "\n", 343 | "test_pred = automl.predict(test)\n", 344 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## День 2. 
Пользовательские представления слов, ЦПУ" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "scrolled": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "roles = {'target': 'IsGood',\n", 363 | " 'text': ['BankName', 'Message'],\n", 364 | " 'drop': ['MessageRecognized', 'WER']}\n", 365 | "\n", 366 | "task = Task('binary')\n", 367 | "\n", 368 | "automl = TabularNLPAutoML(task = task, \n", 369 | " timeout = 3600,\n", 370 | " gpu_ids = None,\n", 371 | " text_params = {'lang': 'ru'},\n", 372 | " autonlp_params={'model_name': 'wat', 'embedding_model': navec,\n", 373 | " 'transformer_params': {'model_params': {'embed_size': 300},\n", 374 | " 'weight_type': 'idf', 'use_svd': True}},\n", 375 | " verbose=2)\n", 376 | "\n", 377 | "oof_pred = automl.fit_predict(train, roles=roles) \n", 378 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 11, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "AUC OOF score: 0.8331853706309897\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 12, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "Feature concated__BankName__Message transformed\n", 408 | "AUC TEST score: 0.8433377172303697\n", 409 | "CPU times: user 11.7 s, sys: 1.82 s, total: 13.5 s\n", 410 | "Wall time: 15.4 s\n" 411 | ] 412 | } 413 | ], 414 | "source": [ 415 | "%%time \n", 416 | "\n", 417 | "test_pred = automl.predict(test)\n", 418 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 419 | ] 420 | }, 421 | { 422 | "cell_type": 
"markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "## День 3. Стандартные параметры, ГПУ" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "scrolled": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "roles = {'target': 'IsGood',\n", 437 | " 'text': ['BankName', 'Message'],\n", 438 | " 'drop': ['MessageRecognized', 'WER']}\n", 439 | "\n", 440 | "task = Task('binary')\n", 441 | "\n", 442 | "automl = TabularNLPAutoML(task = task, \n", 443 | " timeout = 3600,\n", 444 | " gpu_ids = '1',\n", 445 | " text_params = {'lang': 'ru'},\n", 446 | " nn_params = {'lang': 'ru'},\n", 447 | " verbose=2)\n", 448 | "\n", 449 | "oof_pred = automl.fit_predict(train, roles=roles) \n", 450 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 14, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "AUC OOF score: 0.895916820914043\n" 463 | ] 464 | } 465 | ], 466 | "source": [ 467 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 15, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "name": "stderr", 477 | "output_type": "stream", 478 | "text": [ 479 | "100%|██████████| 10/10 [00:16<00:00, 1.69s/it]\n" 480 | ] 481 | }, 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "Feature concated__BankName__Message transformed\n" 487 | ] 488 | }, 489 | { 490 | "name": "stderr", 491 | "output_type": "stream", 492 | "text": [ 493 | "test: 100%|██████████| 188/188 [00:30<00:00, 6.13it/s]\n", 494 | "test: 100%|██████████| 188/188 [00:30<00:00, 6.11it/s]\n", 495 | "test: 100%|██████████| 188/188 [00:30<00:00, 6.11it/s]\n" 496 | ] 497 | }, 498 | { 499 | "name": "stdout", 500 | "output_type": 
"stream", 501 | "text": [ 502 | "AUC TEST score: 0.9052540592405104\n", 503 | "CPU times: user 1min 32s, sys: 33.2 s, total: 2min 6s\n", 504 | "Wall time: 2min 26s\n" 505 | ] 506 | } 507 | ], 508 | "source": [ 509 | "%%time \n", 510 | "\n", 511 | "test_pred = automl.predict(test)\n", 512 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "## День 4. Пользовательские представления слов, ГПУ, LightGBM" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "scrolled": true 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "roles = {'target': 'IsGood',\n", 531 | " 'text': ['BankName', 'Message'],\n", 532 | " 'drop': ['MessageRecognized', 'WER']}\n", 533 | "\n", 534 | "task = Task('binary')\n", 535 | "\n", 536 | "automl = TabularNLPAutoML(task = task, \n", 537 | " timeout = 3600,\n", 538 | " gpu_ids = '1',\n", 539 | " general_params = {'use_algos': ['lgb']},\n", 540 | " text_params = {'lang': 'ru'},\n", 541 | " autonlp_params={'model_name': 'random_lstm', 'embedding_model': navec},\n", 542 | " verbose=2)\n", 543 | "\n", 544 | "oof_pred = automl.fit_predict(train, roles=roles) \n", 545 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 17, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "AUC OOF score: 0.6699591052567759\n" 558 | ] 559 | } 560 | ], 561 | "source": [ 562 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 18, 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "name": "stderr", 572 | "output_type": "stream", 573 | "text": [ 574 | 
"100%|██████████| 3/3 [00:04<00:00, 1.40s/it]\n" 575 | ] 576 | }, 577 | { 578 | "name": "stdout", 579 | "output_type": "stream", 580 | "text": [ 581 | "Feature concated__BankName__Message transformed\n", 582 | "AUC TEST score: 0.6823280698997218\n", 583 | "CPU times: user 1.19 s, sys: 717 ms, total: 1.91 s\n", 584 | "Wall time: 4.89 s\n" 585 | ] 586 | } 587 | ], 588 | "source": [ 589 | "%%time \n", 590 | "\n", 591 | "test_pred = automl.predict(test)\n", 592 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "## День 5. Выбор агрегации представлений слов, ГПУ, линейная модель и LightGBM" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "roles = {'target': 'IsGood',\n", 609 | " 'text': ['BankName', 'Message'],\n", 610 | " 'drop': ['MessageRecognized', 'WER']}\n", 611 | "\n", 612 | "task = Task('binary')\n", 613 | "\n", 614 | "automl = TabularNLPAutoML(task = task, \n", 615 | " timeout = 3600,\n", 616 | " gpu_ids = '1',\n", 617 | " general_params = {'use_algos': ['linear_l2', 'lgb']},\n", 618 | " text_params = {'lang': 'ru'},\n", 619 | " autonlp_params={'model_name': 'pooled_bert'},\n", 620 | " verbose=2)\n", 621 | "\n", 622 | "oof_pred = automl.fit_predict(train, roles=roles) \n", 623 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 20, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "AUC OOF score: 0.8886449778166853\n" 636 | ] 637 | } 638 | ], 639 | "source": [ 640 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | 
"execution_count": 21, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stderr", 650 | "output_type": "stream", 651 | "text": [ 652 | "100%|██████████| 10/10 [00:16<00:00, 1.64s/it]\n" 653 | ] 654 | }, 655 | { 656 | "name": "stdout", 657 | "output_type": "stream", 658 | "text": [ 659 | "Feature concated__BankName__Message transformed\n", 660 | "AUC TEST score: 0.8930039620503403\n", 661 | "CPU times: user 13.6 s, sys: 6.03 s, total: 19.7 s\n", 662 | "Wall time: 27.6 s\n" 663 | ] 664 | } 665 | ], 666 | "source": [ 667 | "%%time \n", 668 | "\n", 669 | "test_pred = automl.predict(test)\n", 670 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "## День 6. Выбор модели Transformers, ГПУ\n", 678 | "\n", 679 | "rubert-tiny. Подробнее в [статье](https://habr.com/ru/post/562064/)." 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "roles = {'target': 'IsGood',\n", 689 | " 'text': ['BankName', 'Message'],\n", 690 | " 'drop': ['MessageRecognized', 'WER']}\n", 691 | "\n", 692 | "task = Task('binary')\n", 693 | "\n", 694 | "automl = TabularNLPAutoML(task = task, \n", 695 | " timeout = 3600,\n", 696 | " gpu_ids = '1',\n", 697 | " general_params = {'use_algos': ['nn']},\n", 698 | " nn_params = {'lang': 'ru', 'bert_name': \"cointegrated/rubert-tiny\"},\n", 699 | " verbose=2)\n", 700 | "\n", 701 | "oof_pred = automl.fit_predict(train, roles=roles) \n", 702 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 23, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "AUC OOF score: 0.8444397429684534\n" 715 | ] 716 | } 717 | ], 718 | "source": [ 719 | "print('AUC OOF 
score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 24, 725 | "metadata": {}, 726 | "outputs": [ 727 | { 728 | "name": "stderr", 729 | "output_type": "stream", 730 | "text": [ 731 | "test: 100%|██████████| 188/188 [00:03<00:00, 52.82it/s]\n", 732 | "test: 100%|██████████| 188/188 [00:03<00:00, 52.87it/s]\n", 733 | "test: 100%|██████████| 188/188 [00:03<00:00, 52.69it/s]\n" 734 | ] 735 | }, 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "AUC TEST score: 0.8588585048981088\n", 741 | "CPU times: user 9.53 s, sys: 2.83 s, total: 12.4 s\n", 742 | "Wall time: 24.8 s\n" 743 | ] 744 | } 745 | ], 746 | "source": [ 747 | "%%time \n", 748 | "\n", 749 | "test_pred = automl.predict(test)\n", 750 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "# Что дальше?" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "## Интерпретация" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "### LIME\n", 772 | "\n", 773 | "Примерный алгоритм работы:\n", 774 | "\n", 775 | "1. Выбирается текстовая колонка (perturb_column), с помощью которой будем интерпретировать выделенное предсказание модели. При этом все остальные признаки фиксированные.\n", 776 | "2. Создается датасет размера n_sample (по умолчанию 5000) путем случайных удалений токенов (группами). Датасет бинарный (есть токен / нет токена).\n", 777 | "3. Опционально производится отбор признаков (важных токенов) с помощью LASSO (feature_selection='lasso', можно также 'none', чтобы не производить отбор). Количество признаков равно n_features (10 по умолчанию).\n", 778 | "4. 
Обучаем на этом объясняемую модель (линейную с весами, способ подсчета весов -- косинусное расстояние по умолчанию, также можно и свою функцию или название расстояния из sklearn.metrics.pairwise_distances). \n", 779 | "5. После этого веса линейной модели и являются интерпретацией.\n", 780 | "\n", 781 | "tips: force_order отвечает за то, использовать ли признаки как мешок слов (force_order=False) или важен их порядок (force_order=True)." 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": 25, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [ 790 | "lime = LimeTextExplainer(automl, feature_selection='lasso', force_order=False)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 26, 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "name": "stderr", 800 | "output_type": "stream", 801 | "text": [ 802 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.01it/s]\n", 803 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.30it/s]\n", 804 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.07it/s]\n" 805 | ] 806 | }, 807 | { 808 | "data": { 809 | "text/html": [ 810 | "

Добрый день! Я являюсь счастливым обладателем кредитной карты данного банка. 20го числа должен был произвести пополнение карты для того, чтобы полностью воспользоваться льготным периодом кредитования.

" 811 | ] 812 | }, 813 | "metadata": {}, 814 | "output_type": "display_data" 815 | } 816 | ], 817 | "source": [ 818 | "instance = test.iloc[0] # объект для интерпретации\n", 819 | "exp = lime.explain_instance(instance, labels=(0, 1), perturb_column='Message')\n", 820 | "exp.visualize_in_notebook(label=1)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 27, 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "name": "stderr", 830 | "output_type": "stream", 831 | "text": [ 832 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.36it/s]\n", 833 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.73it/s]\n", 834 | "test: 100%|██████████| 313/313 [00:05<00:00, 54.12it/s]\n" 835 | ] 836 | }, 837 | { 838 | "data": { 839 | "text/html": [ 840 | "

Я просто в шоке от этого банка!!! После ошибки сотрудников возникла спорная ситуация и я, как добропорядочный клиент обратилась в отделение и оформила претензию лично (19.09.16) и на этом сайте (21.09.16) для более оперативного реагирования. Мне обещали рассмотреть мои обращения в течении 14 рабочих дней.

" 841 | ] 842 | }, 843 | "metadata": {}, 844 | "output_type": "display_data" 845 | } 846 | ], 847 | "source": [ 848 | "instance = test.iloc[-1] # объект для интерпретации\n", 849 | "exp = lime.explain_instance(instance, labels=(0, 1), perturb_column='Message')\n", 850 | "exp.visualize_in_notebook(label=1)" 851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": {}, 856 | "source": [ 857 | "## Отчет" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": { 864 | "scrolled": true 865 | }, 866 | "outputs": [], 867 | "source": [ 868 | "RD = ReportDecoNLP(output_path='NLP_REPORT', \n", 869 | " report_file_name='report_nlp.html')\n", 870 | "\n", 871 | "roles = {'target': 'IsGood',\n", 872 | " 'text': ['BankName', 'Message'],\n", 873 | " 'drop': ['MessageRecognized', 'WER']}\n", 874 | "\n", 875 | "task = Task('binary')\n", 876 | "\n", 877 | "automl = TabularNLPAutoML(task = task, \n", 878 | " timeout = 3600,\n", 879 | " gpu_ids = '1',\n", 880 | " general_params = {'use_algos': ['linear_l2']},\n", 881 | " linear_pipeline_params = {'text_features': \"embed\"},\n", 882 | " text_params = {'lang': 'ru'},\n", 883 | " autonlp_params={'model_name': 'pooled_bert',\n", 884 | " 'transformer_params': {'model_params': {'pooling': 'cls'}}},\n", 885 | " verbose=2)\n", 886 | "\n", 887 | "automl_rd = RD(automl)\n", 888 | "\n", 889 | "oof_pred = automl_rd.fit_predict(train, roles=roles) \n", 890 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": 29, 896 | "metadata": {}, 897 | "outputs": [ 898 | { 899 | "name": "stdout", 900 | "output_type": "stream", 901 | "text": [ 902 | "AUC OOF score: 0.8608724490033035\n" 903 | ] 904 | } 905 | ], 906 | "source": [ 907 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | 
"execution_count": 30, 913 | "metadata": {}, 914 | "outputs": [ 915 | { 916 | "name": "stderr", 917 | "output_type": "stream", 918 | "text": [ 919 | "100%|██████████| 3/3 [00:16<00:00, 5.45s/it]\n" 920 | ] 921 | }, 922 | { 923 | "name": "stdout", 924 | "output_type": "stream", 925 | "text": [ 926 | "Feature concated__BankName__Message transformed\n", 927 | "AUC TEST score: 0.8713822510070556\n", 928 | "CPU times: user 13.9 s, sys: 4.86 s, total: 18.7 s\n", 929 | "Wall time: 23.2 s\n" 930 | ] 931 | } 932 | ], 933 | "source": [ 934 | "%%time \n", 935 | "\n", 936 | "test_pred = automl_rd.predict(test)\n", 937 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))" 938 | ] 939 | }, 940 | { 941 | "cell_type": "markdown", 942 | "metadata": {}, 943 | "source": [ 944 | "Отчет лежит [здесь](./NLP_REPORT/report_nlp.html)." 945 | ] 946 | }, 947 | { 948 | "cell_type": "markdown", 949 | "metadata": {}, 950 | "source": [ 951 | "## Сохранение модели" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": 31, 957 | "metadata": {}, 958 | "outputs": [], 959 | "source": [ 960 | "with open('LAMA_model.pkl', 'wb') as f:\n", 961 | " pickle.dump(automl_rd, f)" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "## Больше материалов\n", 969 | "\n", 970 | "* Github [LightAutoML](https://github.com/sberbank-ai-lab/LightAutoML) со ссылками на все материалы.\n", 971 | "* Канал [LAMA](https://t.me/lightautoml) в Telegram.\n", 972 | "* Примеры на Kaggle с использованием текстового функционала в условиях отсутствия доступа к интернету во время инференса: [обучение](https://www.kaggle.com/simakov/lama-bert-starter) и [инференс](https://www.kaggle.com/simakov/lama-bert-inference)." 
973 | ] 974 | } 975 | ], 976 | "metadata": { 977 | "kernelspec": { 978 | "display_name": "Python 3", 979 | "language": "python", 980 | "name": "python3" 981 | }, 982 | "language_info": { 983 | "codemirror_mode": { 984 | "name": "ipython", 985 | "version": 3 986 | }, 987 | "file_extension": ".py", 988 | "mimetype": "text/x-python", 989 | "name": "python", 990 | "nbconvert_exporter": "python", 991 | "pygments_lexer": "ipython3", 992 | "version": "3.7.6" 993 | } 994 | }, 995 | "nbformat": 4, 996 | "nbformat_minor": 5 997 | } 998 | -------------------------------------------------------------------------------- /LightAutoML demo (Whitebox).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## AutoWoE (WhiteBox модель для бинарной классификации на табличных данных)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Скор карта" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "![WB0](imgs/tutorial_whitebox_report_1.png)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Линейная модель" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "![WB1](imgs/tutorial_whitebox_report_2.png)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Дискретизация" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "![WB2](imgs/tutorial_whitebox_report_3.png)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Отбор и одномерный анализ" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "![WB3](imgs/tutorial_whitebox_report_4.png)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | 
"metadata": {}, 69 | "source": [ 70 | "## Whitebox pipeline:\n", 71 | "\n", 72 | "### Общие параметры:\n", 73 | "\n", 74 | " - n_jobs\n", 75 | " - debug\n", 76 | "\n", 77 | "### 0) Простая разметка типов и выброс мусора\n", 78 | "#### 0.0) Удаление мусорных фичей\n", 79 | " \n", 80 | " Medium:\n", 81 | " - th_nan \n", 82 | " - th_const \n", 83 | " \n", 84 | "#### 0.1) Разметка типов (авто или пользовательская)\n", 85 | " \n", 86 | " Critical:\n", 87 | " - features_type (dict) {'age': 'real', 'education': 'cat', 'birth_date': (None, (\"d\", \"wd\")), ...}\n", 88 | " \n", 89 | "#### 0.2) Кодирование категорий и дат\n", 90 | " \n", 91 | " Critical:\n", 92 | " - features_type (for datetimes)\n", 93 | " \n", 94 | " Optional:\n", 95 | " - cat_alpha (int) - greater means more conservative encoding\n", 96 | " \n", 97 | " \n", 98 | "### 1) Первая стадия отбора (отбор на основе важности BlackBox модели)\n", 99 | "\n", 100 | " Critical:\n", 101 | " - select_type (None or int)\n", 102 | " - imp_type (if type(select_type) is int 'perm_imp'/'feature_imp') \n", 103 | " \n", 104 | " Optional:\n", 105 | " - imp_th (float) - threshold for select_type is None\n", 106 | " \n", 107 | "### 2) Биннинг (дискретизация):\n", 108 | " \n", 109 | " Critical:\n", 110 | " - monotonic / features_monotone_constraints \n", 111 | " - max_bin_count / max_bin_count\n", 112 | " - min_bin_size\n", 113 | " \n", 114 | " - cat_merge_to\n", 115 | " - nan_merge_to\n", 116 | " \n", 117 | " Medium:\n", 118 | " - force_single_split\n", 119 | " \n", 120 | " Optional:\n", 121 | " - min_bin_mults\n", 122 | " - min_gains_to_split\n", 123 | "\n", 124 | "### 3) WoE оценки WoE = LN( ((% 0 in bin) / (% 0 in sample)) / ((% 1 in bin) / (% 1 in sample)) ):\n", 125 | " \n", 126 | " Critical:\n", 127 | " - oof_woe\n", 128 | " \n", 129 | " Optional:\n", 130 | " - woe_diff_th\n", 131 | " - n_folds (if oof_woe)\n", 132 | "\n", 133 | "### 4) Вторая стадия отбора:\n", 134 | "\n", 135 | "#### 4.0) Одномерная зависимость с 
таргетом\n", 136 | " \n", 137 | " Critical:\n", 138 | " - auc_th\n", 139 | " \n", 140 | "#### 4.1) VIF \n", 141 | " \n", 142 | " Critical:\n", 143 | " - vif_th\n", 144 | " \n", 145 | "#### 4.2) Частные корреляции\n", 146 | " \n", 147 | " Critical:\n", 148 | " - pearson_th\n", 149 | " \n", 150 | "### 5) Третья стадия отбора (на основе модели)\n", 151 | " \n", 152 | " Optional:\n", 153 | " - n_folds\n", 154 | " - l1_base_step\n", 155 | " - l1_exp_step\n", 156 | " \n", 157 | " Do not touch:\n", 158 | " - population_size\n", 159 | " - feature_groups_count\n", 160 | "\n", 161 | "\n", 162 | "### 6) Обучение финальной модели:\n", 163 | "\n", 164 | " Critical:\n", 165 | " - regularized_refit\n", 166 | " - p_val (if not regularized_refit)\n", 167 | " - validation (if not regularized_refit)\n", 168 | " \n", 169 | " Optional:\n", 170 | " - interpreted_model\n", 171 | " - l1_base_step (if regularized_refit)\n", 172 | " - l1_exp_step (if regularized_refit)\n", 173 | " \n", 174 | "### 7) Создание отчета\n", 175 | "\n", 176 | " - report_params" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Импорты" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 1, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "import pandas as pd\n", 193 | "from pandas import Series, DataFrame\n", 194 | "\n", 195 | "import numpy as np\n", 196 | "\n", 197 | "import joblib\n", 198 | "\n", 199 | "from sklearn.model_selection import train_test_split\n", 200 | "from sklearn.metrics import roc_auc_score\n", 201 | "\n", 202 | "from autowoe import AutoWoE, ReportDeco" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### Чтение данных и train/test split" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 2, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "data = pd.read_csv('example_data/jobs_train.csv')" 219 | ] 
220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 3, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/html": [ 229 | "
\n", 230 | "\n", 243 | "\n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " 
\n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | "
enrollee_idcitycity_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hourstarget
08949city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM21.0NaNNaN1.0361.0
129725city_400.776MaleNo relevent experienceno_enrollmentGraduateSTEM15.099.0Pvt Ltd5.0470.0
211561city_210.624NaNNo relevent experienceFull time courseGraduateSTEM5.0NaNNaN0.0830.0
333241city_1150.789NaNNo relevent experienceNaNGraduateBusiness Degree0.0NaNPvt Ltd0.0521.0
4666city_1620.767MaleHas relevent experienceno_enrollmentMastersSTEM21.099.0Funded Startup4.080.0
.............................................
191537386city_1730.878MaleNo relevent experienceno_enrollmentGraduateHumanities14.0NaNNaN1.0421.0
1915431398city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM14.0NaNNaN4.0521.0
1915524576city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM21.099.0Pvt Ltd4.0440.0
191565756city_650.802MaleHas relevent experienceno_enrollmentHigh SchoolNaN0.0999.0Pvt Ltd2.0970.0
1915723834city_670.855NaNNo relevent experienceno_enrollmentPrimary SchoolNaN2.0NaNNaN1.01270.0
\n", 453 | "

19158 rows × 14 columns

\n", 454 | "
" 455 | ], 456 | "text/plain": [ 457 | " enrollee_id city city_development_index gender \\\n", 458 | "0 8949 city_103 0.920 Male \n", 459 | "1 29725 city_40 0.776 Male \n", 460 | "2 11561 city_21 0.624 NaN \n", 461 | "3 33241 city_115 0.789 NaN \n", 462 | "4 666 city_162 0.767 Male \n", 463 | "... ... ... ... ... \n", 464 | "19153 7386 city_173 0.878 Male \n", 465 | "19154 31398 city_103 0.920 Male \n", 466 | "19155 24576 city_103 0.920 Male \n", 467 | "19156 5756 city_65 0.802 Male \n", 468 | "19157 23834 city_67 0.855 NaN \n", 469 | "\n", 470 | " relevent_experience enrolled_university education_level \\\n", 471 | "0 Has relevent experience no_enrollment Graduate \n", 472 | "1 No relevent experience no_enrollment Graduate \n", 473 | "2 No relevent experience Full time course Graduate \n", 474 | "3 No relevent experience NaN Graduate \n", 475 | "4 Has relevent experience no_enrollment Masters \n", 476 | "... ... ... ... \n", 477 | "19153 No relevent experience no_enrollment Graduate \n", 478 | "19154 Has relevent experience no_enrollment Graduate \n", 479 | "19155 Has relevent experience no_enrollment Graduate \n", 480 | "19156 Has relevent experience no_enrollment High School \n", 481 | "19157 No relevent experience no_enrollment Primary School \n", 482 | "\n", 483 | " major_discipline experience company_size company_type \\\n", 484 | "0 STEM 21.0 NaN NaN \n", 485 | "1 STEM 15.0 99.0 Pvt Ltd \n", 486 | "2 STEM 5.0 NaN NaN \n", 487 | "3 Business Degree 0.0 NaN Pvt Ltd \n", 488 | "4 STEM 21.0 99.0 Funded Startup \n", 489 | "... ... ... ... ... \n", 490 | "19153 Humanities 14.0 NaN NaN \n", 491 | "19154 STEM 14.0 NaN NaN \n", 492 | "19155 STEM 21.0 99.0 Pvt Ltd \n", 493 | "19156 NaN 0.0 999.0 Pvt Ltd \n", 494 | "19157 NaN 2.0 NaN NaN \n", 495 | "\n", 496 | " last_new_job training_hours target \n", 497 | "0 1.0 36 1.0 \n", 498 | "1 5.0 47 0.0 \n", 499 | "2 0.0 83 0.0 \n", 500 | "3 0.0 52 1.0 \n", 501 | "4 4.0 8 0.0 \n", 502 | "... ... ... ... 
\n", 503 | "19153 1.0 42 1.0 \n", 504 | "19154 4.0 52 1.0 \n", 505 | "19155 4.0 44 0.0 \n", 506 | "19156 2.0 97 0.0 \n", 507 | "19157 1.0 127 0.0 \n", 508 | "\n", 509 | "[19158 rows x 14 columns]" 510 | ] 511 | }, 512 | "execution_count": 3, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "data" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 4, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "train, test = train_test_split(data.drop('enrollee_id', axis=1), test_size=0.2, stratify=data['target'])" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "### AutoWoe: настройки по умолчанию" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 5, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "auto_woe_0 = AutoWoE(interpreted_model=True,\n", 544 | " monotonic=False,\n", 545 | " max_bin_count=5,\n", 546 | " select_type=None,\n", 547 | " pearson_th=0.9,\n", 548 | " auc_th=.505,\n", 549 | " vif_th=10.,\n", 550 | " imp_th=0,\n", 551 | " th_const=32,\n", 552 | " force_single_split=True,\n", 553 | " th_nan=0.01,\n", 554 | " th_cat=0.005,\n", 555 | " auc_tol=1e-4,\n", 556 | " cat_alpha=100,\n", 557 | " cat_merge_to=\"to_woe_0\",\n", 558 | " nan_merge_to=\"to_woe_0\",\n", 559 | " imp_type=\"feature_imp\",\n", 560 | " regularized_refit=False,\n", 561 | " p_val=0.05,\n", 562 | " verbose=2\n", 563 | " )\n", 564 | "\n", 565 | "auto_woe_0 = ReportDeco(auto_woe_0, )" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 6, 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "name": "stdout", 575 | "output_type": "stream", 576 | "text": [ 577 | "city processing...\n", 578 | "city_development_index processing...\n", 579 | "gender processing...\n", 580 | "relevent_experience processing...\n", 581 | "enrolled_university processing...\n", 582 | "education_level 
processing...\n", 583 | "experience processing...\n", 584 | "company_size processing...\n", 585 | "company_type processing...\n", 586 | "last_new_job processing...\n", 587 | "training_hours processing...\n", 588 | "dict_keys(['city', 'city_development_index', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'experience', 'company_size', 'company_type', 'last_new_job', 'training_hours']) to selector !!!!!\n", 589 | "Feature selection...\n", 590 | "city_development_index -0.974107\n", 591 | "company_size -0.795953\n", 592 | "company_type -0.400146\n", 593 | "experience -0.184238\n", 594 | "enrolled_university -0.251287\n", 595 | "education_level -1.188926\n", 596 | "dtype: float64\n" 597 | ] 598 | } 599 | ], 600 | "source": [ 601 | "auto_woe_0.fit(train,\n", 602 | " target_name=\"target\",\n", 603 | " )" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 7, 609 | "metadata": {}, 610 | "outputs": [ 611 | { 612 | "data": { 613 | "text/plain": [ 614 | "array([0.06265852, 0.56483877, 0.04151965, ..., 0.15191705, 0.08528486,\n", 615 | " 0.0409943 ])" 616 | ] 617 | }, 618 | "execution_count": 7, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "test_prediction = auto_woe_0.predict_proba(test)\n", 625 | "test_prediction" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 8, 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "data": { 635 | "text/plain": [ 636 | "0.8034365349304012" 637 | ] 638 | }, 639 | "execution_count": 8, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "roc_auc_score(test['target'].values, test_prediction)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 9, 651 | "metadata": {}, 652 | "outputs": [ 653 | { 654 | "name": "stderr", 655 | "output_type": "stream", 656 | "text": [ 657 | "No handles with labels found to put in legend.\n", 658 | "No 
handles with labels found to put in legend.\n", 659 | "No handles with labels found to put in legend.\n", 660 | "No handles with labels found to put in legend.\n", 661 | "No handles with labels found to put in legend.\n", 662 | "No handles with labels found to put in legend.\n", 663 | "No handles with labels found to put in legend.\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "report_params = {\"output_path\": \"HR_REPORT_1\", # папка, куда сгенерится отчет и сложатся нужные файлы\n", 669 | " \"report_name\": \"WHITEBOX REPORT\",\n", 670 | " \"report_version_id\": 1,\n", 671 | " \"city\": \"Moscow\",\n", 672 | " \"model_aim\": \"Predict if candidate will work for the company\",\n", 673 | " \"model_name\": \"HR model\",\n", 674 | " \"zakazchik\": \"Kaggle\",\n", 675 | " \"high_level_department\": \"Ai Lab\",\n", 676 | " \"ds_name\": \"Btbpanda\",\n", 677 | " \"target_descr\": \"Candidate will work for the company\",\n", 678 | " \"non_target_descr\": \"Candidate will work for the company\"}\n", 679 | "\n", 680 | "auto_woe_0.generate_report(report_params, )" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": {}, 686 | "source": [ 687 | "### AutoWoE - более консервативная модель" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 10, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "auto_woe_1 = AutoWoE(interpreted_model=True,\n", 697 | " monotonic=True,\n", 698 | " max_bin_count=4,\n", 699 | " select_type=None,\n", 700 | " pearson_th=0.9,\n", 701 | " auc_th=.505,\n", 702 | " vif_th=10.,\n", 703 | " imp_th=0,\n", 704 | " th_const=32,\n", 705 | " force_single_split=True,\n", 706 | " th_nan=0.01,\n", 707 | " th_cat=0.005,\n", 708 | " auc_tol=1e-4,\n", 709 | " cat_alpha=100,\n", 710 | " cat_merge_to=\"to_woe_0\",\n", 711 | " nan_merge_to=\"to_woe_0\",\n", 712 | " imp_type=\"feature_imp\",\n", 713 | " regularized_refit=False,\n", 714 | " p_val=0.05,\n", 715 | " verbose=2\n", 716 | " )\n", 717 | "\n", 
718 | "auto_woe_1 = ReportDeco(auto_woe_1, )" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 11, 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "name": "stdout", 728 | "output_type": "stream", 729 | "text": [ 730 | "city processing...city_development_index processing...\n", 731 | "\n", 732 | "gender processing...\n", 733 | "relevent_experience processing...\n", 734 | "enrolled_university processing...education_level processing...\n", 735 | "\n", 736 | "experience processing...company_type processing...company_size processing...\n", 737 | "\n", 738 | "\n", 739 | "last_new_job processing...\n", 740 | "training_hours processing...\n", 741 | "dict_keys(['city', 'city_development_index', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'experience', 'company_size', 'company_type', 'last_new_job', 'training_hours']) to selector !!!!!\n", 742 | "Feature selection...\n", 743 | "city -0.516274\n", 744 | "city_development_index -0.512608\n", 745 | "company_size -0.814922\n", 746 | "company_type -0.397978\n", 747 | "experience -0.175231\n", 748 | "enrolled_university -0.219507\n", 749 | "education_level -1.239627\n", 750 | "dtype: float64\n" 751 | ] 752 | } 753 | ], 754 | "source": [ 755 | "auto_woe_1.fit(train,\n", 756 | " target_name=\"target\",\n", 757 | " )" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 12, 763 | "metadata": {}, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/plain": [ 768 | "array([0.06460692, 0.57321671, 0.0497262 , ..., 0.13746553, 0.07190761,\n", 769 | " 0.04153373])" 770 | ] 771 | }, 772 | "execution_count": 12, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "test_prediction = auto_woe_1.predict_proba(test)\n", 779 | "test_prediction" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 13, 785 | "metadata": {}, 786 | "outputs": [ 787 | { 788 | "data": { 789 | "text/plain": 
[ 790 | "0.8019815944109903" 791 | ] 792 | }, 793 | "execution_count": 13, 794 | "metadata": {}, 795 | "output_type": "execute_result" 796 | } 797 | ], 798 | "source": [ 799 | "roc_auc_score(test['target'].values, test_prediction)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 14, 805 | "metadata": {}, 806 | "outputs": [ 807 | { 808 | "name": "stderr", 809 | "output_type": "stream", 810 | "text": [ 811 | "No handles with labels found to put in legend.\n", 812 | "No handles with labels found to put in legend.\n", 813 | "No handles with labels found to put in legend.\n", 814 | "No handles with labels found to put in legend.\n", 815 | "No handles with labels found to put in legend.\n", 816 | "No handles with labels found to put in legend.\n", 817 | "No handles with labels found to put in legend.\n", 818 | "No handles with labels found to put in legend.\n" 819 | ] 820 | } 821 | ], 822 | "source": [ 823 | "report_params = {\"output_path\": \"HR_REPORT_2\", # папка, куда сгенерится отчет и сложатся нужные файлы\n", 824 | " \"report_name\": \"WHITEBOX REPORT\",\n", 825 | " \"report_version_id\": 2,\n", 826 | " \"city\": \"Moscow\",\n", 827 | " \"model_aim\": \"Predict if candidate will work for the company\",\n", 828 | " \"model_name\": \"HR model\",\n", 829 | " \"zakazchik\": \"Kaggle\",\n", 830 | " \"high_level_department\": \"Ai Lab\",\n", 831 | " \"ds_name\": \"Btbpanda\",\n", 832 | " \"target_descr\": \"Candidate will work for the company\",\n", 833 | " \"non_target_descr\": \"Candidate will work for the company\"}\n", 834 | "\n", 835 | "auto_woe_1.generate_report(report_params, )" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": {}, 841 | "source": [ 842 | "### WhiteBox preset - использование по аналогии с TabularAutoML" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 15, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "from 
lightautoml.automl.presets.whitebox_presets import WhiteBoxPreset\n", 852 | "from lightautoml import Task" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": 16, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "task = Task('binary')\n", 862 | "automl = WhiteBoxPreset(task)" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 17, 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "name": "stderr", 872 | "output_type": "stream", 873 | "text": [ 874 | "Validation data is not set. Train will be used as valid in report and valid prediction\n" 875 | ] 876 | }, 877 | { 878 | "name": "stdout", 879 | "output_type": "stream", 880 | "text": [ 881 | "Start automl preset with listed constraints:\n", 882 | "- time: 3600 seconds\n", 883 | "- cpus: 4 cores\n", 884 | "- memory: 16 gb\n", 885 | "\n", 886 | "Train data shape: (15326, 13)\n", 887 | "Feats was rejected during automatic roles guess: []\n", 888 | "\n", 889 | "\n", 890 | "Layer 1 ...\n", 891 | "Train process start. 
Time left 3595.0072581768036 secs\n", 892 | "Start fitting Lvl_0_Pipe_0_Mod_0_WhiteBox ...\n", 893 | "\n", 894 | "===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_WhiteBox =====\n", 895 | "\n", 896 | " features [] contain too many nans or identical values\n", 897 | " features [] have low importance\n", 898 | "city processing...\n", 899 | "city_development_index processing...company_type processing...education_level processing...\n", 900 | "\n", 901 | "\n", 902 | "enrolled_university processing...\n", 903 | "gender processing...\n", 904 | "major_discipline processing...\n", 905 | "relevent_experience processing...\n", 906 | "company_size processing...\n", 907 | "experience processing...\n", 908 | "last_new_job processing...\n", 909 | "training_hours processing...\n", 910 | "dict_keys(['city', 'city_development_index', 'company_type', 'education_level', 'enrolled_university', 'gender', 'major_discipline', 'relevent_experience', 'company_size', 'experience', 'last_new_job', 'training_hours']) to selector !!!!!\n", 911 | "Feature selection...\n", 912 | "Feature training_hours removed due to low AUC value 0.5031265374717342\n", 913 | "Feature city_development_index removed due to high VIF value = 40.56438648184099\n", 914 | "C parameter range in [0.0002603488674824265:260.3488674824265], 20 values\n", 915 | "Result(score=0.7856775296767177, reg_alpha=0.020431136952654548, is_neg=True, min_weights=city -0.980620\n", 916 | "company_size -0.800535\n", 917 | "company_type -0.340185\n", 918 | "experience -0.198176\n", 919 | "enrolled_university -0.101047\n", 920 | "relevent_experience 0.000000\n", 921 | "education_level -0.624324\n", 922 | "last_new_job 0.000000\n", 923 | "gender 0.000000\n", 924 | "major_discipline -0.317699\n", 925 | "dtype: float64)\n", 926 | "Iter 0 of final refit starts with 7 features\n", 927 | "Validation data checks\n", 928 | "city -0.956550\n", 929 | "company_size -0.866063\n", 930 | "company_type -0.402941\n", 931 | "experience -0.329493\n", 
932 | "enrolled_university -0.230776\n", 933 | "education_level -0.641994\n", 934 | "major_discipline -1.596907\n", 935 | "dtype: float64\n", 936 | "Lvl_0_Pipe_0_Mod_0_WhiteBox fitting and predicting completed\n", 937 | "Time left 3587.2280378341675\n", 938 | "\n", 939 | "Automl preset training completed in 12.77 seconds.\n" 940 | ] 941 | } 942 | ], 943 | "source": [ 944 | "\n", 945 | "train_pred = automl.fit_predict(train.reset_index(drop=True), roles={'target': 'target'})" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": 18, 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [ 954 | "test_prediction = automl.predict(test).data[:, 0]" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": 19, 960 | "metadata": {}, 961 | "outputs": [ 962 | { 963 | "data": { 964 | "text/plain": [ 965 | "0.7966826628232216" 966 | ] 967 | }, 968 | "execution_count": 19, 969 | "metadata": {}, 970 | "output_type": "execute_result" 971 | } 972 | ], 973 | "source": [ 974 | "roc_auc_score(test['target'].values, test_prediction)" 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": {}, 980 | "source": [ 981 | "### Сериализация модели\n", 982 | "\n", 983 | "Важно: auto_woe_1 фактически является ReportDeco объектом (отчетом), не AutoWoE. Чтобы получить AutoWoE, надо обратиться к атрибуту .model. \n", 984 | "\n", 985 | "ReportDeco не рекомендуется для использования на стадии инференса. Отчет требует целевой переменной в датасете для предсказания, так как считает метрики качества. Также инференс из объекта-отчета намного дольше из-за собственно построения отчета." 
986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": 20, 991 | "metadata": {}, 992 | "outputs": [], 993 | "source": [ 994 | "joblib.dump(auto_woe_1.model, 'model.pkl')\n", 995 | "model = joblib.load('model.pkl')" 996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": {}, 1001 | "source": [ 1002 | "### SQL запрос для инференса" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": 21, 1008 | "metadata": {}, 1009 | "outputs": [ 1010 | { 1011 | "name": "stdout", 1012 | "output_type": "stream", 1013 | "text": [ 1014 | "SELECT\n", 1015 | " 1 / (1 + EXP(-(\n", 1016 | " -1.111\n", 1017 | " -0.516*WOE_TAB.city\n", 1018 | " -0.513*WOE_TAB.city_development_index\n", 1019 | " -0.815*WOE_TAB.company_size\n", 1020 | " -0.398*WOE_TAB.company_type\n", 1021 | " -0.175*WOE_TAB.experience\n", 1022 | " -0.22*WOE_TAB.enrolled_university\n", 1023 | " -1.24*WOE_TAB.education_level\n", 1024 | " ))) as PROB,\n", 1025 | " WOE_TAB.*\n", 1026 | "FROM \n", 1027 | " (SELECT\n", 1028 | " CASE\n", 1029 | " WHEN (city IS NULL OR LOWER(CAST(city AS VARCHAR(50))) = 'nan') THEN 0\n", 1030 | " WHEN city IN ('city_100', 'city_102', 'city_103', 'city_116', 'city_149', 'city_159', 'city_160', 'city_45', 'city_46', 'city_64', 'city_71', 'city_73', 'city_83', 'city_99') THEN 0.213\n", 1031 | " WHEN city IN ('city_104', 'city_114', 'city_136', 'city_138', 'city_16', 'city_173', 'city_23', 'city_28', 'city_36', 'city_50', 'city_57', 'city_61', 'city_65', 'city_67', 'city_75', 'city_97') THEN 1.017\n", 1032 | " WHEN city IN ('city_11', 'city_21', 'city_74') THEN -1.455\n", 1033 | " ELSE -0.209\n", 1034 | " END AS city,\n", 1035 | " CASE\n", 1036 | " WHEN (city_development_index IS NULL OR city_development_index = 'NaN') THEN 0\n", 1037 | " WHEN city_development_index <= 0.6245 THEN -1.454\n", 1038 | " WHEN city_development_index <= 0.7915 THEN -0.121\n", 1039 | " WHEN city_development_index <= 0.9235 THEN 0.461\n", 1040 | " ELSE 
1.101\n", 1041 | " END AS city_development_index,\n", 1042 | " CASE\n", 1043 | " WHEN (company_size IS NULL OR company_size = 'NaN') THEN -0.717\n", 1044 | " WHEN company_size <= 74.0 THEN 0.221\n", 1045 | " ELSE 0.467\n", 1046 | " END AS company_size,\n", 1047 | " CASE\n", 1048 | " WHEN (company_type IS NULL OR LOWER(CAST(company_type AS VARCHAR(50))) = 'nan') THEN -0.64\n", 1049 | " WHEN company_type IN ('Early Stage Startup', 'NGO', 'Other', 'Public Sector') THEN 0.164\n", 1050 | " WHEN company_type = 'Funded Startup' THEN 0.737\n", 1051 | " WHEN company_type = 'Pvt Ltd' THEN 0.398\n", 1052 | " ELSE 0\n", 1053 | " END AS company_type,\n", 1054 | " CASE\n", 1055 | " WHEN (experience IS NULL OR experience = 'NaN') THEN 0\n", 1056 | " WHEN experience <= 1.5 THEN -0.811\n", 1057 | " WHEN experience <= 7.5 THEN -0.319\n", 1058 | " WHEN experience <= 11.5 THEN 0.119\n", 1059 | " ELSE 0.533\n", 1060 | " END AS experience,\n", 1061 | " CASE\n", 1062 | " WHEN (enrolled_university IS NULL OR LOWER(CAST(enrolled_university AS VARCHAR(50))) = 'nan') THEN -0.327\n", 1063 | " WHEN enrolled_university = 'Full time course' THEN -0.614\n", 1064 | " WHEN enrolled_university = 'Part time course' THEN 0.026\n", 1065 | " WHEN enrolled_university = 'no_enrollment' THEN 0.208\n", 1066 | " ELSE 0\n", 1067 | " END AS enrolled_university,\n", 1068 | " CASE\n", 1069 | " WHEN (education_level IS NULL OR LOWER(CAST(education_level AS VARCHAR(50))) = 'nan') THEN 0.21\n", 1070 | " WHEN education_level = 'Graduate' THEN -0.166\n", 1071 | " WHEN education_level = 'High School' THEN 0.34\n", 1072 | " WHEN education_level = 'Masters' THEN 0.21\n", 1073 | " WHEN education_level IN ('Phd', 'Primary School') THEN 0.704\n", 1074 | " ELSE 0\n", 1075 | " END AS education_level\n", 1076 | " FROM global_temp.TABLE_1) as WOE_TAB\n" 1077 | ] 1078 | } 1079 | ], 1080 | "source": [ 1081 | "sql_query = model.get_sql_inference_query('global_temp.TABLE_1')\n", 1082 | "print(sql_query)" 1083 | ] 1084 | }, 1085 | 
{ 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "### Проверка SQL с использованием PySpark" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": 23, 1095 | "metadata": {}, 1096 | "outputs": [], 1097 | "source": [ 1098 | "from pyspark.sql import SparkSession" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": null, 1104 | "metadata": {}, 1105 | "outputs": [], 1106 | "source": [ 1107 | "spark = SparkSession.builder \\\n", 1108 | " .master(\"local[2]\") \\\n", 1109 | " .appName(\"spark-course\") \\\n", 1110 | " .config(\"spark.driver.memory\", \"512m\") \\\n", 1111 | " .getOrCreate()\n", 1112 | "sc = spark.sparkContext" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 24, 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "spark_df = spark.read.csv(\"jobs_train.csv\", header=True)\n", 1122 | "spark_df.createGlobalTempView(\"TABLE_1\")" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": 25, 1128 | "metadata": {}, 1129 | "outputs": [], 1130 | "source": [ 1131 | "res = spark.sql(sql_query).toPandas()" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": 26, 1137 | "metadata": {}, 1138 | "outputs": [ 1139 | { 1140 | "data": { 1141 | "text/html": [ 1142 | "
\n", 1143 | "\n", 1156 | "\n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | "
PROBcitycity_development_indexcompany_sizecompany_typeexperienceenrolled_universityeducation_level
00.3655120.2130.461-0.717-0.6400.5330.208-0.166
10.195716-0.209-0.1210.4670.3980.5330.208-0.166
20.835002-1.455-1.454-0.717-0.640-0.319-0.614-0.166
30.476161-0.209-0.121-0.7170.398-0.811-0.327-0.166
40.117694-0.209-0.1210.4670.7370.5330.2080.210
...........................
191530.2756021.0170.461-0.717-0.6400.5330.208-0.166
191540.3655120.2130.461-0.717-0.6400.5330.208-0.166
191550.1267940.2130.4610.4670.3980.5330.208-0.166
191560.0608421.0170.4610.4670.398-0.8110.2080.340
191570.1305521.0170.461-0.717-0.640-0.3190.2080.704
\n", 1294 | "

19158 rows × 8 columns

\n", 1295 | "
" 1296 | ], 1297 | "text/plain": [ 1298 | " PROB city city_development_index company_size company_type \\\n", 1299 | "0 0.365512 0.213 0.461 -0.717 -0.640 \n", 1300 | "1 0.195716 -0.209 -0.121 0.467 0.398 \n", 1301 | "2 0.835002 -1.455 -1.454 -0.717 -0.640 \n", 1302 | "3 0.476161 -0.209 -0.121 -0.717 0.398 \n", 1303 | "4 0.117694 -0.209 -0.121 0.467 0.737 \n", 1304 | "... ... ... ... ... ... \n", 1305 | "19153 0.275602 1.017 0.461 -0.717 -0.640 \n", 1306 | "19154 0.365512 0.213 0.461 -0.717 -0.640 \n", 1307 | "19155 0.126794 0.213 0.461 0.467 0.398 \n", 1308 | "19156 0.060842 1.017 0.461 0.467 0.398 \n", 1309 | "19157 0.130552 1.017 0.461 -0.717 -0.640 \n", 1310 | "\n", 1311 | " experience enrolled_university education_level \n", 1312 | "0 0.533 0.208 -0.166 \n", 1313 | "1 0.533 0.208 -0.166 \n", 1314 | "2 -0.319 -0.614 -0.166 \n", 1315 | "3 -0.811 -0.327 -0.166 \n", 1316 | "4 0.533 0.208 0.210 \n", 1317 | "... ... ... ... \n", 1318 | "19153 0.533 0.208 -0.166 \n", 1319 | "19154 0.533 0.208 -0.166 \n", 1320 | "19155 0.533 0.208 -0.166 \n", 1321 | "19156 -0.811 0.208 0.340 \n", 1322 | "19157 -0.319 0.208 0.704 \n", 1323 | "\n", 1324 | "[19158 rows x 8 columns]" 1325 | ] 1326 | }, 1327 | "execution_count": 26, 1328 | "metadata": {}, 1329 | "output_type": "execute_result" 1330 | } 1331 | ], 1332 | "source": [ 1333 | "res" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": 27, 1339 | "metadata": {}, 1340 | "outputs": [], 1341 | "source": [ 1342 | "sc.stop()" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": 28, 1348 | "metadata": {}, 1349 | "outputs": [ 1350 | { 1351 | "data": { 1352 | "text/plain": [ 1353 | "array([0.36557352, 0.19577798, 0.83497665, ..., 0.12678668, 0.06083813,\n", 1354 | " 0.13061427])" 1355 | ] 1356 | }, 1357 | "execution_count": 28, 1358 | "metadata": {}, 1359 | "output_type": "execute_result" 1360 | } 1361 | ], 1362 | "source": [ 1363 | "full_prediction = model.predict_proba(data)\n", 
1364 | "full_prediction" 1365 | ] 1366 | }, 1367 | { 1368 | "cell_type": "code", 1369 | "execution_count": 29, 1370 | "metadata": {}, 1371 | "outputs": [ 1372 | { 1373 | "data": { 1374 | "text/plain": [ 1375 | "0.0002878641803194526" 1376 | ] 1377 | }, 1378 | "execution_count": 29, 1379 | "metadata": {}, 1380 | "output_type": "execute_result" 1381 | } 1382 | ], 1383 | "source": [ 1384 | "(res['PROB'] - full_prediction).abs().max()" 1385 | ] 1386 | } 1387 | ], 1388 | "metadata": { 1389 | "kernelspec": { 1390 | "display_name": "Python 3", 1391 | "language": "python", 1392 | "name": "python3" 1393 | }, 1394 | "language_info": { 1395 | "codemirror_mode": { 1396 | "name": "ipython", 1397 | "version": 3 1398 | }, 1399 | "file_extension": ".py", 1400 | "mimetype": "text/x-python", 1401 | "name": "python", 1402 | "nbconvert_exporter": "python", 1403 | "pygments_lexer": "ipython3", 1404 | "version": "3.6.9" 1405 | } 1406 | }, 1407 | "nbformat": 4, 1408 | "nbformat_minor": 2 1409 | } 1410 | -------------------------------------------------------------------------------- /NLP_REPORT/BankName_char_len_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/BankName_char_len_hist.png -------------------------------------------------------------------------------- /NLP_REPORT/BankName_tokens_len_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/BankName_tokens_len_hist.png -------------------------------------------------------------------------------- /NLP_REPORT/Message_char_len_hist.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/Message_char_len_hist.png -------------------------------------------------------------------------------- /NLP_REPORT/Message_tokens_len_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/Message_tokens_len_hist.png -------------------------------------------------------------------------------- /NLP_REPORT/concat_char_len_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/concat_char_len_hist.png -------------------------------------------------------------------------------- /NLP_REPORT/concat_tokens_len_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/concat_tokens_len_hist.png -------------------------------------------------------------------------------- /NLP_REPORT/report_nlp.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LAMA report 8 | 9 | 61 | 65 | 66 | 67 |
68 |

LAMA report

69 |
70 | 710 | 721 | 722 | 723 | -------------------------------------------------------------------------------- /NLP_REPORT/test_distribution_of_logits_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_distribution_of_logits_1.png -------------------------------------------------------------------------------- /NLP_REPORT/test_pie_f1_metric_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_pie_f1_metric_1.png -------------------------------------------------------------------------------- /NLP_REPORT/test_pr_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_pr_curve_1.png -------------------------------------------------------------------------------- /NLP_REPORT/test_preds_distribution_by_bins_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_preds_distribution_by_bins_1.png -------------------------------------------------------------------------------- /NLP_REPORT/test_roc_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_roc_curve_1.png -------------------------------------------------------------------------------- /NLP_REPORT/valid_distribution_of_logits.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_distribution_of_logits.png -------------------------------------------------------------------------------- /NLP_REPORT/valid_pie_f1_metric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_pie_f1_metric.png -------------------------------------------------------------------------------- /NLP_REPORT/valid_pr_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_pr_curve.png -------------------------------------------------------------------------------- /NLP_REPORT/valid_preds_distribution_by_bins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_preds_distribution_by_bins.png -------------------------------------------------------------------------------- /NLP_REPORT/valid_roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_roc_curve.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Materials for LightAutoML workshop (DataFest 2021) 2 | 3 | - `LightAutoML demo (Blackbox).ipynb` - How-to for LightAutoML blackbox presets (with 
automatic report generation) 4 | - `LightAutoML demo (Whitebox).ipynb` - How-to for AutoWoE library and LightAutoML whitebox preset (with automatic inference SQL and report generation) 5 | - `LightAutoML demo (NLP).ipynb` - How-to for LightAutoML NLP preset (with automatic report generation and model interpretation) 6 | 7 | The repo also contains all reports generated by the tutorials above: 8 | - `tabularAutoML_model_report` - report from LightAutoML Blackbox tutorial 9 | - `HR_REPORT_1` and `HR_REPORT_2` - reports from LightAutoML Whitebox tutorial 10 | - `NLP_REPORT` - report from LightAutoML NLP tutorial 11 | 12 | ******* 13 | # Questions / Issues / Suggestions 14 | 15 | Contact us: 16 | - [Alexander Ryzhkov](https://kaggle.com/alexryzhkov) (_email_: AMRyzhkov@sberbank.ru, _telegram_: @RyzhkovAlex) 17 | - [Anton Vakhrushev](https://kaggle.com/btbpanda) (_email_: AGVakhrushev@sberbank.ru) 18 | - [Dmitry Simakov](https://kaggle.com/simakov) (_email_: Simakov.D.E@sberbank.ru) 19 | -------------------------------------------------------------------------------- /imgs/tutorial_NLP_image_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_NLP_image_1.jpg -------------------------------------------------------------------------------- /imgs/tutorial_NLP_image_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_NLP_image_2.jpg -------------------------------------------------------------------------------- /imgs/tutorial_blackbox_pipeline.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_pipeline.png -------------------------------------------------------------------------------- /imgs/tutorial_blackbox_report_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_report_1.png -------------------------------------------------------------------------------- /imgs/tutorial_blackbox_report_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_report_2.png -------------------------------------------------------------------------------- /imgs/tutorial_blackbox_report_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_report_3.png -------------------------------------------------------------------------------- /imgs/tutorial_whitebox_report_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_1.png -------------------------------------------------------------------------------- /imgs/tutorial_whitebox_report_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_2.png 
-------------------------------------------------------------------------------- /imgs/tutorial_whitebox_report_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_3.png -------------------------------------------------------------------------------- /imgs/tutorial_whitebox_report_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_4.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/lama_interactive_report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LAMA report 8 | 9 | 61 | 65 | 66 | 67 |
68 |

LAMA report

69 |
70 | 790 | 801 | 802 | 803 | -------------------------------------------------------------------------------- /tabularAutoML_model_report/test_distribution_of_logits_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_distribution_of_logits_1.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/test_pie_f1_metric_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_pie_f1_metric_1.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/test_pr_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_pr_curve_1.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/test_preds_distribution_by_bins_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_preds_distribution_by_bins_1.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/test_roc_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_roc_curve_1.png 
-------------------------------------------------------------------------------- /tabularAutoML_model_report/valid_distribution_of_logits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_distribution_of_logits.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/valid_pie_f1_metric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_pie_f1_metric.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/valid_pr_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_pr_curve.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/valid_preds_distribution_by_bins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_preds_distribution_by_bins.png -------------------------------------------------------------------------------- /tabularAutoML_model_report/valid_roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_roc_curve.png 
--------------------------------------------------------------------------------