├── .gitignore
├── HR_REPORT_1
│   ├── AUC_test_plot_full.png
│   ├── Model_weights.png
│   ├── autowoe_report.html
│   ├── binned_stats_target.png
│   ├── binned_stats_test.png
│   ├── binned_stats_train.png
│   ├── binned_test_posneg.png
│   ├── binned_test_total.png
│   ├── binned_train_posneg.png
│   ├── binned_train_total.png
│   ├── city_development_index_backlash_plot.png
│   ├── city_development_index_roc_auc.png
│   ├── city_development_index_woe.png
│   ├── city_development_index_woe_bars.png
│   ├── company_size_backlash_plot.png
│   ├── company_size_roc_auc.png
│   ├── company_size_woe.png
│   ├── company_size_woe_bars.png
│   ├── company_type_backlash_plot.png
│   ├── company_type_roc_auc.png
│   ├── company_type_woe.png
│   ├── company_type_woe_bars.png
│   ├── corr_heatmap.png
│   ├── education_level_backlash_plot.png
│   ├── education_level_roc_auc.png
│   ├── education_level_woe.png
│   ├── education_level_woe_bars.png
│   ├── enrolled_university_backlash_plot.png
│   ├── enrolled_university_roc_auc.png
│   ├── enrolled_university_woe.png
│   ├── enrolled_university_woe_bars.png
│   ├── experience_backlash_plot.png
│   ├── experience_roc_auc.png
│   ├── experience_woe.png
│   ├── experience_woe_bars.png
│   ├── shap.js
│   ├── test_enc_ginis.png
│   └── train_enc_ginis.png
├── HR_REPORT_2
│   ├── AUC_test_plot_full.png
│   ├── Model_weights.png
│   ├── autowoe_report.html
│   ├── binned_stats_target.png
│   ├── binned_stats_test.png
│   ├── binned_stats_train.png
│   ├── binned_test_posneg.png
│   ├── binned_test_total.png
│   ├── binned_train_posneg.png
│   ├── binned_train_total.png
│   ├── city_backlash_plot.png
│   ├── city_development_index_backlash_plot.png
│   ├── city_development_index_roc_auc.png
│   ├── city_development_index_woe.png
│   ├── city_development_index_woe_bars.png
│   ├── city_roc_auc.png
│   ├── city_woe.png
│   ├── city_woe_bars.png
│   ├── company_size_backlash_plot.png
│   ├── company_size_roc_auc.png
│   ├── company_size_woe.png
│   ├── company_size_woe_bars.png
│   ├── company_type_backlash_plot.png
│   ├── company_type_roc_auc.png
│   ├── company_type_woe.png
│   ├── company_type_woe_bars.png
│   ├── corr_heatmap.png
│   ├── education_level_backlash_plot.png
│   ├── education_level_roc_auc.png
│   ├── education_level_woe.png
│   ├── education_level_woe_bars.png
│   ├── enrolled_university_backlash_plot.png
│   ├── enrolled_university_roc_auc.png
│   ├── enrolled_university_woe.png
│   ├── enrolled_university_woe_bars.png
│   ├── experience_backlash_plot.png
│   ├── experience_roc_auc.png
│   ├── experience_woe.png
│   ├── experience_woe_bars.png
│   ├── shap.js
│   ├── test_enc_ginis.png
│   └── train_enc_ginis.png
├── LICENSE
├── LightAutoML demo (Blackbox).ipynb
├── LightAutoML demo (NLP).ipynb
├── LightAutoML demo (Whitebox).ipynb
├── NLP_REPORT
│   ├── BankName_char_len_hist.png
│   ├── BankName_tokens_len_hist.png
│   ├── Message_char_len_hist.png
│   ├── Message_tokens_len_hist.png
│   ├── concat_char_len_hist.png
│   ├── concat_tokens_len_hist.png
│   ├── report_nlp.html
│   ├── test_distribution_of_logits_1.png
│   ├── test_pie_f1_metric_1.png
│   ├── test_pr_curve_1.png
│   ├── test_preds_distribution_by_bins_1.png
│   ├── test_roc_curve_1.png
│   ├── valid_distribution_of_logits.png
│   ├── valid_pie_f1_metric.png
│   ├── valid_pr_curve.png
│   ├── valid_preds_distribution_by_bins.png
│   └── valid_roc_curve.png
├── README.md
├── example_data
│   ├── jobs_train.csv
│   └── nlp_data.csv
├── imgs
│   ├── tutorial_NLP_image_1.jpg
│   ├── tutorial_NLP_image_2.jpg
│   ├── tutorial_blackbox_pipeline.png
│   ├── tutorial_blackbox_report_1.png
│   ├── tutorial_blackbox_report_2.png
│   ├── tutorial_blackbox_report_3.png
│   ├── tutorial_whitebox_report_1.png
│   ├── tutorial_whitebox_report_2.png
│   ├── tutorial_whitebox_report_3.png
│   └── tutorial_whitebox_report_4.png
└── tabularAutoML_model_report
    ├── lama_interactive_report.html
    ├── test_distribution_of_logits_1.png
    ├── test_pie_f1_metric_1.png
    ├── test_pr_curve_1.png
    ├── test_preds_distribution_by_bins_1.png
    ├── test_roc_curve_1.png
    ├── valid_distribution_of_logits.png
    ├── valid_pie_f1_metric.png
    ├── valid_pr_curve.png
    ├── valid_preds_distribution_by_bins.png
    └── valid_roc_curve.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/HR_REPORT_1/AUC_test_plot_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/AUC_test_plot_full.png
--------------------------------------------------------------------------------
/HR_REPORT_1/Model_weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/Model_weights.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_stats_target.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_stats_target.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_stats_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_stats_test.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_stats_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_stats_train.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_test_posneg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_test_posneg.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_test_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_test_total.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_train_posneg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_train_posneg.png
--------------------------------------------------------------------------------
/HR_REPORT_1/binned_train_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/binned_train_total.png
--------------------------------------------------------------------------------
/HR_REPORT_1/city_development_index_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_1/city_development_index_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_1/city_development_index_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_1/city_development_index_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/city_development_index_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_size_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_size_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_size_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_size_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_size_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_type_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_type_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_type_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_1/company_type_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/company_type_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_1/corr_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/corr_heatmap.png
--------------------------------------------------------------------------------
/HR_REPORT_1/education_level_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_1/education_level_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_1/education_level_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_1/education_level_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/education_level_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_1/enrolled_university_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_1/enrolled_university_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_1/enrolled_university_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_1/enrolled_university_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/enrolled_university_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_1/experience_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_1/experience_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_1/experience_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_1/experience_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/experience_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_1/test_enc_ginis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/test_enc_ginis.png
--------------------------------------------------------------------------------
/HR_REPORT_1/train_enc_ginis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_1/train_enc_ginis.png
--------------------------------------------------------------------------------
/HR_REPORT_2/AUC_test_plot_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/AUC_test_plot_full.png
--------------------------------------------------------------------------------
/HR_REPORT_2/Model_weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/Model_weights.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_stats_target.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_stats_target.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_stats_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_stats_test.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_stats_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_stats_train.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_test_posneg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_test_posneg.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_test_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_test_total.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_train_posneg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_train_posneg.png
--------------------------------------------------------------------------------
/HR_REPORT_2/binned_train_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/binned_train_total.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_development_index_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_development_index_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_development_index_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_development_index_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_development_index_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/city_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/city_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_size_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_size_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_size_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_size_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_size_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_type_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_type_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_type_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/company_type_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/company_type_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/corr_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/corr_heatmap.png
--------------------------------------------------------------------------------
/HR_REPORT_2/education_level_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/education_level_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/education_level_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/education_level_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/education_level_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/enrolled_university_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/enrolled_university_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/enrolled_university_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/enrolled_university_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/enrolled_university_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/experience_backlash_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_backlash_plot.png
--------------------------------------------------------------------------------
/HR_REPORT_2/experience_roc_auc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_roc_auc.png
--------------------------------------------------------------------------------
/HR_REPORT_2/experience_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_woe.png
--------------------------------------------------------------------------------
/HR_REPORT_2/experience_woe_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/experience_woe_bars.png
--------------------------------------------------------------------------------
/HR_REPORT_2/test_enc_ginis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/test_enc_ginis.png
--------------------------------------------------------------------------------
/HR_REPORT_2/train_enc_ginis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/HR_REPORT_2/train_enc_ginis.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/LightAutoML demo (NLP).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# AutoML на текстовых данных"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "\n"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Чуть больше про стратегии получения представлений текстов на основе представлений слов:\n",
22 | "\n",
23 | "\n",
24 | "\n",
25 | "Про методы на основе случайных кодировщиков (random encoders) можно подробнее прочитать в [статье](https://arxiv.org/abs/1901.10444) \"No Training Required: Exploring Random Encoders for Sentence Classification\".\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "# Импорты"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "import pandas as pd\n",
42 | "import numpy as np\n",
43 | "import pickle\n",
44 | "\n",
45 | "from sklearn.metrics import roc_auc_score\n",
46 | "from sklearn.model_selection import train_test_split\n",
47 | "\n",
48 | "from lightautoml.automl.presets.text_presets import TabularNLPAutoML\n",
49 | "from lightautoml.tasks import Task\n",
50 | "from lightautoml.addons.interpretation import LimeTextExplainer\n",
51 | "from lightautoml.report import ReportDecoNLP\n",
52 | "\n",
53 | "# Выключим предупреждения от HuggingFace\n",
54 | "import transformers\n",
55 | "transformers.logging.set_verbosity(50)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "# Чтение данных"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 2,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "CPU times: user 183 ms, sys: 39.5 ms, total: 222 ms\n",
75 | "Wall time: 221 ms\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "%%time\n",
81 | "df = pd.read_csv(\"./example_data/nlp_data.csv\")"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 3,
87 | "metadata": {},
88 | "outputs": [
89 | {
90 | "name": "stdout",
91 | "output_type": "stream",
92 | "text": [
93 | "(13842, 6)\n"
94 | ]
95 | },
96 | {
97 | "data": {
98 | "text/html": [
99 | "
\n",
100 | "\n",
113 | "
\n",
114 | " \n",
115 | " \n",
116 | " | \n",
117 | " BankName | \n",
118 | " Message | \n",
119 | " ViewsNum | \n",
120 | " IsGood | \n",
121 | " MessageRecognized | \n",
122 | " WER | \n",
123 | "
\n",
124 | " \n",
125 | " \n",
126 | " \n",
127 | " 11474 | \n",
128 | " Альфа-Банк | \n",
129 | " Я клиент банка с 2007 года, зарплатный клиент ... | \n",
130 | " 1422 | \n",
131 | " False | \n",
132 | " Я клиент банка с две тысячи седьмого года зарп... | \n",
133 | " 60.000000 | \n",
134 | "
\n",
135 | " \n",
136 | " 3955 | \n",
137 | " Альфа-Банк | \n",
138 | " 07.04 в 20-15 по Ульяновскому времени я зашла ... | \n",
139 | " 2016 | \n",
140 | " False | \n",
141 | " Седьмого апреля в двадцать пятнадцать По Ульян... | \n",
142 | " 31.818182 | \n",
143 | "
\n",
144 | " \n",
145 | " 3081 | \n",
146 | " Банк Открытие | \n",
147 | " Ужасный сервис. Заказал кредитную карту по акц... | \n",
148 | " 2232 | \n",
149 | " False | \n",
150 | " Ужасной Сервис заказал кредитную карту по акци... | \n",
151 | " 68.750000 | \n",
152 | "
\n",
153 | " \n",
154 | " 12107 | \n",
155 | " Почта Банк | \n",
156 | " Добрый вечер. 21.01.2020, я обратилась в отде... | \n",
157 | " 1139 | \n",
158 | " False | \n",
159 | " Добрый вечер двадцать первого января две тысяч... | \n",
160 | " 55.555556 | \n",
161 | "
\n",
162 | " \n",
163 | " 10494 | \n",
164 | " Русский Стандарт | \n",
165 | " Банк второй месяц подряд еженедельно названива... | \n",
166 | " 1609 | \n",
167 | " False | \n",
168 | " Второй месяц подряд еженедельно и предлагает к... | \n",
169 | " 39.393939 | \n",
170 | "
\n",
171 | " \n",
172 | "
\n",
173 | "
"
174 | ],
175 | "text/plain": [
176 | " BankName Message \\\n",
177 | "11474 Альфа-Банк Я клиент банка с 2007 года, зарплатный клиент ... \n",
178 | "3955 Альфа-Банк 07.04 в 20-15 по Ульяновскому времени я зашла ... \n",
179 | "3081 Банк Открытие Ужасный сервис. Заказал кредитную карту по акц... \n",
180 | "12107 Почта Банк Добрый вечер. 21.01.2020, я обратилась в отде... \n",
181 | "10494 Русский Стандарт Банк второй месяц подряд еженедельно названива... \n",
182 | "\n",
183 | " ViewsNum IsGood MessageRecognized \\\n",
184 | "11474 1422 False Я клиент банка с две тысячи седьмого года зарп... \n",
185 | "3955 2016 False Седьмого апреля в двадцать пятнадцать По Ульян... \n",
186 | "3081 2232 False Ужасной Сервис заказал кредитную карту по акци... \n",
187 | "12107 1139 False Добрый вечер двадцать первого января две тысяч... \n",
188 | "10494 1609 False Второй месяц подряд еженедельно и предлагает к... \n",
189 | "\n",
190 | " WER \n",
191 | "11474 60.000000 \n",
192 | "3955 31.818182 \n",
193 | "3081 68.750000 \n",
194 | "12107 55.555556 \n",
195 | "10494 39.393939 "
196 | ]
197 | },
198 | "execution_count": 3,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "print(df.shape)\n",
205 | "df.sample(5, random_state=0)"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "# Разбиение на обучающую и контрольную выборки"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 4,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "train, test = train_test_split(df, test_size=3_000, random_state=42, stratify=df.IsGood)"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "# Скачиваем эмбеддинги для русского языка"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 5,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "name": "stdout",
238 | "output_type": "stream",
239 | "text": [
240 | "--2021-06-14 23:58:12-- https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar\n",
241 | "Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9\n",
242 | "Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.\n",
243 | "HTTP request sent, awaiting response... 200 OK\n",
244 | "Length: 53012480 (51M) [application/x-tar]\n",
245 | "Saving to: ‘navec_hudlit_v1_12B_500K_300d_100q.tar.3’\n",
246 | "\n",
247 | "navec_hudlit_v1_12B 100%[===================>] 50.56M 11.4MB/s in 4.5s \n",
248 | "\n",
249 | "2021-06-14 23:58:16 (11.3 MB/s) - ‘navec_hudlit_v1_12B_500K_300d_100q.tar.3’ saved [53012480/53012480]\n",
250 | "\n"
251 | ]
252 | }
253 | ],
254 | "source": [
255 | "!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 6,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "from navec import Navec\n",
265 | "path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'\n",
266 | "navec = Navec.load(path)"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "# Обучение AutoML, или День сурка"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "## День 1. Стандартные параметры, ЦПУ"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {
287 | "scrolled": true
288 | },
289 | "outputs": [],
290 | "source": [
291 | "roles = {'target': 'IsGood',\n",
292 | " 'text': ['BankName', 'Message'],\n",
293 | " 'drop': ['MessageRecognized', 'WER']}\n",
294 | "\n",
295 | "task = Task('binary')\n",
296 | "\n",
297 | "automl = TabularNLPAutoML(task = task, \n",
298 | " timeout = 3600,\n",
299 | " gpu_ids = None,\n",
300 | " text_params = {'lang': 'ru'},\n",
301 | " verbose=2)\n",
302 | "\n",
303 | "oof_pred = automl.fit_predict(train, roles=roles) \n",
304 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 8,
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "name": "stdout",
314 | "output_type": "stream",
315 | "text": [
316 | "AUC OOF score: 0.8326628448807263\n"
317 | ]
318 | }
319 | ],
320 | "source": [
321 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 9,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "name": "stdout",
331 | "output_type": "stream",
332 | "text": [
333 | "Feature concated__BankName__Message transformed\n",
334 | "AUC TEST score: 0.8397933778340239\n",
335 | "CPU times: user 7.46 s, sys: 1.74 s, total: 9.2 s\n",
336 | "Wall time: 11.5 s\n"
337 | ]
338 | }
339 | ],
340 | "source": [
341 | "%%time \n",
342 | "\n",
343 | "test_pred = automl.predict(test)\n",
344 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "## День 2. Пользовательские представления слов, ЦПУ"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {
358 | "scrolled": true
359 | },
360 | "outputs": [],
361 | "source": [
362 | "roles = {'target': 'IsGood',\n",
363 | " 'text': ['BankName', 'Message'],\n",
364 | " 'drop': ['MessageRecognized', 'WER']}\n",
365 | "\n",
366 | "task = Task('binary')\n",
367 | "\n",
368 | "automl = TabularNLPAutoML(task = task, \n",
369 | " timeout = 3600,\n",
370 | " gpu_ids = None,\n",
371 | " text_params = {'lang': 'ru'},\n",
372 | " autonlp_params={'model_name': 'wat', 'embedding_model': navec,\n",
373 | " 'transformer_params': {'model_params': {'embed_size': 300},\n",
374 | " 'weight_type': 'idf', 'use_svd': True}},\n",
375 | " verbose=2)\n",
376 | "\n",
377 | "oof_pred = automl.fit_predict(train, roles=roles) \n",
378 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 11,
384 | "metadata": {},
385 | "outputs": [
386 | {
387 | "name": "stdout",
388 | "output_type": "stream",
389 | "text": [
390 | "AUC OOF score: 0.8331853706309897\n"
391 | ]
392 | }
393 | ],
394 | "source": [
395 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 12,
401 | "metadata": {},
402 | "outputs": [
403 | {
404 | "name": "stdout",
405 | "output_type": "stream",
406 | "text": [
407 | "Feature concated__BankName__Message transformed\n",
408 | "AUC TEST score: 0.8433377172303697\n",
409 | "CPU times: user 11.7 s, sys: 1.82 s, total: 13.5 s\n",
410 | "Wall time: 15.4 s\n"
411 | ]
412 | }
413 | ],
414 | "source": [
415 | "%%time \n",
416 | "\n",
417 | "test_pred = automl.predict(test)\n",
418 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "metadata": {},
424 | "source": [
425 | "## День 3. Стандартные параметры, ГПУ"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {
432 | "scrolled": true
433 | },
434 | "outputs": [],
435 | "source": [
436 | "roles = {'target': 'IsGood',\n",
437 | " 'text': ['BankName', 'Message'],\n",
438 | " 'drop': ['MessageRecognized', 'WER']}\n",
439 | "\n",
440 | "task = Task('binary')\n",
441 | "\n",
442 | "automl = TabularNLPAutoML(task = task, \n",
443 | " timeout = 3600,\n",
444 | " gpu_ids = '1',\n",
445 | " text_params = {'lang': 'ru'},\n",
446 | " nn_params = {'lang': 'ru'},\n",
447 | " verbose=2)\n",
448 | "\n",
449 | "oof_pred = automl.fit_predict(train, roles=roles) \n",
450 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": 14,
456 | "metadata": {},
457 | "outputs": [
458 | {
459 | "name": "stdout",
460 | "output_type": "stream",
461 | "text": [
462 | "AUC OOF score: 0.895916820914043\n"
463 | ]
464 | }
465 | ],
466 | "source": [
467 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 15,
473 | "metadata": {},
474 | "outputs": [
475 | {
476 | "name": "stderr",
477 | "output_type": "stream",
478 | "text": [
479 | "100%|██████████| 10/10 [00:16<00:00, 1.69s/it]\n"
480 | ]
481 | },
482 | {
483 | "name": "stdout",
484 | "output_type": "stream",
485 | "text": [
486 | "Feature concated__BankName__Message transformed\n"
487 | ]
488 | },
489 | {
490 | "name": "stderr",
491 | "output_type": "stream",
492 | "text": [
493 | "test: 100%|██████████| 188/188 [00:30<00:00, 6.13it/s]\n",
494 | "test: 100%|██████████| 188/188 [00:30<00:00, 6.11it/s]\n",
495 | "test: 100%|██████████| 188/188 [00:30<00:00, 6.11it/s]\n"
496 | ]
497 | },
498 | {
499 | "name": "stdout",
500 | "output_type": "stream",
501 | "text": [
502 | "AUC TEST score: 0.9052540592405104\n",
503 | "CPU times: user 1min 32s, sys: 33.2 s, total: 2min 6s\n",
504 | "Wall time: 2min 26s\n"
505 | ]
506 | }
507 | ],
508 | "source": [
509 | "%%time \n",
510 | "\n",
511 | "test_pred = automl.predict(test)\n",
512 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "## День 4. Пользовательские представления слов, ГПУ, LightGBM"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "scrolled": true
527 | },
528 | "outputs": [],
529 | "source": [
530 | "roles = {'target': 'IsGood',\n",
531 | " 'text': ['BankName', 'Message'],\n",
532 | " 'drop': ['MessageRecognized', 'WER']}\n",
533 | "\n",
534 | "task = Task('binary')\n",
535 | "\n",
536 | "automl = TabularNLPAutoML(task = task, \n",
537 | " timeout = 3600,\n",
538 | " gpu_ids = '1',\n",
539 | " general_params = {'use_algos': ['lgb']},\n",
540 | " text_params = {'lang': 'ru'},\n",
541 | " autonlp_params={'model_name': 'random_lstm', 'embedding_model': navec},\n",
542 | " verbose=2)\n",
543 | "\n",
544 | "oof_pred = automl.fit_predict(train, roles=roles) \n",
545 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 17,
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "AUC OOF score: 0.6699591052567759\n"
558 | ]
559 | }
560 | ],
561 | "source": [
562 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": 18,
568 | "metadata": {},
569 | "outputs": [
570 | {
571 | "name": "stderr",
572 | "output_type": "stream",
573 | "text": [
574 | "100%|██████████| 3/3 [00:04<00:00, 1.40s/it]\n"
575 | ]
576 | },
577 | {
578 | "name": "stdout",
579 | "output_type": "stream",
580 | "text": [
581 | "Feature concated__BankName__Message transformed\n",
582 | "AUC TEST score: 0.6823280698997218\n",
583 | "CPU times: user 1.19 s, sys: 717 ms, total: 1.91 s\n",
584 | "Wall time: 4.89 s\n"
585 | ]
586 | }
587 | ],
588 | "source": [
589 | "%%time \n",
590 | "\n",
591 | "test_pred = automl.predict(test)\n",
592 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
593 | ]
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "metadata": {},
598 | "source": [
599 | "## День 5. Выбор агрегации представлений слов, ГПУ, линейная модель и LightGBM"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": null,
605 | "metadata": {},
606 | "outputs": [],
607 | "source": [
608 | "roles = {'target': 'IsGood',\n",
609 | " 'text': ['BankName', 'Message'],\n",
610 | " 'drop': ['MessageRecognized', 'WER']}\n",
611 | "\n",
612 | "task = Task('binary')\n",
613 | "\n",
614 | "automl = TabularNLPAutoML(task = task, \n",
615 | " timeout = 3600,\n",
616 | " gpu_ids = '1',\n",
617 | " general_params = {'use_algos': ['linear_l2', 'lgb']},\n",
618 | " text_params = {'lang': 'ru'},\n",
619 | " autonlp_params={'model_name': 'pooled_bert'},\n",
620 | " verbose=2)\n",
621 | "\n",
622 | "oof_pred = automl.fit_predict(train, roles=roles) \n",
623 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 20,
629 | "metadata": {},
630 | "outputs": [
631 | {
632 | "name": "stdout",
633 | "output_type": "stream",
634 | "text": [
635 | "AUC OOF score: 0.8886449778166853\n"
636 | ]
637 | }
638 | ],
639 | "source": [
640 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 21,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "name": "stderr",
650 | "output_type": "stream",
651 | "text": [
652 | "100%|██████████| 10/10 [00:16<00:00, 1.64s/it]\n"
653 | ]
654 | },
655 | {
656 | "name": "stdout",
657 | "output_type": "stream",
658 | "text": [
659 | "Feature concated__BankName__Message transformed\n",
660 | "AUC TEST score: 0.8930039620503403\n",
661 | "CPU times: user 13.6 s, sys: 6.03 s, total: 19.7 s\n",
662 | "Wall time: 27.6 s\n"
663 | ]
664 | }
665 | ],
666 | "source": [
667 | "%%time \n",
668 | "\n",
669 | "test_pred = automl.predict(test)\n",
670 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
671 | ]
672 | },
673 | {
674 | "cell_type": "markdown",
675 | "metadata": {},
676 | "source": [
677 | "## День 6. Выбор модели Transformers, ГПУ\n",
678 | "\n",
679 | "Используем модель `cointegrated/rubert-tiny`. Подробнее о ней — в [статье](https://habr.com/ru/post/562064/)."
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "execution_count": null,
685 | "metadata": {},
686 | "outputs": [],
687 | "source": [
688 | "roles = {'target': 'IsGood',\n",
689 | " 'text': ['BankName', 'Message'],\n",
690 | " 'drop': ['MessageRecognized', 'WER']}\n",
691 | "\n",
692 | "task = Task('binary')\n",
693 | "\n",
694 | "automl = TabularNLPAutoML(task = task, \n",
695 | " timeout = 3600,\n",
696 | " gpu_ids = '1',\n",
697 | " general_params = {'use_algos': ['nn']},\n",
698 | " nn_params = {'lang': 'ru', 'bert_name': \"cointegrated/rubert-tiny\"},\n",
699 | " verbose=2)\n",
700 | "\n",
701 | "oof_pred = automl.fit_predict(train, roles=roles) \n",
702 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": 23,
708 | "metadata": {},
709 | "outputs": [
710 | {
711 | "name": "stdout",
712 | "output_type": "stream",
713 | "text": [
714 | "AUC OOF score: 0.8444397429684534\n"
715 | ]
716 | }
717 | ],
718 | "source": [
719 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": 24,
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "name": "stderr",
729 | "output_type": "stream",
730 | "text": [
731 | "test: 100%|██████████| 188/188 [00:03<00:00, 52.82it/s]\n",
732 | "test: 100%|██████████| 188/188 [00:03<00:00, 52.87it/s]\n",
733 | "test: 100%|██████████| 188/188 [00:03<00:00, 52.69it/s]\n"
734 | ]
735 | },
736 | {
737 | "name": "stdout",
738 | "output_type": "stream",
739 | "text": [
740 | "AUC TEST score: 0.8588585048981088\n",
741 | "CPU times: user 9.53 s, sys: 2.83 s, total: 12.4 s\n",
742 | "Wall time: 24.8 s\n"
743 | ]
744 | }
745 | ],
746 | "source": [
747 | "%%time \n",
748 | "\n",
749 | "test_pred = automl.predict(test)\n",
750 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
751 | ]
752 | },
753 | {
754 | "cell_type": "markdown",
755 | "metadata": {},
756 | "source": [
757 | "# Что дальше?"
758 | ]
759 | },
760 | {
761 | "cell_type": "markdown",
762 | "metadata": {},
763 | "source": [
764 | "## Интерпретация"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "### LIME\n",
772 | "\n",
773 | "Примерный алгоритм работы:\n",
774 | "\n",
775 | "1. Выбирается текстовая колонка (perturb_column), с помощью которой будем интерпретировать выделенное предсказание модели. При этом все остальные признаки фиксированные.\n",
776 | "2. Создается датасет размера n_sample (по умолчанию 5000) путем случайных удалений токенов (группами). Датасет бинарный (есть токен / нет токена).\n",
777 | "3. Опционально производится отбор признаков (важных токенов) с помощью LASSO (feature_selection='lasso', можно также 'none', чтобы не производить отбор). Количество признаков равно n_features (10 по умолчанию).\n",
778 | "4. Обучаем на этом датасете объясняемую модель (линейную с весами, способ подсчета весов -- косинусное расстояние по умолчанию, также можно и свою функцию или название расстояния из sklearn.metrics.pairwise_distances). \n",
779 | "5. После этого веса линейной модели и являются интерпретацией.\n",
780 | "\n",
781 | "tips: force_order отвечает за то, использовать ли признаки как мешок слов (force_order=False) или важен их порядок (force_order=True)."
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": 25,
787 | "metadata": {},
788 | "outputs": [],
789 | "source": [
790 | "lime = LimeTextExplainer(automl, feature_selection='lasso', force_order=False)"
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": 26,
796 | "metadata": {},
797 | "outputs": [
798 | {
799 | "name": "stderr",
800 | "output_type": "stream",
801 | "text": [
802 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.01it/s]\n",
803 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.30it/s]\n",
804 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.07it/s]\n"
805 | ]
806 | },
807 | {
808 | "data": {
809 | "text/html": [
810 | "Добрый день! Я являюсь счастливым обладателем кредитной карты данного банка. 20го числа должен был произвести пополнение карты для того, чтобы полностью воспользоваться льготным периодом кредитования.
"
811 | ]
812 | },
813 | "metadata": {},
814 | "output_type": "display_data"
815 | }
816 | ],
817 | "source": [
818 | "instance = test.iloc[0] # объект для интерпретации\n",
819 | "exp = lime.explain_instance(instance, labels=(0, 1), perturb_column='Message')\n",
820 | "exp.visualize_in_notebook(label=1)"
821 | ]
822 | },
823 | {
824 | "cell_type": "code",
825 | "execution_count": 27,
826 | "metadata": {},
827 | "outputs": [
828 | {
829 | "name": "stderr",
830 | "output_type": "stream",
831 | "text": [
832 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.36it/s]\n",
833 | "test: 100%|██████████| 313/313 [00:05<00:00, 57.73it/s]\n",
834 | "test: 100%|██████████| 313/313 [00:05<00:00, 54.12it/s]\n"
835 | ]
836 | },
837 | {
838 | "data": {
839 | "text/html": [
840 | "Я просто в шоке от этого банка!!! После ошибки сотрудников возникла спорная ситуация и я, как добропорядочный клиент обратилась в отделение и оформила претензию лично (19.09.16) и на этом сайте (21.09.16) для более оперативного реагирования. Мне обещали рассмотреть мои обращения в течении 14 рабочих дней.
"
841 | ]
842 | },
843 | "metadata": {},
844 | "output_type": "display_data"
845 | }
846 | ],
847 | "source": [
848 | "instance = test.iloc[-1] # объект для интерпретации\n",
849 | "exp = lime.explain_instance(instance, labels=(0, 1), perturb_column='Message')\n",
850 | "exp.visualize_in_notebook(label=1)"
851 | ]
852 | },
853 | {
854 | "cell_type": "markdown",
855 | "metadata": {},
856 | "source": [
857 | "## Отчет"
858 | ]
859 | },
860 | {
861 | "cell_type": "code",
862 | "execution_count": null,
863 | "metadata": {
864 | "scrolled": true
865 | },
866 | "outputs": [],
867 | "source": [
868 | "RD = ReportDecoNLP(output_path='NLP_REPORT', \n",
869 | " report_file_name='report_nlp.html')\n",
870 | "\n",
871 | "roles = {'target': 'IsGood',\n",
872 | " 'text': ['BankName', 'Message'],\n",
873 | " 'drop': ['MessageRecognized', 'WER']}\n",
874 | "\n",
875 | "task = Task('binary')\n",
876 | "\n",
877 | "automl = TabularNLPAutoML(task = task, \n",
878 | " timeout = 3600,\n",
879 | " gpu_ids = '1',\n",
880 | " general_params = {'use_algos': ['linear_l2']},\n",
881 | " linear_pipeline_params = {'text_features': \"embed\"},\n",
882 | " text_params = {'lang': 'ru'},\n",
883 | " autonlp_params={'model_name': 'pooled_bert',\n",
884 | " 'transformer_params': {'model_params': {'pooling': 'cls'}}},\n",
885 | " verbose=2)\n",
886 | "\n",
887 | "automl_rd = RD(automl)\n",
888 | "\n",
889 | "oof_pred = automl_rd.fit_predict(train, roles=roles) \n",
890 | "not_nan = np.any(~np.isnan(oof_pred.data), axis=1)"
891 | ]
892 | },
893 | {
894 | "cell_type": "code",
895 | "execution_count": 29,
896 | "metadata": {},
897 | "outputs": [
898 | {
899 | "name": "stdout",
900 | "output_type": "stream",
901 | "text": [
902 | "AUC OOF score: 0.8608724490033035\n"
903 | ]
904 | }
905 | ],
906 | "source": [
907 | "print('AUC OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))"
908 | ]
909 | },
910 | {
911 | "cell_type": "code",
912 | "execution_count": 30,
913 | "metadata": {},
914 | "outputs": [
915 | {
916 | "name": "stderr",
917 | "output_type": "stream",
918 | "text": [
919 | "100%|██████████| 3/3 [00:16<00:00, 5.45s/it]\n"
920 | ]
921 | },
922 | {
923 | "name": "stdout",
924 | "output_type": "stream",
925 | "text": [
926 | "Feature concated__BankName__Message transformed\n",
927 | "AUC TEST score: 0.8713822510070556\n",
928 | "CPU times: user 13.9 s, sys: 4.86 s, total: 18.7 s\n",
929 | "Wall time: 23.2 s\n"
930 | ]
931 | }
932 | ],
933 | "source": [
934 | "%%time \n",
935 | "\n",
936 | "test_pred = automl_rd.predict(test)\n",
937 | "print('AUC TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))"
938 | ]
939 | },
940 | {
941 | "cell_type": "markdown",
942 | "metadata": {},
943 | "source": [
944 | "Отчет лежит [здесь](./NLP_REPORT/report_nlp.html)."
945 | ]
946 | },
947 | {
948 | "cell_type": "markdown",
949 | "metadata": {},
950 | "source": [
951 | "## Сохранение модели"
952 | ]
953 | },
954 | {
955 | "cell_type": "code",
956 | "execution_count": 31,
957 | "metadata": {},
958 | "outputs": [],
959 | "source": [
960 | "with open('LAMA_model.pkl', 'wb') as f:\n",
961 | " pickle.dump(automl_rd, f)"
962 | ]
963 | },
964 | {
965 | "cell_type": "markdown",
966 | "metadata": {},
967 | "source": [
968 | "## Больше материалов\n",
969 | "\n",
970 | "* Github [LightAutoML](https://github.com/sberbank-ai-lab/LightAutoML) со ссылками на все материалы.\n",
971 | "* Канал [LAMA](https://t.me/lightautoml) в Telegram.\n",
972 | "* Примеры на Kaggle с использованием текстового функционала в условии отсутствия доступа к интернету во время инференса: [обучение](https://www.kaggle.com/simakov/lama-bert-starter) и [инференс](https://www.kaggle.com/simakov/lama-bert-inference)."
973 | ]
974 | }
975 | ],
976 | "metadata": {
977 | "kernelspec": {
978 | "display_name": "Python 3",
979 | "language": "python",
980 | "name": "python3"
981 | },
982 | "language_info": {
983 | "codemirror_mode": {
984 | "name": "ipython",
985 | "version": 3
986 | },
987 | "file_extension": ".py",
988 | "mimetype": "text/x-python",
989 | "name": "python",
990 | "nbconvert_exporter": "python",
991 | "pygments_lexer": "ipython3",
992 | "version": "3.7.6"
993 | }
994 | },
995 | "nbformat": 4,
996 | "nbformat_minor": 5
997 | }
998 |
--------------------------------------------------------------------------------
/LightAutoML demo (Whitebox).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## AutoWoE (WhiteBox модель для бинарной классификации на табличных данных)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Скор карта"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | ""
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Линейная модель"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | ""
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Дискретизация"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | ""
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "### Отбор и одномерный анализ"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | ""
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "## Whitebox pipeline:\n",
71 | "\n",
72 | "### Общие параметры:\n",
73 | "\n",
74 | " - n_jobs\n",
75 | " - debug\n",
76 | "\n",
77 | "### 0) Простая разметка типов и выброс мусора\n",
78 | "#### 0.0) Удаление мусорных фичей\n",
79 | " \n",
80 | " Medium:\n",
81 | " - th_nan \n",
82 | " - th_const \n",
83 | " \n",
84 | "#### 0.1) Разметка типов (авто или пользовательская)\n",
85 | " \n",
86 | " Critical:\n",
87 | " - features_type (dict) {'age': 'real', 'education': 'cat', 'birth_date': (None, (\"d\", \"wd\")), ...}\n",
88 | " \n",
89 | "#### 0.2) Кодирование категорий и дат\n",
90 | " \n",
91 | " Critical:\n",
92 | " - features_type (for datetimes)\n",
93 | " \n",
94 | " Optional:\n",
95 | " - cat_alpha (int) - greater means more conservative encoding\n",
96 | " \n",
97 | " \n",
98 | "### 1) Первая стадия отбора (отбор на основе важности BlackBox модели)\n",
99 | "\n",
100 | " Critical:\n",
101 | " - select_type (None or int)\n",
102 | " - imp_type (if type(select_type) is int 'perm_imp'/'feature_imp') \n",
103 | " \n",
104 | " Optional:\n",
105 | " - imp_th (float) - threshold for select_type is None\n",
106 | " \n",
107 | "### 2) Биннинг (дискретизация):\n",
108 | " \n",
109 | " Critical:\n",
110 | " - monotonic / features_monotone_constraints \n",
111 | " - max_bin_count / max_bin_count\n",
112 | " - min_bin_size\n",
113 | " \n",
114 | " - cat_merge_to\n",
115 | " - nan_merge_to\n",
116 | " \n",
117 | " Medium:\n",
118 | " - force_single_split\n",
119 | " \n",
120 | " Optional:\n",
121 | " - min_bin_mults\n",
122 | " - min_gains_to_split\n",
123 | "\n",
124 | "### 3) WoE оценки WoE = LN( ((% 0 in bin) / (% 0 in sample)) / ((% 1 in bin) / (% 1 in sample)) ):\n",
125 | " \n",
126 | " Critical:\n",
127 | " - oof_woe\n",
128 | " \n",
129 | " Optional:\n",
130 | " - woe_diff_th\n",
131 | " - n_folds (if oof_woe)\n",
132 | "\n",
133 | "### 4) Вторая стадия отбора:\n",
134 | "\n",
135 | "#### 4.0) Одномерная зависимость с таргетом\n",
136 | " \n",
137 | " Critical:\n",
138 | " - auc_th\n",
139 | " \n",
140 | "#### 4.1) VIF \n",
141 | " \n",
142 | " Critical:\n",
143 | " - vif_th\n",
144 | " \n",
145 | "#### 4.2) Частные корреляции\n",
146 | " \n",
147 | " Critical:\n",
148 | " - pearson_th\n",
149 | " \n",
150 | "### 5) Третья стадия отбора (на основе модели)\n",
151 | " \n",
152 | " Optional:\n",
153 | " - n_folds\n",
154 | " - l1_base_step\n",
155 | " - l1_exp_step\n",
156 | " \n",
157 | " Do not touch:\n",
158 | " - population_size\n",
159 | " - feature_groups_count\n",
160 | "\n",
161 | "\n",
162 | "### 6) Обучение финальной модели:\n",
163 | "\n",
164 | " Critical:\n",
165 | " - regularized_refit\n",
166 | " - p_val (if not regularized_refit)\n",
167 | " - validation (if not regularized_refit)\n",
168 | " \n",
169 | " Optional:\n",
170 | " - interpreted_model\n",
171 | " - l1_base_step (if regularized_refit)\n",
172 | " - l1_exp_step (if regularized_refit)\n",
173 | " \n",
174 | "### 7) Создание отчета\n",
175 | "\n",
176 | " - report_params"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "### Импорты"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 1,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "import pandas as pd\n",
193 | "from pandas import Series, DataFrame\n",
194 | "\n",
195 | "import numpy as np\n",
196 | "\n",
197 | "import joblib\n",
198 | "\n",
199 | "from sklearn.model_selection import train_test_split\n",
200 | "from sklearn.metrics import roc_auc_score\n",
201 | "\n",
202 | "from autowoe import AutoWoE, ReportDeco"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "### Чтение данных и train/test split"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 2,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "data = pd.read_csv('example_data/jobs_train.csv')"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 3,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/html": [
229 | "\n",
230 | "\n",
243 | "
\n",
244 | " \n",
245 | " \n",
246 | " | \n",
247 | " enrollee_id | \n",
248 | " city | \n",
249 | " city_development_index | \n",
250 | " gender | \n",
251 | " relevent_experience | \n",
252 | " enrolled_university | \n",
253 | " education_level | \n",
254 | " major_discipline | \n",
255 | " experience | \n",
256 | " company_size | \n",
257 | " company_type | \n",
258 | " last_new_job | \n",
259 | " training_hours | \n",
260 | " target | \n",
261 | "
\n",
262 | " \n",
263 | " \n",
264 | " \n",
265 | " 0 | \n",
266 | " 8949 | \n",
267 | " city_103 | \n",
268 | " 0.920 | \n",
269 | " Male | \n",
270 | " Has relevent experience | \n",
271 | " no_enrollment | \n",
272 | " Graduate | \n",
273 | " STEM | \n",
274 | " 21.0 | \n",
275 | " NaN | \n",
276 | " NaN | \n",
277 | " 1.0 | \n",
278 | " 36 | \n",
279 | " 1.0 | \n",
280 | "
\n",
281 | " \n",
282 | " 1 | \n",
283 | " 29725 | \n",
284 | " city_40 | \n",
285 | " 0.776 | \n",
286 | " Male | \n",
287 | " No relevent experience | \n",
288 | " no_enrollment | \n",
289 | " Graduate | \n",
290 | " STEM | \n",
291 | " 15.0 | \n",
292 | " 99.0 | \n",
293 | " Pvt Ltd | \n",
294 | " 5.0 | \n",
295 | " 47 | \n",
296 | " 0.0 | \n",
297 | "
\n",
298 | " \n",
299 | " 2 | \n",
300 | " 11561 | \n",
301 | " city_21 | \n",
302 | " 0.624 | \n",
303 | " NaN | \n",
304 | " No relevent experience | \n",
305 | " Full time course | \n",
306 | " Graduate | \n",
307 | " STEM | \n",
308 | " 5.0 | \n",
309 | " NaN | \n",
310 | " NaN | \n",
311 | " 0.0 | \n",
312 | " 83 | \n",
313 | " 0.0 | \n",
314 | "
\n",
315 | " \n",
316 | " 3 | \n",
317 | " 33241 | \n",
318 | " city_115 | \n",
319 | " 0.789 | \n",
320 | " NaN | \n",
321 | " No relevent experience | \n",
322 | " NaN | \n",
323 | " Graduate | \n",
324 | " Business Degree | \n",
325 | " 0.0 | \n",
326 | " NaN | \n",
327 | " Pvt Ltd | \n",
328 | " 0.0 | \n",
329 | " 52 | \n",
330 | " 1.0 | \n",
331 | "
\n",
332 | " \n",
333 | " 4 | \n",
334 | " 666 | \n",
335 | " city_162 | \n",
336 | " 0.767 | \n",
337 | " Male | \n",
338 | " Has relevent experience | \n",
339 | " no_enrollment | \n",
340 | " Masters | \n",
341 | " STEM | \n",
342 | " 21.0 | \n",
343 | " 99.0 | \n",
344 | " Funded Startup | \n",
345 | " 4.0 | \n",
346 | " 8 | \n",
347 | " 0.0 | \n",
348 | "
\n",
349 | " \n",
350 | " ... | \n",
351 | " ... | \n",
352 | " ... | \n",
353 | " ... | \n",
354 | " ... | \n",
355 | " ... | \n",
356 | " ... | \n",
357 | " ... | \n",
358 | " ... | \n",
359 | " ... | \n",
360 | " ... | \n",
361 | " ... | \n",
362 | " ... | \n",
363 | " ... | \n",
364 | " ... | \n",
365 | "
\n",
366 | " \n",
367 | " 19153 | \n",
368 | " 7386 | \n",
369 | " city_173 | \n",
370 | " 0.878 | \n",
371 | " Male | \n",
372 | " No relevent experience | \n",
373 | " no_enrollment | \n",
374 | " Graduate | \n",
375 | " Humanities | \n",
376 | " 14.0 | \n",
377 | " NaN | \n",
378 | " NaN | \n",
379 | " 1.0 | \n",
380 | " 42 | \n",
381 | " 1.0 | \n",
382 | "
\n",
383 | " \n",
384 | " 19154 | \n",
385 | " 31398 | \n",
386 | " city_103 | \n",
387 | " 0.920 | \n",
388 | " Male | \n",
389 | " Has relevent experience | \n",
390 | " no_enrollment | \n",
391 | " Graduate | \n",
392 | " STEM | \n",
393 | " 14.0 | \n",
394 | " NaN | \n",
395 | " NaN | \n",
396 | " 4.0 | \n",
397 | " 52 | \n",
398 | " 1.0 | \n",
399 | "
\n",
400 | " \n",
401 | " 19155 | \n",
402 | " 24576 | \n",
403 | " city_103 | \n",
404 | " 0.920 | \n",
405 | " Male | \n",
406 | " Has relevent experience | \n",
407 | " no_enrollment | \n",
408 | " Graduate | \n",
409 | " STEM | \n",
410 | " 21.0 | \n",
411 | " 99.0 | \n",
412 | " Pvt Ltd | \n",
413 | " 4.0 | \n",
414 | " 44 | \n",
415 | " 0.0 | \n",
416 | "
\n",
417 | " \n",
418 | " 19156 | \n",
419 | " 5756 | \n",
420 | " city_65 | \n",
421 | " 0.802 | \n",
422 | " Male | \n",
423 | " Has relevent experience | \n",
424 | " no_enrollment | \n",
425 | " High School | \n",
426 | " NaN | \n",
427 | " 0.0 | \n",
428 | " 999.0 | \n",
429 | " Pvt Ltd | \n",
430 | " 2.0 | \n",
431 | " 97 | \n",
432 | " 0.0 | \n",
433 | "
\n",
434 | " \n",
435 | " 19157 | \n",
436 | " 23834 | \n",
437 | " city_67 | \n",
438 | " 0.855 | \n",
439 | " NaN | \n",
440 | " No relevent experience | \n",
441 | " no_enrollment | \n",
442 | " Primary School | \n",
443 | " NaN | \n",
444 | " 2.0 | \n",
445 | " NaN | \n",
446 | " NaN | \n",
447 | " 1.0 | \n",
448 | " 127 | \n",
449 | " 0.0 | \n",
450 | "
\n",
451 | " \n",
452 | "
\n",
453 | "
19158 rows × 14 columns
\n",
454 | "
"
455 | ],
456 | "text/plain": [
457 | " enrollee_id city city_development_index gender \\\n",
458 | "0 8949 city_103 0.920 Male \n",
459 | "1 29725 city_40 0.776 Male \n",
460 | "2 11561 city_21 0.624 NaN \n",
461 | "3 33241 city_115 0.789 NaN \n",
462 | "4 666 city_162 0.767 Male \n",
463 | "... ... ... ... ... \n",
464 | "19153 7386 city_173 0.878 Male \n",
465 | "19154 31398 city_103 0.920 Male \n",
466 | "19155 24576 city_103 0.920 Male \n",
467 | "19156 5756 city_65 0.802 Male \n",
468 | "19157 23834 city_67 0.855 NaN \n",
469 | "\n",
470 | " relevent_experience enrolled_university education_level \\\n",
471 | "0 Has relevent experience no_enrollment Graduate \n",
472 | "1 No relevent experience no_enrollment Graduate \n",
473 | "2 No relevent experience Full time course Graduate \n",
474 | "3 No relevent experience NaN Graduate \n",
475 | "4 Has relevent experience no_enrollment Masters \n",
476 | "... ... ... ... \n",
477 | "19153 No relevent experience no_enrollment Graduate \n",
478 | "19154 Has relevent experience no_enrollment Graduate \n",
479 | "19155 Has relevent experience no_enrollment Graduate \n",
480 | "19156 Has relevent experience no_enrollment High School \n",
481 | "19157 No relevent experience no_enrollment Primary School \n",
482 | "\n",
483 | " major_discipline experience company_size company_type \\\n",
484 | "0 STEM 21.0 NaN NaN \n",
485 | "1 STEM 15.0 99.0 Pvt Ltd \n",
486 | "2 STEM 5.0 NaN NaN \n",
487 | "3 Business Degree 0.0 NaN Pvt Ltd \n",
488 | "4 STEM 21.0 99.0 Funded Startup \n",
489 | "... ... ... ... ... \n",
490 | "19153 Humanities 14.0 NaN NaN \n",
491 | "19154 STEM 14.0 NaN NaN \n",
492 | "19155 STEM 21.0 99.0 Pvt Ltd \n",
493 | "19156 NaN 0.0 999.0 Pvt Ltd \n",
494 | "19157 NaN 2.0 NaN NaN \n",
495 | "\n",
496 | " last_new_job training_hours target \n",
497 | "0 1.0 36 1.0 \n",
498 | "1 5.0 47 0.0 \n",
499 | "2 0.0 83 0.0 \n",
500 | "3 0.0 52 1.0 \n",
501 | "4 4.0 8 0.0 \n",
502 | "... ... ... ... \n",
503 | "19153 1.0 42 1.0 \n",
504 | "19154 4.0 52 1.0 \n",
505 | "19155 4.0 44 0.0 \n",
506 | "19156 2.0 97 0.0 \n",
507 | "19157 1.0 127 0.0 \n",
508 | "\n",
509 | "[19158 rows x 14 columns]"
510 | ]
511 | },
512 | "execution_count": 3,
513 | "metadata": {},
514 | "output_type": "execute_result"
515 | }
516 | ],
517 | "source": [
518 | "data"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": 4,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "train, test = train_test_split(data.drop('enrollee_id', axis=1), test_size=0.2, stratify=data['target'])"
528 | ]
529 | },
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "### AutoWoe: настройки по умолчанию"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 5,
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "auto_woe_0 = AutoWoE(interpreted_model=True,\n",
544 | " monotonic=False,\n",
545 | " max_bin_count=5,\n",
546 | " select_type=None,\n",
547 | " pearson_th=0.9,\n",
548 | " auc_th=.505,\n",
549 | " vif_th=10.,\n",
550 | " imp_th=0,\n",
551 | " th_const=32,\n",
552 | " force_single_split=True,\n",
553 | " th_nan=0.01,\n",
554 | " th_cat=0.005,\n",
555 | " auc_tol=1e-4,\n",
556 | " cat_alpha=100,\n",
557 | " cat_merge_to=\"to_woe_0\",\n",
558 | " nan_merge_to=\"to_woe_0\",\n",
559 | " imp_type=\"feature_imp\",\n",
560 | " regularized_refit=False,\n",
561 | " p_val=0.05,\n",
562 | " verbose=2\n",
563 | " )\n",
564 | "\n",
565 | "auto_woe_0 = ReportDeco(auto_woe_0, )"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 6,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "name": "stdout",
575 | "output_type": "stream",
576 | "text": [
577 | "city processing...\n",
578 | "city_development_index processing...\n",
579 | "gender processing...\n",
580 | "relevent_experience processing...\n",
581 | "enrolled_university processing...\n",
582 | "education_level processing...\n",
583 | "experience processing...\n",
584 | "company_size processing...\n",
585 | "company_type processing...\n",
586 | "last_new_job processing...\n",
587 | "training_hours processing...\n",
588 | "dict_keys(['city', 'city_development_index', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'experience', 'company_size', 'company_type', 'last_new_job', 'training_hours']) to selector !!!!!\n",
589 | "Feature selection...\n",
590 | "city_development_index -0.974107\n",
591 | "company_size -0.795953\n",
592 | "company_type -0.400146\n",
593 | "experience -0.184238\n",
594 | "enrolled_university -0.251287\n",
595 | "education_level -1.188926\n",
596 | "dtype: float64\n"
597 | ]
598 | }
599 | ],
600 | "source": [
601 | "auto_woe_0.fit(train,\n",
602 | " target_name=\"target\",\n",
603 | " )"
604 | ]
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": 7,
609 | "metadata": {},
610 | "outputs": [
611 | {
612 | "data": {
613 | "text/plain": [
614 | "array([0.06265852, 0.56483877, 0.04151965, ..., 0.15191705, 0.08528486,\n",
615 | " 0.0409943 ])"
616 | ]
617 | },
618 | "execution_count": 7,
619 | "metadata": {},
620 | "output_type": "execute_result"
621 | }
622 | ],
623 | "source": [
624 | "test_prediction = auto_woe_0.predict_proba(test)\n",
625 | "test_prediction"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 8,
631 | "metadata": {},
632 | "outputs": [
633 | {
634 | "data": {
635 | "text/plain": [
636 | "0.8034365349304012"
637 | ]
638 | },
639 | "execution_count": 8,
640 | "metadata": {},
641 | "output_type": "execute_result"
642 | }
643 | ],
644 | "source": [
645 | "roc_auc_score(test['target'].values, test_prediction)"
646 | ]
647 | },
648 | {
649 | "cell_type": "code",
650 | "execution_count": 9,
651 | "metadata": {},
652 | "outputs": [
653 | {
654 | "name": "stderr",
655 | "output_type": "stream",
656 | "text": [
657 | "No handles with labels found to put in legend.\n",
658 | "No handles with labels found to put in legend.\n",
659 | "No handles with labels found to put in legend.\n",
660 | "No handles with labels found to put in legend.\n",
661 | "No handles with labels found to put in legend.\n",
662 | "No handles with labels found to put in legend.\n",
663 | "No handles with labels found to put in legend.\n"
664 | ]
665 | }
666 | ],
667 | "source": [
668 | "report_params = {\"output_path\": \"HR_REPORT_1\", # папка, куда сгенерится отчет и сложатся нужные файлы\n",
669 | " \"report_name\": \"WHITEBOX REPORT\",\n",
670 | " \"report_version_id\": 1,\n",
671 | " \"city\": \"Moscow\",\n",
672 | " \"model_aim\": \"Predict if candidate will work for the company\",\n",
673 | " \"model_name\": \"HR model\",\n",
674 | " \"zakazchik\": \"Kaggle\",\n",
675 | " \"high_level_department\": \"Ai Lab\",\n",
676 | " \"ds_name\": \"Btbpanda\",\n",
677 | " \"target_descr\": \"Candidate will work for the company\",\n",
678 | " \"non_target_descr\": \"Candidate will work for the company\"}\n",
679 | "\n",
680 | "auto_woe_0.generate_report(report_params, )"
681 | ]
682 | },
683 | {
684 | "cell_type": "markdown",
685 | "metadata": {},
686 | "source": [
687 | "### AutoWoE - более консервативная модель"
688 | ]
689 | },
690 | {
691 | "cell_type": "code",
692 | "execution_count": 10,
693 | "metadata": {},
694 | "outputs": [],
695 | "source": [
696 | "auto_woe_1 = AutoWoE(interpreted_model=True,\n",
697 | " monotonic=True,\n",
698 | " max_bin_count=4,\n",
699 | " select_type=None,\n",
700 | " pearson_th=0.9,\n",
701 | " auc_th=.505,\n",
702 | " vif_th=10.,\n",
703 | " imp_th=0,\n",
704 | " th_const=32,\n",
705 | " force_single_split=True,\n",
706 | " th_nan=0.01,\n",
707 | " th_cat=0.005,\n",
708 | " auc_tol=1e-4,\n",
709 | " cat_alpha=100,\n",
710 | " cat_merge_to=\"to_woe_0\",\n",
711 | " nan_merge_to=\"to_woe_0\",\n",
712 | " imp_type=\"feature_imp\",\n",
713 | " regularized_refit=False,\n",
714 | " p_val=0.05,\n",
715 | " verbose=2\n",
716 | " )\n",
717 | "\n",
718 | "auto_woe_1 = ReportDeco(auto_woe_1, )"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 11,
724 | "metadata": {},
725 | "outputs": [
726 | {
727 | "name": "stdout",
728 | "output_type": "stream",
729 | "text": [
730 | "city processing...city_development_index processing...\n",
731 | "\n",
732 | "gender processing...\n",
733 | "relevent_experience processing...\n",
734 | "enrolled_university processing...education_level processing...\n",
735 | "\n",
736 | "experience processing...company_type processing...company_size processing...\n",
737 | "\n",
738 | "\n",
739 | "last_new_job processing...\n",
740 | "training_hours processing...\n",
741 | "dict_keys(['city', 'city_development_index', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'experience', 'company_size', 'company_type', 'last_new_job', 'training_hours']) to selector !!!!!\n",
742 | "Feature selection...\n",
743 | "city -0.516274\n",
744 | "city_development_index -0.512608\n",
745 | "company_size -0.814922\n",
746 | "company_type -0.397978\n",
747 | "experience -0.175231\n",
748 | "enrolled_university -0.219507\n",
749 | "education_level -1.239627\n",
750 | "dtype: float64\n"
751 | ]
752 | }
753 | ],
754 | "source": [
755 | "auto_woe_1.fit(train,\n",
756 | " target_name=\"target\",\n",
757 | " )"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": 12,
763 | "metadata": {},
764 | "outputs": [
765 | {
766 | "data": {
767 | "text/plain": [
768 | "array([0.06460692, 0.57321671, 0.0497262 , ..., 0.13746553, 0.07190761,\n",
769 | " 0.04153373])"
770 | ]
771 | },
772 | "execution_count": 12,
773 | "metadata": {},
774 | "output_type": "execute_result"
775 | }
776 | ],
777 | "source": [
778 | "test_prediction = auto_woe_1.predict_proba(test)\n",
779 | "test_prediction"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "execution_count": 13,
785 | "metadata": {},
786 | "outputs": [
787 | {
788 | "data": {
789 | "text/plain": [
790 | "0.8019815944109903"
791 | ]
792 | },
793 | "execution_count": 13,
794 | "metadata": {},
795 | "output_type": "execute_result"
796 | }
797 | ],
798 | "source": [
799 | "roc_auc_score(test['target'].values, test_prediction)"
800 | ]
801 | },
802 | {
803 | "cell_type": "code",
804 | "execution_count": 14,
805 | "metadata": {},
806 | "outputs": [
807 | {
808 | "name": "stderr",
809 | "output_type": "stream",
810 | "text": [
811 | "No handles with labels found to put in legend.\n",
812 | "No handles with labels found to put in legend.\n",
813 | "No handles with labels found to put in legend.\n",
814 | "No handles with labels found to put in legend.\n",
815 | "No handles with labels found to put in legend.\n",
816 | "No handles with labels found to put in legend.\n",
817 | "No handles with labels found to put in legend.\n",
818 | "No handles with labels found to put in legend.\n"
819 | ]
820 | }
821 | ],
822 | "source": [
823 | "report_params = {\"output_path\": \"HR_REPORT_2\", # папка, куда сгенерится отчет и сложатся нужные файлы\n",
824 | " \"report_name\": \"WHITEBOX REPORT\",\n",
825 | " \"report_version_id\": 2,\n",
826 | " \"city\": \"Moscow\",\n",
827 | " \"model_aim\": \"Predict if candidate will work for the company\",\n",
828 | " \"model_name\": \"HR model\",\n",
829 | " \"zakazchik\": \"Kaggle\",\n",
830 | " \"high_level_department\": \"Ai Lab\",\n",
831 | " \"ds_name\": \"Btbpanda\",\n",
832 | " \"target_descr\": \"Candidate will work for the company\",\n",
833 | " \"non_target_descr\": \"Candidate will work for the company\"}\n",
834 | "\n",
835 | "auto_woe_1.generate_report(report_params, )"
836 | ]
837 | },
838 | {
839 | "cell_type": "markdown",
840 | "metadata": {},
841 | "source": [
842 | "### WhiteBox preset - использование по аналогии с TabularAutoML"
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": 15,
848 | "metadata": {},
849 | "outputs": [],
850 | "source": [
851 | "from lightautoml.automl.presets.whitebox_presets import WhiteBoxPreset\n",
852 | "from lightautoml import Task"
853 | ]
854 | },
855 | {
856 | "cell_type": "code",
857 | "execution_count": 16,
858 | "metadata": {},
859 | "outputs": [],
860 | "source": [
861 | "task = Task('binary')\n",
862 | "automl = WhiteBoxPreset(task)"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 17,
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "name": "stderr",
872 | "output_type": "stream",
873 | "text": [
874 | "Validation data is not set. Train will be used as valid in report and valid prediction\n"
875 | ]
876 | },
877 | {
878 | "name": "stdout",
879 | "output_type": "stream",
880 | "text": [
881 | "Start automl preset with listed constraints:\n",
882 | "- time: 3600 seconds\n",
883 | "- cpus: 4 cores\n",
884 | "- memory: 16 gb\n",
885 | "\n",
886 | "Train data shape: (15326, 13)\n",
887 | "Feats was rejected during automatic roles guess: []\n",
888 | "\n",
889 | "\n",
890 | "Layer 1 ...\n",
891 | "Train process start. Time left 3595.0072581768036 secs\n",
892 | "Start fitting Lvl_0_Pipe_0_Mod_0_WhiteBox ...\n",
893 | "\n",
894 | "===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_WhiteBox =====\n",
895 | "\n",
896 | " features [] contain too many nans or identical values\n",
897 | " features [] have low importance\n",
898 | "city processing...\n",
899 | "city_development_index processing...company_type processing...education_level processing...\n",
900 | "\n",
901 | "\n",
902 | "enrolled_university processing...\n",
903 | "gender processing...\n",
904 | "major_discipline processing...\n",
905 | "relevent_experience processing...\n",
906 | "company_size processing...\n",
907 | "experience processing...\n",
908 | "last_new_job processing...\n",
909 | "training_hours processing...\n",
910 | "dict_keys(['city', 'city_development_index', 'company_type', 'education_level', 'enrolled_university', 'gender', 'major_discipline', 'relevent_experience', 'company_size', 'experience', 'last_new_job', 'training_hours']) to selector !!!!!\n",
911 | "Feature selection...\n",
912 | "Feature training_hours removed due to low AUC value 0.5031265374717342\n",
913 | "Feature city_development_index removed due to high VIF value = 40.56438648184099\n",
914 | "C parameter range in [0.0002603488674824265:260.3488674824265], 20 values\n",
915 | "Result(score=0.7856775296767177, reg_alpha=0.020431136952654548, is_neg=True, min_weights=city -0.980620\n",
916 | "company_size -0.800535\n",
917 | "company_type -0.340185\n",
918 | "experience -0.198176\n",
919 | "enrolled_university -0.101047\n",
920 | "relevent_experience 0.000000\n",
921 | "education_level -0.624324\n",
922 | "last_new_job 0.000000\n",
923 | "gender 0.000000\n",
924 | "major_discipline -0.317699\n",
925 | "dtype: float64)\n",
926 | "Iter 0 of final refit starts with 7 features\n",
927 | "Validation data checks\n",
928 | "city -0.956550\n",
929 | "company_size -0.866063\n",
930 | "company_type -0.402941\n",
931 | "experience -0.329493\n",
932 | "enrolled_university -0.230776\n",
933 | "education_level -0.641994\n",
934 | "major_discipline -1.596907\n",
935 | "dtype: float64\n",
936 | "Lvl_0_Pipe_0_Mod_0_WhiteBox fitting and predicting completed\n",
937 | "Time left 3587.2280378341675\n",
938 | "\n",
939 | "Automl preset training completed in 12.77 seconds.\n"
940 | ]
941 | }
942 | ],
943 | "source": [
944 | "\n",
945 | "train_pred = automl.fit_predict(train.reset_index(drop=True), roles={'target': 'target'})"
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": 18,
951 | "metadata": {},
952 | "outputs": [],
953 | "source": [
954 | "test_prediction = automl.predict(test).data[:, 0]"
955 | ]
956 | },
957 | {
958 | "cell_type": "code",
959 | "execution_count": 19,
960 | "metadata": {},
961 | "outputs": [
962 | {
963 | "data": {
964 | "text/plain": [
965 | "0.7966826628232216"
966 | ]
967 | },
968 | "execution_count": 19,
969 | "metadata": {},
970 | "output_type": "execute_result"
971 | }
972 | ],
973 | "source": [
974 | "roc_auc_score(test['target'].values, test_prediction)"
975 | ]
976 | },
977 | {
978 | "cell_type": "markdown",
979 | "metadata": {},
980 | "source": [
981 | "### Сериализация модели\n",
982 | "\n",
983 | "Важно: auto_woe_1 фактически является объектом ReportDeco (отчетом), а не AutoWoE. Чтобы получить AutoWoE, надо обратиться к атрибуту .model.\n",
984 | "\n",
985 | "ReportDeco не рекомендуется использовать на стадии инференса. Отчет требует наличия целевой переменной в датасете для предсказания, так как считает метрики качества. Также инференс через объект-отчет выполняется намного дольше из-за собственно построения отчета."
986 | ]
987 | },
988 | {
989 | "cell_type": "code",
990 | "execution_count": 20,
991 | "metadata": {},
992 | "outputs": [],
993 | "source": [
994 | "joblib.dump(auto_woe_1.model, 'model.pkl')\n",
995 | "model = joblib.load('model.pkl')"
996 | ]
997 | },
998 | {
999 | "cell_type": "markdown",
1000 | "metadata": {},
1001 | "source": [
1002 | "### SQL запрос для инференса"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "code",
1007 | "execution_count": 21,
1008 | "metadata": {},
1009 | "outputs": [
1010 | {
1011 | "name": "stdout",
1012 | "output_type": "stream",
1013 | "text": [
1014 | "SELECT\n",
1015 | " 1 / (1 + EXP(-(\n",
1016 | " -1.111\n",
1017 | " -0.516*WOE_TAB.city\n",
1018 | " -0.513*WOE_TAB.city_development_index\n",
1019 | " -0.815*WOE_TAB.company_size\n",
1020 | " -0.398*WOE_TAB.company_type\n",
1021 | " -0.175*WOE_TAB.experience\n",
1022 | " -0.22*WOE_TAB.enrolled_university\n",
1023 | " -1.24*WOE_TAB.education_level\n",
1024 | " ))) as PROB,\n",
1025 | " WOE_TAB.*\n",
1026 | "FROM \n",
1027 | " (SELECT\n",
1028 | " CASE\n",
1029 | " WHEN (city IS NULL OR LOWER(CAST(city AS VARCHAR(50))) = 'nan') THEN 0\n",
1030 | " WHEN city IN ('city_100', 'city_102', 'city_103', 'city_116', 'city_149', 'city_159', 'city_160', 'city_45', 'city_46', 'city_64', 'city_71', 'city_73', 'city_83', 'city_99') THEN 0.213\n",
1031 | " WHEN city IN ('city_104', 'city_114', 'city_136', 'city_138', 'city_16', 'city_173', 'city_23', 'city_28', 'city_36', 'city_50', 'city_57', 'city_61', 'city_65', 'city_67', 'city_75', 'city_97') THEN 1.017\n",
1032 | " WHEN city IN ('city_11', 'city_21', 'city_74') THEN -1.455\n",
1033 | " ELSE -0.209\n",
1034 | " END AS city,\n",
1035 | " CASE\n",
1036 | " WHEN (city_development_index IS NULL OR city_development_index = 'NaN') THEN 0\n",
1037 | " WHEN city_development_index <= 0.6245 THEN -1.454\n",
1038 | " WHEN city_development_index <= 0.7915 THEN -0.121\n",
1039 | " WHEN city_development_index <= 0.9235 THEN 0.461\n",
1040 | " ELSE 1.101\n",
1041 | " END AS city_development_index,\n",
1042 | " CASE\n",
1043 | " WHEN (company_size IS NULL OR company_size = 'NaN') THEN -0.717\n",
1044 | " WHEN company_size <= 74.0 THEN 0.221\n",
1045 | " ELSE 0.467\n",
1046 | " END AS company_size,\n",
1047 | " CASE\n",
1048 | " WHEN (company_type IS NULL OR LOWER(CAST(company_type AS VARCHAR(50))) = 'nan') THEN -0.64\n",
1049 | " WHEN company_type IN ('Early Stage Startup', 'NGO', 'Other', 'Public Sector') THEN 0.164\n",
1050 | " WHEN company_type = 'Funded Startup' THEN 0.737\n",
1051 | " WHEN company_type = 'Pvt Ltd' THEN 0.398\n",
1052 | " ELSE 0\n",
1053 | " END AS company_type,\n",
1054 | " CASE\n",
1055 | " WHEN (experience IS NULL OR experience = 'NaN') THEN 0\n",
1056 | " WHEN experience <= 1.5 THEN -0.811\n",
1057 | " WHEN experience <= 7.5 THEN -0.319\n",
1058 | " WHEN experience <= 11.5 THEN 0.119\n",
1059 | " ELSE 0.533\n",
1060 | " END AS experience,\n",
1061 | " CASE\n",
1062 | " WHEN (enrolled_university IS NULL OR LOWER(CAST(enrolled_university AS VARCHAR(50))) = 'nan') THEN -0.327\n",
1063 | " WHEN enrolled_university = 'Full time course' THEN -0.614\n",
1064 | " WHEN enrolled_university = 'Part time course' THEN 0.026\n",
1065 | " WHEN enrolled_university = 'no_enrollment' THEN 0.208\n",
1066 | " ELSE 0\n",
1067 | " END AS enrolled_university,\n",
1068 | " CASE\n",
1069 | " WHEN (education_level IS NULL OR LOWER(CAST(education_level AS VARCHAR(50))) = 'nan') THEN 0.21\n",
1070 | " WHEN education_level = 'Graduate' THEN -0.166\n",
1071 | " WHEN education_level = 'High School' THEN 0.34\n",
1072 | " WHEN education_level = 'Masters' THEN 0.21\n",
1073 | " WHEN education_level IN ('Phd', 'Primary School') THEN 0.704\n",
1074 | " ELSE 0\n",
1075 | " END AS education_level\n",
1076 | " FROM global_temp.TABLE_1) as WOE_TAB\n"
1077 | ]
1078 | }
1079 | ],
1080 | "source": [
1081 | "sql_query = model.get_sql_inference_query('global_temp.TABLE_1')\n",
1082 | "print(sql_query)"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "markdown",
1087 | "metadata": {},
1088 | "source": [
1089 | "### Проверка SQL с использованием PySpark"
1090 | ]
1091 | },
1092 | {
1093 | "cell_type": "code",
1094 | "execution_count": 23,
1095 | "metadata": {},
1096 | "outputs": [],
1097 | "source": [
1098 | "from pyspark.sql import SparkSession"
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "code",
1103 | "execution_count": null,
1104 | "metadata": {},
1105 | "outputs": [],
1106 | "source": [
1107 | "spark = SparkSession.builder \\\n",
1108 | " .master(\"local[2]\") \\\n",
1109 | " .appName(\"spark-course\") \\\n",
1110 | " .config(\"spark.driver.memory\", \"512m\") \\\n",
1111 | " .getOrCreate()\n",
1112 | "sc = spark.sparkContext"
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "code",
1117 | "execution_count": 24,
1118 | "metadata": {},
1119 | "outputs": [],
1120 | "source": [
1121 | "spark_df = spark.read.csv(\"jobs_train.csv\", header=True)\n",
1122 | "spark_df.createGlobalTempView(\"TABLE_1\")"
1123 | ]
1124 | },
1125 | {
1126 | "cell_type": "code",
1127 | "execution_count": 25,
1128 | "metadata": {},
1129 | "outputs": [],
1130 | "source": [
1131 | "res = spark.sql(sql_query).toPandas()"
1132 | ]
1133 | },
1134 | {
1135 | "cell_type": "code",
1136 | "execution_count": 26,
1137 | "metadata": {},
1138 | "outputs": [
1139 | {
1140 | "data": {
1141 | "text/html": [
1142 | "\n",
1143 | "\n",
1156 | "
\n",
1157 | " \n",
1158 | " \n",
1159 | " | \n",
1160 | " PROB | \n",
1161 | " city | \n",
1162 | " city_development_index | \n",
1163 | " company_size | \n",
1164 | " company_type | \n",
1165 | " experience | \n",
1166 | " enrolled_university | \n",
1167 | " education_level | \n",
1168 | "
\n",
1169 | " \n",
1170 | " \n",
1171 | " \n",
1172 | " 0 | \n",
1173 | " 0.365512 | \n",
1174 | " 0.213 | \n",
1175 | " 0.461 | \n",
1176 | " -0.717 | \n",
1177 | " -0.640 | \n",
1178 | " 0.533 | \n",
1179 | " 0.208 | \n",
1180 | " -0.166 | \n",
1181 | "
\n",
1182 | " \n",
1183 | " 1 | \n",
1184 | " 0.195716 | \n",
1185 | " -0.209 | \n",
1186 | " -0.121 | \n",
1187 | " 0.467 | \n",
1188 | " 0.398 | \n",
1189 | " 0.533 | \n",
1190 | " 0.208 | \n",
1191 | " -0.166 | \n",
1192 | "
\n",
1193 | " \n",
1194 | " 2 | \n",
1195 | " 0.835002 | \n",
1196 | " -1.455 | \n",
1197 | " -1.454 | \n",
1198 | " -0.717 | \n",
1199 | " -0.640 | \n",
1200 | " -0.319 | \n",
1201 | " -0.614 | \n",
1202 | " -0.166 | \n",
1203 | "
\n",
1204 | " \n",
1205 | " 3 | \n",
1206 | " 0.476161 | \n",
1207 | " -0.209 | \n",
1208 | " -0.121 | \n",
1209 | " -0.717 | \n",
1210 | " 0.398 | \n",
1211 | " -0.811 | \n",
1212 | " -0.327 | \n",
1213 | " -0.166 | \n",
1214 | "
\n",
1215 | " \n",
1216 | " 4 | \n",
1217 | " 0.117694 | \n",
1218 | " -0.209 | \n",
1219 | " -0.121 | \n",
1220 | " 0.467 | \n",
1221 | " 0.737 | \n",
1222 | " 0.533 | \n",
1223 | " 0.208 | \n",
1224 | " 0.210 | \n",
1225 | "
\n",
1226 | " \n",
1227 | " ... | \n",
1228 | " ... | \n",
1229 | " ... | \n",
1230 | " ... | \n",
1231 | " ... | \n",
1232 | " ... | \n",
1233 | " ... | \n",
1234 | " ... | \n",
1235 | " ... | \n",
1236 | "
\n",
1237 | " \n",
1238 | " 19153 | \n",
1239 | " 0.275602 | \n",
1240 | " 1.017 | \n",
1241 | " 0.461 | \n",
1242 | " -0.717 | \n",
1243 | " -0.640 | \n",
1244 | " 0.533 | \n",
1245 | " 0.208 | \n",
1246 | " -0.166 | \n",
1247 | "
\n",
1248 | " \n",
1249 | " 19154 | \n",
1250 | " 0.365512 | \n",
1251 | " 0.213 | \n",
1252 | " 0.461 | \n",
1253 | " -0.717 | \n",
1254 | " -0.640 | \n",
1255 | " 0.533 | \n",
1256 | " 0.208 | \n",
1257 | " -0.166 | \n",
1258 | "
\n",
1259 | " \n",
1260 | " 19155 | \n",
1261 | " 0.126794 | \n",
1262 | " 0.213 | \n",
1263 | " 0.461 | \n",
1264 | " 0.467 | \n",
1265 | " 0.398 | \n",
1266 | " 0.533 | \n",
1267 | " 0.208 | \n",
1268 | " -0.166 | \n",
1269 | "
\n",
1270 | " \n",
1271 | " 19156 | \n",
1272 | " 0.060842 | \n",
1273 | " 1.017 | \n",
1274 | " 0.461 | \n",
1275 | " 0.467 | \n",
1276 | " 0.398 | \n",
1277 | " -0.811 | \n",
1278 | " 0.208 | \n",
1279 | " 0.340 | \n",
1280 | "
\n",
1281 | " \n",
1282 | " 19157 | \n",
1283 | " 0.130552 | \n",
1284 | " 1.017 | \n",
1285 | " 0.461 | \n",
1286 | " -0.717 | \n",
1287 | " -0.640 | \n",
1288 | " -0.319 | \n",
1289 | " 0.208 | \n",
1290 | " 0.704 | \n",
1291 | "
\n",
1292 | " \n",
1293 | "
\n",
1294 | "
19158 rows × 8 columns
\n",
1295 | "
"
1296 | ],
1297 | "text/plain": [
1298 | " PROB city city_development_index company_size company_type \\\n",
1299 | "0 0.365512 0.213 0.461 -0.717 -0.640 \n",
1300 | "1 0.195716 -0.209 -0.121 0.467 0.398 \n",
1301 | "2 0.835002 -1.455 -1.454 -0.717 -0.640 \n",
1302 | "3 0.476161 -0.209 -0.121 -0.717 0.398 \n",
1303 | "4 0.117694 -0.209 -0.121 0.467 0.737 \n",
1304 | "... ... ... ... ... ... \n",
1305 | "19153 0.275602 1.017 0.461 -0.717 -0.640 \n",
1306 | "19154 0.365512 0.213 0.461 -0.717 -0.640 \n",
1307 | "19155 0.126794 0.213 0.461 0.467 0.398 \n",
1308 | "19156 0.060842 1.017 0.461 0.467 0.398 \n",
1309 | "19157 0.130552 1.017 0.461 -0.717 -0.640 \n",
1310 | "\n",
1311 | " experience enrolled_university education_level \n",
1312 | "0 0.533 0.208 -0.166 \n",
1313 | "1 0.533 0.208 -0.166 \n",
1314 | "2 -0.319 -0.614 -0.166 \n",
1315 | "3 -0.811 -0.327 -0.166 \n",
1316 | "4 0.533 0.208 0.210 \n",
1317 | "... ... ... ... \n",
1318 | "19153 0.533 0.208 -0.166 \n",
1319 | "19154 0.533 0.208 -0.166 \n",
1320 | "19155 0.533 0.208 -0.166 \n",
1321 | "19156 -0.811 0.208 0.340 \n",
1322 | "19157 -0.319 0.208 0.704 \n",
1323 | "\n",
1324 | "[19158 rows x 8 columns]"
1325 | ]
1326 | },
1327 | "execution_count": 26,
1328 | "metadata": {},
1329 | "output_type": "execute_result"
1330 | }
1331 | ],
1332 | "source": [
1333 | "res"
1334 | ]
1335 | },
1336 | {
1337 | "cell_type": "code",
1338 | "execution_count": 27,
1339 | "metadata": {},
1340 | "outputs": [],
1341 | "source": [
1342 | "sc.stop()"
1343 | ]
1344 | },
1345 | {
1346 | "cell_type": "code",
1347 | "execution_count": 28,
1348 | "metadata": {},
1349 | "outputs": [
1350 | {
1351 | "data": {
1352 | "text/plain": [
1353 | "array([0.36557352, 0.19577798, 0.83497665, ..., 0.12678668, 0.06083813,\n",
1354 | " 0.13061427])"
1355 | ]
1356 | },
1357 | "execution_count": 28,
1358 | "metadata": {},
1359 | "output_type": "execute_result"
1360 | }
1361 | ],
1362 | "source": [
1363 | "full_prediction = model.predict_proba(data)\n",
1364 | "full_prediction"
1365 | ]
1366 | },
1367 | {
1368 | "cell_type": "code",
1369 | "execution_count": 29,
1370 | "metadata": {},
1371 | "outputs": [
1372 | {
1373 | "data": {
1374 | "text/plain": [
1375 | "0.0002878641803194526"
1376 | ]
1377 | },
1378 | "execution_count": 29,
1379 | "metadata": {},
1380 | "output_type": "execute_result"
1381 | }
1382 | ],
1383 | "source": [
1384 | "(res['PROB'] - full_prediction).abs().max()"
1385 | ]
1386 | }
1387 | ],
1388 | "metadata": {
1389 | "kernelspec": {
1390 | "display_name": "Python 3",
1391 | "language": "python",
1392 | "name": "python3"
1393 | },
1394 | "language_info": {
1395 | "codemirror_mode": {
1396 | "name": "ipython",
1397 | "version": 3
1398 | },
1399 | "file_extension": ".py",
1400 | "mimetype": "text/x-python",
1401 | "name": "python",
1402 | "nbconvert_exporter": "python",
1403 | "pygments_lexer": "ipython3",
1404 | "version": "3.6.9"
1405 | }
1406 | },
1407 | "nbformat": 4,
1408 | "nbformat_minor": 2
1409 | }
1410 |
--------------------------------------------------------------------------------
/NLP_REPORT/BankName_char_len_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/BankName_char_len_hist.png
--------------------------------------------------------------------------------
/NLP_REPORT/BankName_tokens_len_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/BankName_tokens_len_hist.png
--------------------------------------------------------------------------------
/NLP_REPORT/Message_char_len_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/Message_char_len_hist.png
--------------------------------------------------------------------------------
/NLP_REPORT/Message_tokens_len_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/Message_tokens_len_hist.png
--------------------------------------------------------------------------------
/NLP_REPORT/concat_char_len_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/concat_char_len_hist.png
--------------------------------------------------------------------------------
/NLP_REPORT/concat_tokens_len_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/concat_tokens_len_hist.png
--------------------------------------------------------------------------------
/NLP_REPORT/report_nlp.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | LAMA report
8 |
9 |
61 |
65 |
66 |
67 |
68 |
LAMA report
69 |
70 |
710 |
721 |
722 |
723 |
--------------------------------------------------------------------------------
/NLP_REPORT/test_distribution_of_logits_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_distribution_of_logits_1.png
--------------------------------------------------------------------------------
/NLP_REPORT/test_pie_f1_metric_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_pie_f1_metric_1.png
--------------------------------------------------------------------------------
/NLP_REPORT/test_pr_curve_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_pr_curve_1.png
--------------------------------------------------------------------------------
/NLP_REPORT/test_preds_distribution_by_bins_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_preds_distribution_by_bins_1.png
--------------------------------------------------------------------------------
/NLP_REPORT/test_roc_curve_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/test_roc_curve_1.png
--------------------------------------------------------------------------------
/NLP_REPORT/valid_distribution_of_logits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_distribution_of_logits.png
--------------------------------------------------------------------------------
/NLP_REPORT/valid_pie_f1_metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_pie_f1_metric.png
--------------------------------------------------------------------------------
/NLP_REPORT/valid_pr_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_pr_curve.png
--------------------------------------------------------------------------------
/NLP_REPORT/valid_preds_distribution_by_bins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_preds_distribution_by_bins.png
--------------------------------------------------------------------------------
/NLP_REPORT/valid_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/NLP_REPORT/valid_roc_curve.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Materials for LightAutoML workshop (DataFest 2021)
2 |
3 | - `LightAutoML demo (Blackbox).ipynb` - How-to for LightAutoML blackbox presets (with automatic report generation)
4 | - `LightAutoML demo (Whitebox).ipynb` - How-to for AutoWoE library and LightAutoML whitebox preset (with automatic inference SQL and report generation)
5 | - `LightAutoML demo (NLP).ipynb` - How-to for LightAutoML NLP preset (with automatic report generation and model interpretation)
6 |
7 | The repo also contains all reports generated by the tutorials above:
8 | - `tabularAutoML_model_report` - report from the LightAutoML Blackbox tutorial
9 | - `HR_REPORT_1` and `HR_REPORT_2` - reports from the LightAutoML Whitebox tutorial
10 | - `NLP_REPORT` - report from the LightAutoML NLP tutorial
11 |
12 | *******
13 | # Questions / Issues / Suggestions
14 |
15 | Write a message to us:
16 | - [Alexander Ryzhkov](https://kaggle.com/alexryzhkov) (_email_: AMRyzhkov@sberbank.ru, _telegram_: @RyzhkovAlex)
17 | - [Anton Vakhrushev](https://kaggle.com/btbpanda) (_email_: AGVakhrushev@sberbank.ru)
18 | - [Dmitry Simakov](https://kaggle.com/simakov) (_email_: Simakov.D.E@sberbank.ru)
19 |
--------------------------------------------------------------------------------
/imgs/tutorial_NLP_image_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_NLP_image_1.jpg
--------------------------------------------------------------------------------
/imgs/tutorial_NLP_image_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_NLP_image_2.jpg
--------------------------------------------------------------------------------
/imgs/tutorial_blackbox_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_pipeline.png
--------------------------------------------------------------------------------
/imgs/tutorial_blackbox_report_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_report_1.png
--------------------------------------------------------------------------------
/imgs/tutorial_blackbox_report_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_report_2.png
--------------------------------------------------------------------------------
/imgs/tutorial_blackbox_report_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_blackbox_report_3.png
--------------------------------------------------------------------------------
/imgs/tutorial_whitebox_report_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_1.png
--------------------------------------------------------------------------------
/imgs/tutorial_whitebox_report_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_2.png
--------------------------------------------------------------------------------
/imgs/tutorial_whitebox_report_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_3.png
--------------------------------------------------------------------------------
/imgs/tutorial_whitebox_report_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/imgs/tutorial_whitebox_report_4.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/lama_interactive_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | LAMA report
8 |
9 |
61 |
65 |
66 |
67 |
68 |
LAMA report
69 |
70 |
790 |
801 |
802 |
803 |
--------------------------------------------------------------------------------
/tabularAutoML_model_report/test_distribution_of_logits_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_distribution_of_logits_1.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/test_pie_f1_metric_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_pie_f1_metric_1.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/test_pr_curve_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_pr_curve_1.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/test_preds_distribution_by_bins_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_preds_distribution_by_bins_1.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/test_roc_curve_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/test_roc_curve_1.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/valid_distribution_of_logits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_distribution_of_logits.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/valid_pie_f1_metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_pie_f1_metric.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/valid_pr_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_pr_curve.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/valid_preds_distribution_by_bins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_preds_distribution_by_bins.png
--------------------------------------------------------------------------------
/tabularAutoML_model_report/valid_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sberbank-ai-lab/lightautoml-datafest-workshop/88e5a53824ff6466c3ac6514708775beefc598ca/tabularAutoML_model_report/valid_roc_curve.png
--------------------------------------------------------------------------------