├── .gitignore ├── LICENSE ├── README.md ├── api_anomaly_project ├── .gitignore ├── Dockerfile ├── app │ ├── app.py │ ├── measure_response.py │ └── ping.py ├── assets │ ├── header.png │ ├── roc_pr_curves.png │ ├── shap.png │ └── thresholds.png ├── data │ ├── supervised_clean_data.parquet │ └── supervised_clean_data_w_features.parquet ├── models │ └── hgbt_final.joblib ├── notebooks │ ├── cleaning.ipynb │ ├── eda.ipynb │ ├── feature_engineering.ipynb │ ├── htmls │ │ ├── eda.html │ │ ├── feature_engineering.html │ │ └── modelling.html │ └── modelling.ipynb ├── readme.md ├── requirements.txt └── utils │ ├── __init__.py │ ├── cleaning.py │ ├── feature_engineering.py │ ├── ml.py │ └── visualisations.py ├── deployment ├── fastapi │ ├── Dockerfile │ ├── app.py │ ├── app_test.py │ ├── loan_catboost_model.cbm │ ├── measure_response.py │ └── requirements.txt └── flask │ ├── Dockerfile │ ├── app.py │ ├── app_test.py │ ├── loan_catboost_model.cbm │ ├── measure_response.py │ └── requirements.txt ├── hp_tuning ├── bv_tradeoff.png └── hp_tuning_rf_gbt.ipynb ├── metaflow └── fraud_email │ ├── email_eda.ipynb │ ├── fradulent_emails.txt │ ├── readme.md │ ├── requirements.txt │ └── utils │ ├── __init__.py │ ├── feature_generation.py │ ├── plots.py │ ├── preprocess.py │ └── read_data.py ├── mlflow ├── mlflow_experiment_tracking.ipynb └── old_notebook.ipynb ├── mlflow_models ├── .DS_Store ├── MLProject ├── model_search.ipynb ├── python_env.yaml ├── search_params.py ├── train_hgbt.py ├── train_rf.py └── utils │ ├── __init__.py │ ├── columns.py │ ├── data_utils.py │ └── eval_utils.py ├── mlflow_project ├── MLproject ├── conda_env.yaml ├── main.py └── steps │ ├── __init__.py │ ├── download_data.py │ ├── preprocess_data.py │ ├── train_final_model.py │ └── tune_model.py ├── polars ├── basics.ipynb ├── data_preparation_pipeline.py ├── data_utils │ ├── __init__.py │ ├── feature_engineering.py │ ├── processing.py │ └── transfomation.py ├── model.ipynb ├── pipe_config.yaml └── time_analysis.ipynb ├── pyspark ├── cleaning.py ├── conda_env.yaml ├── config.yaml ├── feature_engineering.py ├── gcs_config.yaml ├── ml_prep.py ├── pipe.py ├── spark_feature_engineering.ipynb ├── spark_hp_tuning.ipynb ├── spark_intro.ipynb └── tuning.py └── tfdf └── notebooks ├── data_preprocessing.ipynb ├── model_training.ipynb └── plot.html /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # Files 134 | .csv 135 | .json 136 | .pq 137 | .parquet -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tutorials 2 | Notebooks/scripts for youtube tutorials 3 | -------------------------------------------------------------------------------- /api_anomaly_project/.gitignore: -------------------------------------------------------------------------------- 1 | ## The .gitignore file specifies things that git should ignore. 2 | ## This default template includes entries for R, Python and visual studio 3 | 4 | ## 5 | ## Add custom entries below here. 6 | ## 7 | dst-env/ 8 | .cache/v/cache/lastfailed 9 | tests/.cache/v/cache/lastfailed 10 | .vscode/settings.json 11 | .DS_Store 12 | *.db 13 | mlruns 14 | 15 | # datasets 16 | *.csv 17 | *.json 18 | 19 | ## Python Section - See https://github.com/github/gitignore/blob/master/Python.gitignore 20 | ## 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | env/ 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # dotenv 104 | .env 105 | 106 | # virtualenv 107 | .venv 108 | venv/ 109 | ENV/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | 124 | ## Ignore Visual Studio temporary files, build results, and 125 | ## files generated by popular Visual Studio add-ons. 
126 | ## 127 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 128 | 129 | # User-specific files 130 | *.suo 131 | *.user 132 | *.userosscache 133 | *.sln.docstates 134 | 135 | # User-specific files (MonoDevelop/Xamarin Studio) 136 | *.userprefs 137 | 138 | # Build results 139 | [Dd]ebug/ 140 | [Dd]ebugPublic/ 141 | [Rr]elease/ 142 | [Rr]eleases/ 143 | x64/ 144 | x86/ 145 | bld/ 146 | [Bb]in/ 147 | [Oo]bj/ 148 | [Ll]og/ 149 | 150 | # Visual Studio 2015 cache/options directory 151 | .vs/ 152 | # Uncomment if you have tasks that create the project's static files in wwwroot 153 | #wwwroot/ 154 | 155 | # MSTest test Results 156 | [Tt]est[Rr]esult*/ 157 | [Bb]uild[Ll]og.* 158 | 159 | # NUNIT 160 | *.VisualState.xml 161 | TestResult.xml 162 | 163 | # Build Results of an ATL Project 164 | [Dd]ebugPS/ 165 | [Rr]eleasePS/ 166 | dlldata.c 167 | 168 | # Benchmark Results 169 | BenchmarkDotNet.Artifacts/ 170 | 171 | # .NET Core 172 | project.lock.json 173 | project.fragment.lock.json 174 | artifacts/ 175 | **/Properties/launchSettings.json 176 | 177 | *_i.c 178 | *_p.c 179 | *_i.h 180 | *.ilk 181 | *.meta 182 | *.obj 183 | *.pch 184 | *.pdb 185 | *.pgc 186 | *.pgd 187 | *.rsp 188 | *.sbr 189 | *.tlb 190 | *.tli 191 | *.tlh 192 | *.tmp 193 | *.tmp_proj 194 | *.log 195 | *.vspscc 196 | *.vssscc 197 | .builds 198 | *.pidb 199 | *.svclog 200 | *.scc 201 | 202 | # Chutzpah Test files 203 | _Chutzpah* 204 | 205 | # Visual C++ cache files 206 | ipch/ 207 | *.aps 208 | *.ncb 209 | *.opendb 210 | *.opensdf 211 | *.sdf 212 | *.cachefile 213 | *.VC.db 214 | *.VC.VC.opendb 215 | 216 | # Visual Studio profiler 217 | *.psess 218 | *.vsp 219 | *.vspx 220 | *.sap 221 | 222 | # Visual Studio Trace Files 223 | *.e2e 224 | 225 | # TFS 2012 Local Workspace 226 | $tf/ 227 | 228 | # Guidance Automation Toolkit 229 | *.gpState 230 | 231 | # ReSharper is a .NET coding add-in 232 | _ReSharper*/ 233 | *.[Rr]e[Ss]harper 234 | *.DotSettings.user 235 | 236 | # JustCode is a .NET coding add-in 237 | .JustCode 238 | 239 | # TeamCity is a build add-in 240 | _TeamCity* 241 | 242 | # DotCover is a Code Coverage Tool 243 | *.dotCover 244 | 245 | # AxoCover is a Code Coverage Tool 246 | .axoCover/* 247 | !.axoCover/settings.json 248 | 249 | # Visual Studio code coverage results 250 | *.coverage 251 | *.coveragexml 252 | 253 | # NCrunch 254 | _NCrunch_* 255 | .*crunch*.local.xml 256 | nCrunchTemp_* 257 | 258 | # MightyMoose 259 | *.mm.* 260 | AutoTest.Net/ 261 | 262 | # Web workbench (sass) 263 | .sass-cache/ 264 | 265 | # Installshield output folder 266 | [Ee]xpress/ 267 | 268 | # DocProject is a documentation generator add-in 269 | DocProject/buildhelp/ 270 | DocProject/Help/*.HxT 271 | DocProject/Help/*.HxC 272 | DocProject/Help/*.hhc 273 | DocProject/Help/*.hhk 274 | DocProject/Help/*.hhp 275 | DocProject/Help/Html2 276 | DocProject/Help/html 277 | 278 | # Click-Once directory 279 | publish/ 280 | 281 | # Publish Web Output 282 | *.[Pp]ublish.xml 283 | *.azurePubxml 284 | # Note: Comment the next line if you want to checkin your web deploy settings, 285 | # but database connection strings (with potential passwords) will be unencrypted 286 | *.pubxml 287 | *.publishproj 288 | 289 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 290 | # checkin your Azure Web App publish settings, but sensitive information contained 291 | # in these scripts will be unencrypted 292 | PublishScripts/ 293 | 294 | # NuGet Packages 295 | *.nupkg 296 | # The packages folder can be ignored because of Package Restore 297 | **/[Pp]ackages/* 298 | # except build/, which is used as an MSBuild target. 299 | !**/[Pp]ackages/build/ 300 | # Uncomment if necessary however generally it will be regenerated when needed 301 | #!**/[Pp]ackages/repositories.config 302 | # NuGet v3's project.json files produces more ignorable files 303 | *.nuget.props 304 | *.nuget.targets 305 | 306 | # Microsoft Azure Build Output 307 | csx/ 308 | *.build.csdef 309 | 310 | # Microsoft Azure Emulator 311 | ecf/ 312 | rcf/ 313 | 314 | # Windows Store app package directories and files 315 | AppPackages/ 316 | BundleArtifacts/ 317 | Package.StoreAssociation.xml 318 | _pkginfo.txt 319 | *.appx 320 | 321 | # Visual Studio cache files 322 | # files ending in .cache can be ignored 323 | *.[Cc]ache 324 | # but keep track of directories ending in .cache 325 | !*.[Cc]ache/ 326 | 327 | # Others 328 | ClientBin/ 329 | ~$* 330 | *~ 331 | *.dbmdl 332 | *.dbproj.schemaview 333 | *.jfm 334 | *.pfx 335 | *.publishsettings 336 | orleans.codegen.cs 337 | 338 | # Since there are multiple workflows, uncomment next line to ignore bower_components 339 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 340 | #bower_components/ 341 | 342 | # RIA/Silverlight projects 343 | Generated_Code/ 344 | 345 | # Backup & report files from converting an old project file 346 | # to a newer Visual Studio version. Backup files are not needed, 347 | # because we have git ;-) 348 | _UpgradeReport_Files/ 349 | Backup*/ 350 | UpgradeLog*.XML 351 | UpgradeLog*.htm 352 | 353 | # SQL Server files 354 | *.mdf 355 | *.ldf 356 | *.ndf 357 | 358 | # Business Intelligence projects 359 | *.rdl.data 360 | *.bim.layout 361 | *.bim_*.settings 362 | 363 | # Microsoft Fakes 364 | FakesAssemblies/ 365 | 366 | # GhostDoc plugin setting file 367 | *.GhostDoc.xml 368 | 369 | # Node.js Tools for Visual Studio 370 | .ntvs_analysis.dat 371 | node_modules/ 372 | 373 | # Typescript v1 declaration files 374 | typings/ 375 | 376 | # Visual Studio 6 build log 377 | *.plg 378 | 379 | # Visual Studio 6 workspace options file 380 | *.opt 381 | 382 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
383 | *.vbw 384 | 385 | # Visual Studio LightSwitch build output 386 | **/*.HTMLClient/GeneratedArtifacts 387 | **/*.DesktopClient/GeneratedArtifacts 388 | **/*.DesktopClient/ModelManifest.xml 389 | **/*.Server/GeneratedArtifacts 390 | **/*.Server/ModelManifest.xml 391 | _Pvt_Extensions 392 | 393 | # Paket dependency manager 394 | .paket/paket.exe 395 | paket-files/ 396 | 397 | # FAKE - F# Make 398 | .fake/ 399 | 400 | # JetBrains Rider 401 | .idea/ 402 | *.sln.iml 403 | 404 | # CodeRush 405 | .cr/ 406 | 407 | # Python Tools for Visual Studio (PTVS) 408 | __pycache__/ 409 | *.pyc 410 | 411 | # Cake - Uncomment if you are using it 412 | # tools/** 413 | # !tools/packages.config 414 | 415 | # Tabs Studio 416 | *.tss 417 | 418 | # Telerik's JustMock configuration file 419 | *.jmconfig 420 | 421 | # BizTalk build output 422 | *.btp.cs 423 | *.btm.cs 424 | *.odx.cs 425 | *.xsd.cs 426 | 427 | # OpenCover UI analysis results 428 | OpenCover/ 429 | junit/ -------------------------------------------------------------------------------- /api_anomaly_project/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # Start from a base image 3 | FROM python:3.11-slim 4 | 5 | # Set the working directory 6 | WORKDIR /app 7 | 8 | # Copy the requirements file into the container 9 | COPY requirements.txt requirements.txt 10 | 11 | # Install the required packages 12 | RUN pip install --upgrade pip 13 | RUN pip install -r requirements.txt 14 | 15 | # Copy the application code into the container 16 | # YOUR FILES HERE 17 | 18 | # Expose the app port 19 | EXPOSE 80 20 | 21 | # Run command 22 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"] -------------------------------------------------------------------------------- /api_anomaly_project/app/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/app/app.py -------------------------------------------------------------------------------- /api_anomaly_project/app/measure_response.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/app/measure_response.py -------------------------------------------------------------------------------- /api_anomaly_project/app/ping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/app/ping.py -------------------------------------------------------------------------------- /api_anomaly_project/assets/header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/header.png -------------------------------------------------------------------------------- /api_anomaly_project/assets/roc_pr_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/roc_pr_curves.png -------------------------------------------------------------------------------- /api_anomaly_project/assets/shap.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/shap.png -------------------------------------------------------------------------------- /api_anomaly_project/assets/thresholds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/thresholds.png -------------------------------------------------------------------------------- /api_anomaly_project/data/supervised_clean_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/data/supervised_clean_data.parquet -------------------------------------------------------------------------------- /api_anomaly_project/data/supervised_clean_data_w_features.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/data/supervised_clean_data_w_features.parquet -------------------------------------------------------------------------------- /api_anomaly_project/models/hgbt_final.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/models/hgbt_final.joblib -------------------------------------------------------------------------------- /api_anomaly_project/readme.md: -------------------------------------------------------------------------------- 1 | # API Security: Anomaly Detection App 2 | 3 | > [!WARNING] 4 | > All the metrics, plots, and insights are made up and taken from the internet 5 | 6 | ![network header](assets/header.png) 7 | 8 | ## Dataset 9 | 10 | The dataset for this project can be found on [Kaggle](https://www.kaggle.com/datasets/tangodelta/api-access-behaviour-anomaly-dataset/data) (licensed under GPL-2). 11 | 12 | Distributed, microservices-based applications are typically accessed via APIs. The authors of this dataset have collected sequences of API calls from an application and put them into a graph format. For this graph, they've generated common API access patterns (i.e. sequences of API calls) and have calculated user access metrics that can be used to classify these behaviours. Also, they've manually labelled a set of these behaviour patterns (our training set) and have provided the remaining sequences for us to classify. 13 | 14 | ## Objectives 15 | 16 | The main objective of this project is: 17 | 18 | > **To develop a system that will be able to detect anomalous behaviour from the API calls for the remaining sequences** 19 | 20 | To achieve this objective, it was further broken down into the following 5 technical sub-objectives: 21 | 22 | 1. To perform in-depth exploratory data analysis of both datasets (tabular and graph) 23 | 2. To engineer new predictive features from the available graphs 24 | 3. To develop a supervised model to classify behaviour into normal and anomalous 25 | 4. To recommend a threshold that will perform better than the present baseline (ALGO-X) in terms of F1 score 26 | 5.
To create an API endpoint for the trained model and deploy it 27 | 28 | ## Main Insights 29 | 30 | From the exploratory data analysis, we found that anomalous behaviour patterns are characterised by: 31 | 32 | * Insight about anomaly vs normal #1 33 | * Insight about anomaly vs normal #2 34 | * Insight about anomaly vs normal #3 35 | 36 | ## Engineered Features 37 | 38 | From the provided networks, the following features were extracted: 39 | 40 | * Feature 1 - this feature helps us to measure *X* activity and is expected to be much higher in anomalies/normal behaviour 41 | * Feature 2 - this feature helps us to measure *X* activity and is expected to be much higher in anomalies/normal behaviour 42 | * Feature 3 - this feature helps us to measure *X* activity and is expected to be much higher in anomalies/normal behaviour 43 | 44 | As a result of this feature engineering work, the ROC AUC for the final model has increased by 30%, and the F1 score uplift over the baseline model has improved from 1.5 to 1.8. 45 | 46 | ## Model Selection 47 | 48 | Models were compared against each other using ROC AUC since we're dealing with a binary classification task and the label distribution is relatively balanced. 49 | Two models (XGBoost and LightGBM) were tuned for 50 iterations. The best performing model is LightGBM with the following parameters: 50 | 51 | ```json 52 | { 53 | "colsample_by_tree": 0.2, 54 | "num_trees": 2454, 55 | "learning_rate": 0.02, 56 | "subsample": 0.5 57 | } 58 | ``` 59 | 60 | ![ROC and PR curves](assets/roc_pr_curves.png) 61 | 62 | LightGBM has outperformed XGBoost by *X%* in terms of ROC AUC. From the PR curves, we can also see that it gives us a higher level of recall at the same precision for most of the thresholds, so this model is selected for deployment. 63 | 64 | ### Model Explainability 65 | 66 | ![Shap](assets/shap.png) 67 | 68 | The selected model has a well-balanced feature importance distribution, with the top 3 features being *X, Y, and ~*. The directions of SHAP values are intuitive, since we expect that anomalies have a larger rate of *X* and *Y* and a smaller number of *Z*. 69 | Notably, the engineered features are also considered to be important (4th, 5th and 7th place), which means that the feature engineering effort was successful. 70 | 71 | ## Business Metrics 72 | 73 | To determine the achieved business metrics, we first need to set the threshold for our classifier. 74 | 75 | ![ROC and PR curves](assets/thresholds.png) 76 | 77 | From the threshold analysis, we can see that the maximum F1 score we can achieve is *X* across a variety of thresholds. For the purpose of this project, we can assume that the business is more interested in obtaining higher recall than precision, so we'll set the threshold at *X*, which gives us the following metrics *(numbers are made up)*: 78 | 79 | | Threshold | 0.25 | 80 | |------------|------| 81 | | Precision | 0.7 | 82 | | Recall | 0.9 | 83 | | F1 Score | 0.79 | 84 | | Alert Rate | 0.02 | 85 | 86 | ## Prediction Service 87 | 88 | For this project, the assumption is that feature engineering will be handled by another service, so the deployment part is responsible purely for model inference. 89 | To create the API locally, you'll need to use Docker. 90 | 91 | ### Step 1: Build Docker Image 92 | 93 | Clone the repository and go to the folder with the Dockerfile. Then run the following command to build the image. 94 | 95 | ```shell 96 | docker build -t prediction-service:latest .
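# (Optional) Start a local container from the image you just built.
# The port mapping below is an assumption based on the EXPOSE 80 / uvicorn
# settings in the Dockerfile; change the host port if 80 is already in use.
docker run --rm -p 80:80 prediction-service:latest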
97 | ``` 98 | 99 | To check if the image was created successfully, run `docker images` in your CLI and you should see `prediction-service` listed. 100 | 101 | ### Step 2: Send the Request 102 | 103 | To test if the API is working, you can run the `ping.py` file in the `app` folder. You'll need Python installed on your computer. 104 | 105 | ```shell 106 | python app/ping.py 107 | ``` 108 | 109 | ### Step 3: Measuring Response Time 110 | 111 | The following response times were measured locally by sending 100 requests per second from 1 user: 112 | 113 | | Response Time | Measure | 114 | |-------------------------------|--------------| 115 | | Median Response Time | 0.1 seconds | 116 | | 99th Percentile Response Time | 0.9 seconds | 117 | | Max Response Time | 0.95 seconds | 118 | 119 | To run these tests on your machine, you'll need to run the `measure_response.py` script: 120 | 121 | ```shell 122 | python app/measure_response.py 123 | ``` 124 | 125 | ## Authors 126 | 127 | * [Antons Tocilins-Ruberts](https://github.com/aruberts) 128 | -------------------------------------------------------------------------------- /api_anomaly_project/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.3.1 2 | uvicorn==0.12.2 3 | fastapi==0.63.0 -------------------------------------------------------------------------------- /api_anomaly_project/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/utils/__init__.py -------------------------------------------------------------------------------- /api_anomaly_project/utils/cleaning.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | def count_missing(data: pl.DataFrame) -> pl.DataFrame: 5 | """Return a polars dataframe with missing counts per column 6 | 7 | Args: 8 | data (pl.DataFrame): input dataframe to be analysed 9 | 10 | Returns: 11 | pl.DataFrame: dataframe with missing counts 12 | """ 13 | missing = data.select( 14 | pl.col(c).is_null().sum().alias(f"{c}_missing") for c in data.columns 15 | ) 16 | 17 | return missing 18 | -------------------------------------------------------------------------------- /api_anomaly_project/utils/feature_engineering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import plotly.express as px 3 | import polars as pl 4 | import ppscore as pps 5 | 6 | 7 | def aggregate_node_features( 8 | data: pl.DataFrame, node_features: list[str], by: str = "_id" 9 | ) -> pl.DataFrame: 10 | """Utility function to generate basic aggregation statistics features for node level features 11 | 12 | Args: 13 | data (pl.DataFrame): input dataframe 14 | node_features (list[str]): list of node features to aggregate 15 | by (str, optional): the graph ID column. Defaults to "_id".
16 | 17 | Returns: 18 | pl.DataFrame: dataframe with aggregated features 19 | """ 20 | aggs = [] 21 | for f in node_features: 22 | avg = pl.col(f).mean().alias(f"avg_{f}") 23 | min_val = pl.col(f).min().alias(f"min_{f}") 24 | max_val = pl.col(f).max().alias(f"max_{f}") 25 | std = pl.col(f).std().alias(f"std_{f}") 26 | aggs += [avg, min_val, max_val, std] 27 | agg_data = data.group_by(by).agg(aggs) 28 | 29 | return agg_data 30 | 31 | 32 | def feature_predictive_power( 33 | data: pl.DataFrame, x: str, y: str, plot: bool = True 34 | ) -> np.float32: 35 | """Utility to calculate predictive power of a feature and plot its relationship with the target 36 | Args: 37 | data (pl.DataFrame): input dataframe 38 | x (str): name of the feature 39 | y (str): name of the target 40 | plot (bool, optional): indicator whether you want to plot the relationship. Defaults to True. 41 | 42 | Returns: 43 | np.float32: predictive power score 44 | """ 45 | data_pd = data.select([x, y]).to_pandas() 46 | score = np.float32(pps.score(data_pd, x, y)["ppscore"]).round(4) 47 | 48 | if plot: 49 | print(f"Predictive Power Score: {score}") 50 | fig = px.histogram( 51 | x=data_pd[x], 52 | color=data_pd[y], 53 | marginal="box", 54 | histnorm="probability", 55 | title=f"{x} distribution by {y}", 56 | ) 57 | fig.show() 58 | 59 | return score 60 | 61 | 62 | def get_graph_features(data: pl.DataFrame, node_features: bool = True) -> pl.DataFrame: 63 | """Pipeline function to generate graph features 64 | 65 | Args: 66 | data (pl.DataFrame): dataframe with edges 'from' and 'to' 67 | node_features (bool, optional): Indicator whether you want to create node level features. Defaults to True. 68 | 69 | Returns: 70 | pl.DataFrame: dataframe with engineered features 71 | """ 72 | graph_features = ( 73 | data.groupby("_id") 74 | .agg(pl.count().alias("n_connections"), pl.col("from"), pl.col("to")) 75 | .with_columns( 76 | pl.concat_list("from", "to") 77 | .list.unique() 78 | .list.lengths() 79 | .alias("n_unique_nodes") 80 | ) 81 | .select(["_id", "n_connections", "n_unique_nodes"]) 82 | ) 83 | 84 | if node_features: 85 | node_features_agg = aggregate_node_features( 86 | data, 87 | node_features=[ 88 | "global_source_degrees", 89 | "global_dest_degrees", 90 | "local_source_degrees", 91 | "local_dest_degrees", 92 | ], 93 | by="_id", 94 | ) 95 | 96 | graph_features = graph_features.join(node_features_agg, on="_id") 97 | 98 | return graph_features 99 | -------------------------------------------------------------------------------- /api_anomaly_project/utils/ml.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import numpy as np 3 | import numpy.typing as npt 4 | import pandas as pd 5 | import plotly.express as px 6 | from optuna import create_study 7 | from optuna.integration.mlflow import MLflowCallback 8 | from optuna.trial import FrozenTrial 9 | from sklearn.ensemble import HistGradientBoostingClassifier 10 | from sklearn.metrics import f1_score, precision_score, recall_score 11 | from sklearn.model_selection import cross_val_score 12 | 13 | 14 | def evaluate_thresholds( 15 | thresholds: npt.NDArray[np.float32], 16 | y_true: npt.NDArray[np.float32], 17 | y_pred_proba: npt.NDArray[np.float32], 18 | plot: bool = True, 19 | ) -> tuple[list[float], list[float], list[float]]: 20 | rcs = [] 21 | prs = [] 22 | f1s = [] 23 | 24 | for t in thresholds: 25 | test_binary_pred = y_pred_proba[:, 1] >= t 26 | prs.append(precision_score(y_true, test_binary_pred)) 27 |
rcs.append(recall_score(y_true, test_binary_pred)) 28 | f1s.append(f1_score(y_true, test_binary_pred)) 29 | 30 | metrics_df = pd.DataFrame({"threshold": thresholds, "score": f1s, "metric": "F1"}) 31 | metrics_df = pd.concat( 32 | ( 33 | metrics_df, 34 | pd.DataFrame({"threshold": thresholds, "score": rcs, "metric": "Recall"}), 35 | ) 36 | ) 37 | metrics_df = pd.concat( 38 | ( 39 | metrics_df, 40 | pd.DataFrame( 41 | {"threshold": thresholds, "score": prs, "metric": "Precision"} 42 | ), 43 | ) 44 | ) 45 | 46 | optimal_thr = thresholds[np.argmax(f1s)] 47 | optimal_f1 = f1s[np.argmax(f1s)] 48 | optimal_rc = rcs[np.argmax(f1s)] 49 | optimal_pr = prs[np.argmax(f1s)] 50 | 51 | print("Threshold with Max F1 Score: ", optimal_thr) 52 | print(f"F1 at threshold {optimal_thr}: {optimal_f1}") 53 | print(f"Recall at threshold {optimal_thr}: {optimal_rc}") 54 | print(f"Precision at threshold {optimal_thr}: {optimal_pr} ") 55 | 56 | if plot: 57 | fig = px.line( 58 | metrics_df, 59 | x="threshold", 60 | y="score", 61 | color="metric", 62 | title="Metrics per Threshold", 63 | ) 64 | fig.show() 65 | 66 | return rcs, prs, f1s 67 | 68 | 69 | def tune_hgbt( 70 | n_trials: int, mlflc: MLflowCallback, X_train: pd.DataFrame, y_train: pd.Series 71 | ) -> FrozenTrial: 72 | @mlflc.track_in_mlflow() 73 | def objective(trial): 74 | params = { 75 | "learning_rate": 0.1, 76 | "max_iter": trial.suggest_int("max_iter", 10, 100), 77 | "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 31), 78 | "max_depth": trial.suggest_int("max_depth", 2, 10), 79 | "l2_regularization": trial.suggest_float("l2_regularization", 0, 10), 80 | } 81 | mlflow.set_tag("model_name", "HGBT") 82 | mlflow.log_params(params) 83 | 84 | gbt = HistGradientBoostingClassifier(**params) 85 | 86 | roc_auc = cross_val_score(gbt, X_train, y_train, cv=5, scoring="roc_auc").mean() 87 | print("ROC AUC (avg 5-fold):", roc_auc) 88 | 89 | return roc_auc 90 | 91 | study = create_study(direction="maximize", study_name="hgbt_tuning") 92 | study.optimize(objective, n_trials=n_trials, callbacks=[mlflc]) 93 | return study.best_trial 94 | -------------------------------------------------------------------------------- /api_anomaly_project/utils/visualisations.py: -------------------------------------------------------------------------------- 1 | import plotly.express as px 2 | import plotly.graph_objects as go 3 | 4 | import polars as pl 5 | 6 | 7 | def bar_plot(data: pl.DataFrame, column: str, title: str) -> go.Figure: 8 | """Creates a plotly barplot from Polars column 9 | 10 | Args: 11 | data (pl.DataFrame): input dataframe 12 | column (str): column to plot 13 | title (str): title for the plot 14 | 15 | Returns: 16 | go.Figure: resulting barplot as plotly Figure 17 | """ 18 | counts = data[column].value_counts(sort=True) 19 | fig = px.bar( 20 | x=counts[column].to_list(), 21 | y=counts["counts"].to_list(), 22 | text_auto=True, 23 | title=title, 24 | color_discrete_sequence=px.colors.qualitative.Antique, 25 | labels={ 26 | "x": column, 27 | "y": "Counts", 28 | }, 29 | ) 30 | fig.update_traces( 31 | textfont_size=12, textangle=0, textposition="outside", cliponaxis=False 32 | ) 33 | 34 | return fig 35 | 36 | 37 | def proportion_plot( 38 | data: pl.DataFrame, column: str, target: str, title: str 39 | ) -> go.Figure: 40 | """Creates a plotly barplot with proportions 41 | 42 | Args: 43 | data (pl.DataFrame): input dataframe 44 | column (str): column to analyse 45 | target (str): a discrete target 46 | title (str): title for the plot 47 | 48 | Returns: 49 | go.Figure: 
resulting barplot as plotly Figure 50 | """ 51 | counts = data.groupby(column, target).agg(pl.count()) 52 | target_counts = counts.groupby(column).agg(pl.col("count").sum().alias("total")) 53 | proportions = counts.join(target_counts, on=column) 54 | proportions = proportions.with_columns( 55 | proportion=pl.col("count") / pl.col("total") 56 | ).sort((column, target)) 57 | fig = px.bar( 58 | x=proportions[column].to_list(), 59 | y=proportions["proportion"].to_list(), 60 | color=proportions[target].to_list(), 61 | color_discrete_sequence=px.colors.qualitative.Antique, 62 | labels={ 63 | "x": column, 64 | "y": f"{target} proportion", 65 | }, 66 | title=title, 67 | ) 68 | fig.update_traces( 69 | textfont_size=12, textangle=0, textposition="outside", cliponaxis=False 70 | ) 71 | 72 | return fig 73 | 74 | 75 | def boxplot_by_bin_with_target( 76 | data: pl.DataFrame, 77 | column_to_bin: str, 78 | numeric_column: str, 79 | target: str, 80 | number_bins: int = 10, 81 | ) -> go.Figure: 82 | """Creates a plotly boxplot 83 | 84 | Args: 85 | data (pl.DataFrame): input dataframe 86 | column_to_bin (str): numeric column to bin 87 | numeric_column (str): numeric column to create a box plot from 88 | target (str): target column to colour a boxplot 89 | number_bins (int, optional): number of quantile bins to create. Defaults to 10. 90 | 91 | Returns: 92 | go.Figure: resulting boxplot as plotly Figure 93 | """ 94 | 95 | temp = data.select( 96 | pl.col(column_to_bin) 97 | .qcut(number_bins, allow_duplicates=True) 98 | .alias(f"{column_to_bin}_binned"), 99 | pl.col(column_to_bin), 100 | pl.col(numeric_column), 101 | pl.col(target), 102 | ) 103 | 104 | order = ( 105 | temp.groupby(f"{column_to_bin}_binned") 106 | .agg(pl.col(column_to_bin).min().alias("min")) 107 | .sort("min")[f"{column_to_bin}_binned"] 108 | .to_list() 109 | ) 110 | 111 | fig = px.box( 112 | x=temp[f"{column_to_bin}_binned"].to_list(), 113 | y=temp[numeric_column].to_list(), 114 | color=temp[target].to_list(), 115 | color_discrete_sequence=px.colors.qualitative.Antique, 116 | log_y=True, 117 | category_orders={"x": order}, 118 | labels={ 119 | "x": "", 120 | "y": numeric_column, 121 | }, 122 | ) 123 | 124 | return fig 125 | -------------------------------------------------------------------------------- /deployment/fastapi/Dockerfile: -------------------------------------------------------------------------------- 1 | # Start from a base image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy the requirements file into the container 8 | COPY requirements.txt requirements.txt 9 | 10 | # Install the required packages 11 | RUN pip install --upgrade pip 12 | RUN pip install -r requirements.txt 13 | 14 | # Copy the application code into the container 15 | COPY ["loan_catboost_model.cbm", "app.py", "./"]
16 | 17 | # Expose the app port 18 | EXPOSE 80 19 | 20 | # Run command 21 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"] -------------------------------------------------------------------------------- /deployment/fastapi/app.py: -------------------------------------------------------------------------------- 1 | import catboost as cb 2 | import pandas as pd 3 | from pydantic import BaseModel 4 | 5 | from fastapi import FastAPI 6 | 7 | 8 | # Pydantic classes for input and output 9 | class LoanApplication(BaseModel): 10 | Term: int 11 | NoEmp: int 12 | CreateJob: int 13 | RetainedJob: int 14 | longitude: float 15 | latitude: float 16 | GrAppv: float 17 | SBA_Appv: float 18 | is_new: str 19 | FranchiseCode: str 20 | UrbanRural: int 21 | City: str 22 | State: str 23 | Bank: str 24 | BankState: str 25 | RevLineCr: str 26 | naics_first_two: str 27 | same_state: str 28 | 29 | 30 | class PredictionOut(BaseModel): 31 | default_proba: float 32 | 33 | 34 | # Load the model 35 | model = cb.CatBoostClassifier() 36 | model.load_model("loan_catboost_model.cbm") 37 | 38 | # Start the app 39 | app = FastAPI() 40 | 41 | # Home page 42 | @app.get("/") 43 | def home(): 44 | return {"message": "Loan Default Prediction App", "model_version": 0.1} 45 | 46 | 47 | # Inference endpoint 48 | @app.post("/predict", response_model=PredictionOut) 49 | def predict(payload: LoanApplication): 50 | cust_df = pd.DataFrame([payload.dict()]) 51 | preds = model.predict_proba(cust_df)[0, 1] 52 | result = {"default_proba": preds} 53 | return result 54 | -------------------------------------------------------------------------------- /deployment/fastapi/app_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | from locust import HttpUser, task, constant_throughput 3 | 4 | test_applications = [ 5 | { 6 | "Term": 84, 7 | "NoEmp": 5, 8 | "CreateJob": 0, 9 | "RetainedJob": 5, 10 | "longitude": -77.9221, 11 | "latitude": 35.3664, 12 | "GrAppv": 1500000.0, 13 | "SBA_Appv": 1275000.0, 14 | "is_new": True, 15 | "FranchiseCode": "0", 16 | "UrbanRural": 1, 17 | "City": "Other", 18 | "State": "NC", 19 | "Bank": "BBCN BANK", 20 | "BankState": "CA", 21 | "RevLineCr": "N", 22 | "naics_first_two": "45", 23 | "same_state": False, 24 | }, 25 | { 26 | "Term": 19, 27 | "NoEmp": 10, 28 | "CreateJob": 0, 29 | "RetainedJob": 10, 30 | "longitude": -85.0117, 31 | "latitude": 41.0699, 32 | "GrAppv": 3500000.0, 33 | "SBA_Appv": 1750000.0, 34 | "is_new": False, 35 | "FranchiseCode": "1", 36 | "UrbanRural": 2, 37 | "City": "Other", 38 | "State": "IN", 39 | "Bank": "WELLS FARGO BANK NATL ASSOC", 40 | "BankState": "SD", 41 | "RevLineCr": "Y", 42 | "naics_first_two": "81", 43 | "same_state": False, 44 | }, 45 | ] 46 | 47 | 48 | class BankLoan(HttpUser): 49 | wait_time = constant_throughput(1) 50 | 51 | @task 52 | def predict(self): 53 | self.client.post( 54 | "/predict", 55 | json=random.choice(test_applications), 56 | timeout=1, 57 | ) 58 | -------------------------------------------------------------------------------- /deployment/fastapi/loan_catboost_model.cbm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/deployment/fastapi/loan_catboost_model.cbm -------------------------------------------------------------------------------- /deployment/fastapi/measure_response.py: -------------------------------------------------------------------------------- 1 | 
import requests 2 | import time 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | if __name__ == "__main__": 7 | # Example loan application 8 | application = { 9 | "Term": 84, 10 | "NoEmp": 5, 11 | "CreateJob": 0, 12 | "RetainedJob": 5, 13 | "longitude": -77.9221, 14 | "latitude": 35.3664, 15 | "GrAppv": 1500000.0, 16 | "SBA_Appv": 1275000.0, 17 | "is_new": "True", 18 | "FranchiseCode": "0", 19 | "UrbanRural": 1, 20 | "City": "Other", 21 | "State": "NC", 22 | "Bank": "BBCN BANK", 23 | "BankState": "CA", 24 | "RevLineCr": "N", 25 | "naics_first_two": "45", 26 | "same_state": "False" 27 | } 28 | 29 | # Location of my server 30 | url = "https://default-service-ni4eqbkvca-nw.a.run.app/predict" 31 | 32 | # Measure the response time 33 | all_times = [] 34 | # For 100 times 35 | for i in tqdm(range(100)): 36 | t0 = time.time_ns() // 1_000_000 37 | # Send a request 38 | resp = requests.post(url, json=application) 39 | t1 = time.time_ns() // 1_000_000 40 | # Measure how much time it took to get a response in ms 41 | time_taken = t1 - t0 42 | all_times.append(time_taken) 43 | 44 | # Print out the results 45 | print("Response time in ms:") 46 | print("Median:", np.quantile(all_times, 0.5)) 47 | print("95th percentile:", np.quantile(all_times, 0.95)) 48 | print("Max:", np.max(all_times)) 49 | -------------------------------------------------------------------------------- /deployment/fastapi/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.92.0 2 | pydantic==1.10.5 3 | uvicorn==0.20.0 4 | catboost==1.1.1 5 | numpy==1.21.5 6 | pandas==1.5.2 7 | gunicorn==20.1.0 -------------------------------------------------------------------------------- /deployment/flask/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image is Python 3.9 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy the requirements file into the container 8 | COPY requirements.txt requirements.txt 9 | 10 | # Install the required packages 11 | RUN pip install --upgrade pip 12 | RUN pip install -r requirements.txt 13 | 14 | # Copy the model and application code into the container 15 | COPY ["loan_catboost_model.cbm", "app.py", "./"]
16 | 17 | # Run the app using gunicorn 18 | ENTRYPOINT [ "gunicorn", "--bind=0.0.0.0:8989", "app:app" ] 19 | -------------------------------------------------------------------------------- /deployment/flask/app.py: -------------------------------------------------------------------------------- 1 | import catboost as cb 2 | import pandas as pd 3 | 4 | from flask import Flask, jsonify, request 5 | 6 | # Load the model 7 | model = cb.CatBoostClassifier() 8 | model.load_model("loan_catboost_model.cbm") 9 | 10 | # Init the app 11 | app = Flask("default") 12 | 13 | 14 | # Setup prediction endpoint 15 | @app.route("/predict", methods=["POST"]) 16 | def predict(): 17 | # Get the provided JSON 18 | X = request.get_json() 19 | # Perform a prediction 20 | preds = model.predict_proba(pd.DataFrame(X, index=[0]))[0, 1] 21 | # Output json with prediction 22 | result = {"default_proba": preds} 23 | return jsonify(result) 24 | 25 | 26 | if __name__ == "__main__": 27 | # Run the app on local host and port 8989 28 | app.run(debug=True, host="0.0.0.0", port=8989) 29 | -------------------------------------------------------------------------------- /deployment/flask/app_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | from locust import HttpUser, task, constant_throughput 3 | 4 | test_applications = [ 5 | { 6 | "Term": 84, 7 | "NoEmp": 5, 8 | "CreateJob": 0, 9 | "RetainedJob": 5, 10 | "longitude": -77.9221, 11 | "latitude": 35.3664, 12 | "GrAppv": 1500000.0, 13 | "SBA_Appv": 1275000.0, 14 | "is_new": True, 15 | "FranchiseCode": "0", 16 | "UrbanRural": 1, 17 | "City": "Other", 18 | "State": "NC", 19 | "Bank": "BBCN BANK", 20 | "BankState": "CA", 21 | "RevLineCr": "N", 22 | "naics_first_two": "45", 23 | "same_state": False, 24 | }, 25 | { 26 | "Term": 19, 27 | "NoEmp": 10, 28 | "CreateJob": 0, 29 | "RetainedJob": 10, 30 | "longitude": -85.0117, 31 | "latitude": 41.0699, 32 | "GrAppv": 3500000.0, 33 | "SBA_Appv": 1750000.0, 34 | "is_new": False, 35 | "FranchiseCode": "1", 36 | "UrbanRural": 2, 37 | "City": "Other", 38 | "State": "IN", 39 | "Bank": "WELLS FARGO BANK NATL ASSOC", 40 | "BankState": "SD", 41 | "RevLineCr": "Y", 42 | "naics_first_two": "81", 43 | "same_state": False, 44 | }, 45 | ] 46 | 47 | 48 | class BankLoan(HttpUser): 49 | wait_time = constant_throughput(1) 50 | 51 | @task 52 | def predict(self): 53 | self.client.post( 54 | "/predict", 55 | json=random.choice(test_applications), 56 | timeout=1, 57 | ) 58 | -------------------------------------------------------------------------------- /deployment/flask/loan_catboost_model.cbm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/deployment/flask/loan_catboost_model.cbm -------------------------------------------------------------------------------- /deployment/flask/measure_response.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | if __name__ == "__main__": 7 | # Example loan application 8 | application = { 9 | "Term": 84, 10 | "NoEmp": 5, 11 | "CreateJob": 0, 12 | "RetainedJob": 5, 13 | "longitude": -77.9221, 14 | "latitude": 35.3664, 15 | "GrAppv": 1500000.0, 16 | "SBA_Appv": 1275000.0, 17 | "is_new": True, 18 | "FranchiseCode": "0", 19 | "UrbanRural": 1, 20 | "City": "Other", 21 | "State": "NC", 22 | "Bank": "BBCN BANK", 23 | 
"BankState": "CA", 24 | "RevLineCr": "N", 25 | "naics_first_two": "45", 26 | "same_state": False, 27 | } 28 | 29 | # Location of my server 30 | url = "http://0.0.0.0:8989/predict" 31 | 32 | # Measure the response time 33 | all_times = [] 34 | # For 1000 times 35 | for i in tqdm(range(1000)): 36 | t0 = time.time_ns() // 1_000_000 37 | # Send a request 38 | resp = requests.post(url, json=application) 39 | t1 = time.time_ns() // 1_000_000 40 | # Measure how much time it took to get a response in ms 41 | time_taken = t1 - t0 42 | all_times.append(time_taken) 43 | 44 | # Print out the results 45 | print("Response time in ms:") 46 | print("Median:", np.quantile(all_times, 0.5)) 47 | print("95th precentile:", np.quantile(all_times, 0.95)) 48 | print("Max:", np.max(all_times)) 49 | -------------------------------------------------------------------------------- /deployment/flask/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==2.2.3 2 | catboost==1.1.1 3 | numpy==1.21.5 4 | pandas==1.5.2 5 | gunicorn==20.1.0 -------------------------------------------------------------------------------- /hp_tuning/bv_tradeoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/hp_tuning/bv_tradeoff.png -------------------------------------------------------------------------------- /metaflow/fraud_email/fradulent_emails.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/metaflow/fraud_email/fradulent_emails.txt -------------------------------------------------------------------------------- /metaflow/fraud_email/readme.md: -------------------------------------------------------------------------------- 1 | # Text Processing Pipeline using Polars 2 | 3 | This repository contains the code for my medium post **Fast String Processing with Polars - Scam Emails Dataset**. 4 | 5 | The project implements a text processing pipeline using the Polars library for efficient data manipulation and analysis. The pipeline is designed to handle text data, perform various preprocessing tasks, and extract useful features from the text. 6 | 7 | ## Dataset 8 | 9 | The dataset used in this project is the CLAIR collection of fraud emails by Radev, D. (2008). The dataset can be accessed from the ACL Data and Code Repository under the identifier ADCR2008T001. More information about the dataset can be found at [ACL Data and Code Repository](http://aclweb.org/aclwiki). 10 | 11 | ## Dependencies 12 | 13 | The following dependencies are required to run the text processing pipeline: 14 | 15 | ``` 16 | numpy==1.23.5 17 | pandas==1.5.3 18 | polars==0.17.14 19 | nltk==3.8.1 20 | scikit-learn==1.2.2 21 | matplotlib==3.7.1 22 | wordcloud==1.9.2 23 | ``` 24 | 25 | ## Run in a Notebook 26 | 27 | 1. Install the required dependencies using `pip` 28 | `pip install -r requirements.txt` 29 | 30 | 2. Navigate to `email_eda.ipynb` and run the code to load, pre-process, clean, and tokenise the emails. Additionally, it will cluster the texts and generate the wordcloud for each cluster. 
31 | -------------------------------------------------------------------------------- /metaflow/fraud_email/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.5 2 | pandas==1.5.3 3 | polars==0.17.14 4 | nltk==3.8.1 5 | scikit-learn==1.2.2 6 | matplotlib==3.7.1 7 | wordcloud==1.9.2 -------------------------------------------------------------------------------- /metaflow/fraud_email/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/metaflow/fraud_email/utils/__init__.py -------------------------------------------------------------------------------- /metaflow/fraud_email/utils/feature_generation.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | def extract_fields(emails: pl.DataFrame) -> pl.DataFrame: 5 | """ 6 | Extracts specific fields from a DataFrame containing email data and 7 | returns a modified DataFrame. 8 | 9 | Args: 10 | emails (pl.DataFrame): A DataFrame containing email data. 11 | 12 | Returns: 13 | pl.DataFrame: A modified DataFrame with extracted fields. 14 | """ 15 | email_pattern = r"From:\s*([^<\n\s]+)" 16 | subject_pattern = r"Subject:\s*(.*)" 17 | name_email_pattern = r'From:\s*"?([^"<]+)"?\s*<([^>]+)>' 18 | 19 | emails = ( 20 | emails.with_columns( 21 | pl.col("emails").str.extract(name_email_pattern, 2).alias("sender_email"), 22 | pl.col("emails").str.extract(name_email_pattern, 1).alias("sender_name"), 23 | pl.col("emails").str.extract(subject_pattern, 1).alias("subject"), 24 | ) 25 | .with_columns( 26 | pl.when(pl.col("sender_email").is_null()) 27 | .then(pl.col("emails").str.extract(email_pattern, 1)) 28 | .otherwise(pl.col("sender_email")) 29 | .alias("sender_email") 30 | ) 31 | .with_columns( 32 | pl.col("emails") 33 | .str.replace("Status: RO", "Status: O", literal=True) 34 | .str.split("Status: O") 35 | .arr.get(1) 36 | .alias("email_text") 37 | ) 38 | ) 39 | 40 | return emails 41 | 42 | 43 | def email_features(data: pl.DataFrame, col: str) -> pl.DataFrame: 44 | """ 45 | Computes additional features for a specified column in a DataFrame 46 | containing email data and returns a modified DataFrame. 47 | 48 | Args: 49 | data (pl.DataFrame): A DataFrame containing email data. 50 | col (str): The name of the column in the DataFrame to compute features for. 51 | 52 | Returns: 53 | pl.DataFrame: A modified DataFrame with additional computed features. 54 | """ 55 | data = data.with_columns( 56 | pl.col(col).str.n_chars().alias(f"{col}_length"), 57 | ).with_columns( 58 | (pl.col(col).str.count_match(r"[A-Z]") / pl.col(f"{col}_length")).alias( 59 | f"{col}_percent_capital" 60 | ), 61 | (pl.col(col).str.count_match(r"[^A-Za-z ]") / pl.col(f"{col}_length")).alias( 62 | f"{col}_percent_digits" 63 | ), 64 | ) 65 | 66 | return data 67 | -------------------------------------------------------------------------------- /metaflow/fraud_email/utils/plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from wordcloud import WordCloud 3 | 4 | 5 | def generate_word_cloud(text: str): 6 | """ 7 | Generate and display a word cloud image based on the provided text. 8 | 9 | Args: 10 | text (str): The input text to generate the word cloud from. 
11 | 12 | Returns: 13 | None 14 | """ 15 | # Generate a word cloud image 16 | wordcloud = WordCloud( 17 | max_words=100, background_color="white", width=1600, height=800 18 | ).generate(text) 19 | 20 | plt.figure(figsize=(20, 10), facecolor="k") 21 | plt.imshow(wordcloud) 22 | plt.axis("off") 23 | plt.tight_layout(pad=0) 24 | plt.show() 25 | -------------------------------------------------------------------------------- /metaflow/fraud_email/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | def email_clean( 5 | data: pl.DataFrame, col: str, new_col_name: str | None = None 6 | ) -> pl.DataFrame: 7 | """ 8 | Cleans and preprocesses the text in a specified column of a DataFrame containing 9 | email data, and returns a modified DataFrame. 10 | 11 | Args: 12 | data (pl.DataFrame): A DataFrame containing email data. 13 | col (str): The name of the column in the DataFrame to clean and preprocess. 14 | new_col_name (str | None, optional): The name for the new column with cleaned data. Defaults to None. 15 | 16 | Returns: 17 | pl.DataFrame: A modified DataFrame with the cleaned and preprocessed text. 18 | 19 | """ 20 | data = data.with_columns( 21 | pl.col(col) 22 | .str.replace_all(r"<.*?>", " ") 23 | .str.replace_all(r"[^a-zA-Z\s]+", " ") 24 | .str.replace_all(r"\s+", " ") 25 | .str.to_lowercase() 26 | .alias(new_col_name if new_col_name is not None else col) 27 | ) 28 | 29 | return data 30 | 31 | 32 | def tokenise_text(data: pl.DataFrame, col: str, split_token: str = " ") -> pl.DataFrame: 33 | """ 34 | Tokenizes the text in a specified column of a DataFrame containing email data and returns a modified DataFrame. 35 | 36 | Args: 37 | data (pl.DataFrame): A DataFrame containing email data. 38 | col (str): The name of the column in the DataFrame to tokenize. 39 | split_token (str, optional): The token used to split the text into tokens. Defaults to " ". 40 | 41 | Returns: 42 | pl.DataFrame: A modified DataFrame with tokenized text. 43 | """ 44 | data = data.with_columns( 45 | pl.col(col).str.split(split_token).alias(f"{col}_tokenised") 46 | ) 47 | 48 | return data 49 | 50 | 51 | def remove_stopwords( 52 | data: pl.DataFrame, stopwords: set | list, col: str 53 | ) -> pl.DataFrame: 54 | """Removes stopwords from the text in a specified column of a DataFrame containing email data and returns a modified DataFrame. 55 | 56 | Args: 57 | data (pl.DataFrame): A DataFrame containing email data. 58 | stopwords (set | list): A set or list of stopwords to be removed from the text. 59 | col (str): The name of the column in the DataFrame to remove stopwords from. 60 | 61 | Returns: 62 | pl.DataFrame: A modified DataFrame with stopwords removed from the text. 63 | """ 64 | data = data.with_columns( 65 | pl.col(col) 66 | .arr.eval( 67 | pl.when( 68 | (~pl.element().is_in(stopwords)) & (pl.element().str.n_chars() > 2) 69 | ).then(pl.element()) 70 | ) 71 | .arr.eval(pl.element().drop_nulls()) 72 | ) 73 | return data 74 | -------------------------------------------------------------------------------- /metaflow/fraud_email/utils/read_data.py: -------------------------------------------------------------------------------- 1 | def load_emails_txt(path: str, split_str: str = "From r ") -> list[str]: 2 | """ 3 | Loads emails from a text file and returns them as a list. 4 | 5 | Args: 6 | path (str): The file path to the text file. 7 | split_str (str, optional): The string used to split the text file into 8 | individual emails. 
Defaults to "From r ". 9 | 10 | Returns: 11 | list[str]: A list of emails extracted from the text file. 12 | """ 13 | with open(path, "r", encoding="utf-8", errors="ignore") as file: 14 | text = file.read() 15 | 16 | emails = text.split(split_str) 17 | 18 | return emails 19 | -------------------------------------------------------------------------------- /mlflow_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/mlflow_models/.DS_Store -------------------------------------------------------------------------------- /mlflow_models/MLProject: -------------------------------------------------------------------------------- 1 | name: hp_search 2 | 3 | python_env: python_env.yaml 4 | 5 | entry_points: 6 | # Use Hyperopt to optimize hyperparams of the train entry_point. 7 | search_params: 8 | parameters: 9 | training_data: {type: string, default: "hemanthsai7/loandefault"} 10 | max_runs: {type: int, default: 10} 11 | model_type: {type: str, default: "hgbt"} 12 | command: "python -O search_params.py {training_data} --max-runs {max_runs} --model-type {model_type}" 13 | 14 | # train Random Forest model with default HPs 15 | train_rf: 16 | parameters: 17 | dset_name: {type: string, default: "sgpjesus/bank-account-fraud-dataset-neurips-2022"} 18 | max_depth: {type: int, default: 5} 19 | max_features: {type: float, default: 0.1} 20 | class_weight: {type: str, default: "balanced"} 21 | min_samples_leaf: {type: int, default: 10} 22 | command: "python train_rf.py {dset_name} 23 | --max-depth {max_depth} 24 | --max-features {max_features} 25 | --class-weight {class_weight} 26 | --min-samples-leaf {min_samples_leaf}" 27 | 28 | # train HistGradientBoosted model with default parameters 29 | train_hgbt: 30 | parameters: 31 | dset_name: {type: string, default: "sgpjesus/bank-account-fraud-dataset-neurips-2022"} 32 | max_depth: {type: int, default: 20} 33 | learning_rate: {type: float, default: 0.1} 34 | class_weight: {type: str, default: "balanced"} 35 | max_leaf_nodes: {type: int, default: 31} 36 | l2_regularization: {type: int, default: 1.} 37 | command: "python train_hgbt.py {dset_name} 38 | --max-depth {max_depth} 39 | --learning-rate {learning_rate} 40 | --class-weight {class_weight} 41 | --max-leaf-nodes {max_leaf_nodes} 42 | --l2-regularization {l2_regularization}" -------------------------------------------------------------------------------- /mlflow_models/model_search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "\n", 11 | "import mlflow\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import requests\n", 15 | "from mlflow.tracking import MlflowClient\n", 16 | "\n", 17 | "from train_rf import CATEGORICAL_FEATURES, NUMERICAL_FEATURES\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "current_experiment=dict(mlflow.get_experiment_by_name(\"loan\"))\n", 27 | "experiment_id=current_experiment['experiment_id']\n", 28 | "\n", 29 | "# Get this from UI or CLI\n", 30 | "rf_parent_run = \"03046a89d08346a5bda301cc7c745885\"" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Find the best model" 38 | ] 39 | }, 40 | 
{ 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Experiment had 10 HP tuning round\n", 50 | "Best run - 0 with PR AUC of 0.104\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "# To access MLFlow stuff we need to work with MlflowClient\n", 56 | "client = MlflowClient()\n", 57 | "\n", 58 | "# Searches runs for a specific attribute and filters them by Parent Run ID\n", 59 | "runs = client.search_runs(\n", 60 | " [experiment_id], \n", 61 | " f\"tags.mlflow.parentRunId = '{rf_parent_run}'\", \n", 62 | " order_by=[\"metrics.test_PR_AUC DESC\"]\n", 63 | ")\n", 64 | "\n", 65 | "# Select the best run according to test_PR_AUC metric\n", 66 | "best_run = np.argmax([f.data.metrics['test_PR_AUC'] for f in runs])\n", 67 | "best_pr_auc = np.round(runs[best_run].data.metrics['test_PR_AUC'], 4)\n", 68 | "\n", 69 | "print(f\"Experiment had {len(runs)} HP tuning round\")\n", 70 | "print(f\"Best run - {best_run} with PR AUC of {best_pr_auc}\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Best model URI - runs:/1d2537d89cb04760b3b9bc501ee0854f/sklearn_models\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "# log-model history is stored as string, so we need to \"jsonify\" it first\n", 88 | "log_model_info = json.loads(runs[best_run].data.tags['mlflow.log-model.history'])[0]\n", 89 | "\n", 90 | "# Construct a valid model URI\n", 91 | "model_uri = 'runs:/' + log_model_info['run_id'] + '/' + log_model_info['artifact_path']\n", 92 | "print(f\"Best model URI - {model_uri}\")\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Load the best model" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 16, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# Data sample to test the model\n", 109 | "data = pd.read_csv(\"./data/raw/train.csv\", nrows=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stderr", 119 | "output_type": "stream", 120 | "text": [ 121 | "2023/02/14 11:46:29 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:\n", 122 | " - category-encoders (current: 2.6.0, required: category-encoders==2.3.0)\n", 123 | "To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.\n" 124 | ] 125 | }, 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "array([[0.4980769, 0.5019231]])" 130 | ] 131 | }, 132 | "execution_count": 17, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "# Load the model as pyfunc\n", 139 | "sklearn_pyfunc = mlflow.pyfunc.load_model(model_uri=model_uri)\n", 140 | "sklearn_pyfunc.predict(data)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Register and Promote" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 19, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "Successfully registered model 'loan_model'.\n", 160 | "2023/02/14 11:51:20 INFO 
mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: loan_model, version 1\n", 161 | "Created version '1' of model 'loan_model'.\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "model_name = 'loan_model'\n", 167 | "model_version = 1\n", 168 | "\n", 169 | "# Register model\n", 170 | "mlflow.register_model(model_uri, model_name)\n", 171 | "\n", 172 | "# Promote to Production\n", 173 | "logs = client.transition_model_version_stage(name=model_name, version=model_version, stage=\"Production\")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Load from Production Model Registry" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 20, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stderr", 190 | "output_type": "stream", 191 | "text": [ 192 | "2023/02/14 11:54:25 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:\n", 193 | " - category-encoders (current: 2.6.0, required: category-encoders==2.3.0)\n", 194 | "To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.\n" 195 | ] 196 | }, 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "array([[0.4980769, 0.5019231]])" 201 | ] 202 | }, 203 | "execution_count": 20, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "stage = 'Production'\n", 210 | "\n", 211 | "# Since it's a registered model in Production, we can load it like this now!\n", 212 | "# No need for model URIs\n", 213 | "model_registry_path = f'models:/{model_name}/{stage}'\n", 214 | "production_model = mlflow.pyfunc.load_model(model_registry_path)\n", 215 | "\n", 216 | "production_model.predict(data)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Serve models" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "Run this command in the terminal: `mlflow models serve --model-uri models:/loan_model/Production -p 5001`" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### Call from server" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 22, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# Prepare the data to be sent to API\n", 247 | "example = data[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]\n", 248 | "to_send = example.to_dict(orient='split')\n", 249 | "to_send.pop(\"index\", None)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 21, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Prediction endpoint\n", 259 | "url = 'http://127.0.0.1:5001/invocations'\n", 260 | "\n", 261 | "# Preprocess the example\n", 262 | "response = requests.post(url=url, data=json.dumps({\"dataframe_split\" :to_send}), headers={\"Content-type\": \"application/json\"})\n", 263 | "\n", 264 | "# Load the response\n", 265 | "response_json = json.loads(response.text)\n", 266 | "print(response_json)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [] 275 | } 276 | ], 277 | "metadata": { 278 | "interpreter": { 279 | "hash": 
"a2df742b932880654a3f6652148a9c802dc0dfad475f6beda4797814052023f2" 280 | }, 281 | "kernelspec": { 282 | "display_name": "Python 3.9.13 ('base')", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.9.13" 297 | }, 298 | "orig_nbformat": 4 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 2 302 | } 303 | -------------------------------------------------------------------------------- /mlflow_models/python_env.yaml: -------------------------------------------------------------------------------- 1 | python: "3.9" 2 | build_dependencies: 3 | - pip 4 | dependencies: 5 | - numpy>=1.21 6 | - click>=8.0 7 | - pandas>=1.5 8 | - scipy>=1.7 9 | - scikit-learn==1.2.1 10 | - mlflow>=2.1 11 | - hyperopt==0.2.7 12 | - protobuf 13 | - kaggle 14 | - category-encoders==2.3.0 -------------------------------------------------------------------------------- /mlflow_models/search_params.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from hyperopt import fmin, hp, tpe 4 | from hyperopt.pyll import scope 5 | 6 | import mlflow.projects 7 | from mlflow.tracking import MlflowClient 8 | 9 | 10 | @click.command( 11 | help=""" 12 | Perform hyperparameter search with Hyperopt library. 13 | Optimize PR AUC. 14 | """ 15 | ) 16 | @click.option( 17 | "--max-runs", 18 | type=click.INT, 19 | default=10, 20 | help="Maximum number of runs to evaluate." 21 | ) 22 | @click.option( 23 | "--model-type", 24 | type=click.STRING, 25 | default="hgbt", 26 | help="Model type to tune" 27 | ) 28 | @click.argument("training_data") 29 | def train(training_data, max_runs, model_type): 30 | """ 31 | Run hyperparameter optimization. 32 | """ 33 | # create random file to store run ids of the training tasks 34 | tracking_client = MlflowClient() 35 | 36 | def new_eval(experiment_id): 37 | """ 38 | Create a new eval function 39 | :experiment_id: Experiment id for the training run 40 | :return: new eval function. 41 | """ 42 | 43 | def eval(params): 44 | """ 45 | Train sklearn model with given parameters by invoking MLflow run. 46 | :param params: Parameters to the train script we optimize over 47 | :return: The metric value evaluated on the validation data. 48 | """ 49 | with mlflow.start_run(nested=True) as child_run: 50 | if model_type == "rf": 51 | # Params used to train RF 52 | ( 53 | max_depth, max_features, 54 | class_weight, min_samples_leaf 55 | ) = params 56 | # Run the training script as MLflow sub-run 57 | p = mlflow.projects.run( 58 | uri=".", 59 | entry_point="train_rf", 60 | run_id=child_run.info.run_id, 61 | parameters={ 62 | "dset_name": training_data, 63 | "max_depth": str(max_depth), 64 | "max_features": str(max_features), 65 | "class_weight": str(class_weight), 66 | "min_samples_leaf": str(min_samples_leaf), 67 | }, 68 | experiment_id=experiment_id, 69 | synchronous=False, 70 | ) 71 | # No idea why, but it's needed? 
72 | succeeded = p.wait() 73 | # Log params 74 | mlflow.log_params( 75 | { 76 | "max_depth": max_depth, 77 | "max_features": max_features, 78 | "class_weight": class_weight, 79 | "min_samples_leaf": min_samples_leaf, 80 | } 81 | ) 82 | elif model_type == "hgbt": 83 | # Params used to train HGBT 84 | ( 85 | max_depth, 86 | max_leaf_nodes, 87 | class_weight, 88 | l2_regularization, 89 | learning_rate, 90 | ) = params 91 | # Run the train_hgbt as sub-run 92 | p = mlflow.projects.run( 93 | uri=".", 94 | entry_point="train_hgbt", 95 | run_id=child_run.info.run_id, 96 | parameters={ 97 | "dset_name": training_data, 98 | "learning_rate": str(learning_rate), 99 | "max_leaf_nodes": str(max_leaf_nodes), 100 | "max_depth": str(max_depth), 101 | "class_weight": str(class_weight), 102 | "l2_regularization": str(l2_regularization), 103 | }, 104 | experiment_id=experiment_id, 105 | synchronous=False, 106 | ) 107 | succeeded = p.wait() 108 | mlflow.log_params( 109 | { 110 | "learning_rate": learning_rate, 111 | "max_leaf_nodes": max_leaf_nodes, 112 | "max_depth": max_depth, 113 | "class_weight": class_weight, 114 | "l2_regularization": l2_regularization, 115 | } 116 | ) 117 | print(succeeded) 118 | 119 | # Grab the test metrics from the MLflow run 120 | training_run = tracking_client.get_run(p.run_id) 121 | metrics = training_run.data.metrics 122 | test_prauc = metrics["test_PR_AUC"] 123 | 124 | return -test_prauc 125 | 126 | return eval 127 | 128 | if model_type == "rf": 129 | # Search space for RF 130 | space = [ 131 | scope.int(hp.quniform("max_depth", 1, 30, q=1)), 132 | hp.uniform("max_features", 0.05, 0.8), 133 | hp.choice("class_weight", ["balanced", None]), 134 | scope.int(hp.quniform("min_samples_leaf", 5, 100, q=5)), 135 | ] 136 | elif model_type == "hgbt": 137 | # Search space for HGBT 138 | space = [ 139 | scope.int(hp.quniform("max_depth", 1, 30, q=1)), 140 | scope.int(hp.quniform("max_leaf_nodes", 5, 100, q=5)), 141 | hp.choice("class_weight", ["balanced", None]), 142 | hp.uniform("l2_regularization", 0.0, 20.0), 143 | hp.uniform("learning_rate", 0.01, 0.1), 144 | ] 145 | else: 146 | raise ValueError(f"Model type {model_type} is not supported") 147 | 148 | # This starts the actual search_rf.py experiment run 149 | with mlflow.start_run() as run: 150 | # Get parent ID 151 | experiment_id = run.info.experiment_id 152 | 153 | # Optimisation function that takes parent id and search params as input 154 | best = fmin( 155 | fn=new_eval(experiment_id), 156 | space=space, 157 | algo=tpe.suggest, 158 | max_evals=max_runs, 159 | ) 160 | mlflow.set_tag("best params", str(best)) 161 | 162 | 163 | if __name__ == "__main__": 164 | train() 165 | -------------------------------------------------------------------------------- /mlflow_models/train_hgbt.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import click 4 | import mlflow 5 | import pandas as pd 6 | from sklearn.compose import ColumnTransformer 7 | from sklearn.ensemble import HistGradientBoostingClassifier 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.preprocessing import OrdinalEncoder 11 | 12 | from utils.columns import CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET 13 | from utils.data_utils import load_raw_data 14 | from utils.eval_utils import eval_and_log_metrics 15 | 16 | 17 | @click.command( 18 | help="Trains HGBT Model" 19 | "The input is expected in csv format." 
20 | "The model and its metrics are logged with mlflow." 21 | ) 22 | @click.option("--max-depth", type=click.INT, default=20, help="Depth of the trees") 23 | @click.option( 24 | "--max-leaf-nodes", 25 | type=click.INT, 26 | default=31, 27 | help="The maximum number of leaves for each tree", 28 | ) 29 | @click.option( 30 | "--class-weight", type=click.STRING, default="balanced", help="Weight of labels" 31 | ) 32 | @click.option( 33 | "--l2-regularization", 34 | type=click.FLOAT, 35 | default=1.0, 36 | help="The L2 regularization parameter", 37 | ) 38 | @click.option( 39 | "--learning-rate", 40 | type=click.FLOAT, 41 | default=0.1, 42 | help="The learning rate, also known as shrinkage", 43 | ) 44 | @click.argument("dset_name") 45 | def run( 46 | dset_name, max_depth, max_leaf_nodes, class_weight, l2_regularization, learning_rate 47 | ): 48 | """ 49 | This function trains and logs an HistGradientBoostingClassifier model on a dataset. 50 | 51 | :param dset_name: The name of the dataset to be used. (str) 52 | :param max_depth: The maximum depth of the decision tree. (int) 53 | :param max_leaf_nodes: The maximum number of leaf nodes in the decision tree. (int) 54 | :param class_weight: The weight to be given to different classes in the target column. (str or None) 55 | :param l2_regularization: The L2 regularization value to be used by the model. (float) 56 | :param learning_rate: The learning rate to be used by the model. (float) 57 | 58 | :returns: None 59 | 60 | The function starts an MLflow run and logs various metrics such as accuracy, precision, and recall. 61 | It also logs the trained model using the mlflow.sklearn.log_model function. 62 | """ 63 | warnings.filterwarnings("ignore") 64 | # Read data 65 | csv_loc = load_raw_data(dset_name, file_name="train.csv") 66 | data = pd.read_csv(csv_loc) 67 | 68 | # Transform categoricals into category type 69 | data[CATEGORICAL_FEATURES] = data[CATEGORICAL_FEATURES].astype("category") 70 | 71 | # Train/test split 72 | train, test = train_test_split(data, random_state=42) 73 | 74 | # Separate X and y 75 | train_x = train[NUMERICAL_FEATURES + CATEGORICAL_FEATURES] 76 | train_y = train[[TARGET]] 77 | test_x = test[NUMERICAL_FEATURES + CATEGORICAL_FEATURES] 78 | test_y = test[[TARGET]] 79 | 80 | # Start the experiemnt 81 | with mlflow.start_run(): 82 | # Pass the params into dictionary 83 | hgbt_params = { 84 | "learning_rate": learning_rate, 85 | "max_leaf_nodes": max_leaf_nodes, 86 | "max_depth": max_depth, 87 | "class_weight": class_weight if class_weight != "None" else None, 88 | "l2_regularization": l2_regularization, 89 | } 90 | # Define model 91 | hgbt = HistGradientBoostingClassifier( 92 | **hgbt_params, 93 | categorical_features=CATEGORICAL_FEATURES, 94 | max_iter=10000, 95 | early_stopping=True, 96 | validation_fraction=10 97 | ) 98 | # Define transform 99 | transformer = ColumnTransformer( 100 | transformers=[ 101 | ("categorical", OrdinalEncoder(), CATEGORICAL_FEATURES) 102 | ], # HGBT still needs this 103 | verbose_feature_names_out=False, # to not alter categorical names 104 | remainder="passthrough", 105 | ) 106 | # Define pipeline 107 | pipeline = Pipeline(steps=[("prep", transformer), ("model", hgbt)]).set_output( 108 | transform="pandas" 109 | ) 110 | # Fit the pipeline 111 | pipeline.fit(train_x, train_y) 112 | # Evaluate on testset 113 | test_preds = pipeline.predict_proba(test_x) 114 | eval_and_log_metrics("test", test_y, test_preds[:, 1]) 115 | # Save the pipeline 116 | mlflow.sklearn.log_model( 117 | pipeline, "sklearn_models", 
pyfunc_predict_fn="predict_proba" 118 | ) 119 | 120 | 121 | if __name__ == "__main__": 122 | run() 123 | -------------------------------------------------------------------------------- /mlflow_models/train_rf.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import click 4 | import mlflow 5 | import pandas as pd 6 | from category_encoders import WOEEncoder 7 | from sklearn.compose import ColumnTransformer 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.pipeline import Pipeline 11 | 12 | from utils.columns import CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET 13 | from utils.data_utils import load_raw_data 14 | from utils.eval_utils import eval_and_log_metrics 15 | 16 | 17 | @click.command( 18 | help="Trains RF Model" 19 | "The input is expected in csv format." 20 | "The model and its metrics are logged with mlflow." 21 | ) 22 | @click.option("--max-depth", type=click.INT, default=5, help="Depth of the trees") 23 | @click.option( 24 | "--max-features", type=click.FLOAT, default=0.1, help="Fraction of features to use" 25 | ) 26 | @click.option( 27 | "--class-weight", type=click.STRING, default="balanced", help="Weight of labels" 28 | ) 29 | @click.option( 30 | "--min-samples-leaf", 31 | type=click.INT, 32 | default=10, 33 | help="Minimum number of samples required to be at a leaf node.", 34 | ) 35 | @click.argument("dset_name") 36 | def run( 37 | dset_name: str, 38 | max_depth: int, 39 | max_features: float, 40 | class_weight: str, 41 | min_samples_leaf: int, 42 | ): 43 | """ 44 | This function trains and logs a Random Forest Classifier pipeline with mlflow. 45 | 46 | :param dset_name: The name of the dataset to use for training. 47 | :param max_depth: The maximum depth of the tree in the Random Forest Classifier. 48 | :param max_features: The maximum number of features to consider when looking for the best split. 49 | :param class_weight: The weighting of the classes. Can be None, 'balanced', or a dictionary. 50 | :param min_samples_leaf: The minimum number of samples required to be at a leaf node. 
51 | 52 | :return: None 53 | """ 54 | warnings.filterwarnings("ignore") 55 | # Read data 56 | csv_loc = load_raw_data(dset_name, file_name="train.csv") 57 | data = pd.read_csv(csv_loc) 58 | 59 | # Split data into train/test 60 | train, test = train_test_split(data, random_state=42) 61 | 62 | # Separate X and y 63 | train_x = train[NUMERICAL_FEATURES + CATEGORICAL_FEATURES] 64 | train_y = train[[TARGET]] 65 | 66 | test_x = test[NUMERICAL_FEATURES + CATEGORICAL_FEATURES] 67 | test_y = test[[TARGET]] 68 | 69 | # Star the experiment 70 | with mlflow.start_run(): 71 | # Pass the parameters into dictionary 72 | rf_params = { 73 | "max_depth": max_depth, 74 | "max_features": max_features, 75 | "class_weight": class_weight if class_weight != "None" else None, 76 | "min_samples_leaf": min_samples_leaf, 77 | } 78 | # Define model 79 | rf = RandomForestClassifier(**rf_params) 80 | # Define transform 81 | transformer = ColumnTransformer( 82 | transformers=[("categorical", WOEEncoder(), CATEGORICAL_FEATURES)], 83 | remainder="passthrough", 84 | ) 85 | # Define pipeline 86 | pipeline = Pipeline(steps=[("prep", transformer), ("model", rf)]) 87 | # Fit the model 88 | pipeline.fit(train_x, train_y) 89 | # Evaluate and log metrics 90 | test_preds = pipeline.predict_proba(test_x) 91 | eval_and_log_metrics("test", test_y, test_preds[:, 1]) 92 | # Save the model 93 | mlflow.sklearn.log_model( 94 | pipeline, "sklearn_models", pyfunc_predict_fn="predict_proba" 95 | ) # predict_proba because we want to predict a probabiltiy when deployed 96 | 97 | 98 | if __name__ == "__main__": 99 | run() 100 | -------------------------------------------------------------------------------- /mlflow_models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/mlflow_models/utils/__init__.py -------------------------------------------------------------------------------- /mlflow_models/utils/columns.py: -------------------------------------------------------------------------------- 1 | TARGET = 'Loan Status' 2 | 3 | NUMERICAL_FEATURES = [ 4 | "Loan Amount", 5 | "Funded Amount", 6 | "Funded Amount Investor", 7 | "Term", 8 | "Interest Rate", 9 | "Home Ownership", 10 | "Debit to Income", 11 | "Delinquency - two years", 12 | "Inquires - six months", 13 | "Open Account", 14 | "Public Record", 15 | "Revolving Balance", 16 | "Revolving Utilities", 17 | "Total Accounts", 18 | "Total Received Interest", 19 | "Total Received Late Fee", 20 | "Recoveries", 21 | "Collection Recovery Fee", 22 | "Collection 12 months Medical", 23 | "Last week Pay", 24 | "Accounts Delinquent", 25 | "Total Collection Amount", 26 | "Total Current Balance", 27 | "Total Revolving Credit Limit", 28 | ] 29 | 30 | CATEGORICAL_FEATURES = [ 31 | "Batch Enrolled", 32 | "Grade", 33 | "Sub Grade", 34 | "Employment Duration", 35 | "Verification Status", 36 | "Payment Plan", 37 | "Loan Title", 38 | "Initial List Status", 39 | "Application Type", 40 | ] 41 | -------------------------------------------------------------------------------- /mlflow_models/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | import kaggle 5 | 6 | 7 | def load_raw_data(dset_name: str, file_name: str,): 8 | """Downloads and unpacks Kaggle data 9 | 10 | Args: 11 | dset_name (str, optional): name of kaggle dataset. 12 | Follows the format - username/dataset-name. 
13 | For example - "sgpjesus/bank-account-fraud-dataset-neurips-2022". 14 | file_name (str, optional): name of the extracted file. 15 | Should be specified in case there are many files in the zip archive 16 | 17 | Raises: 18 | Exception: if kaggle API was not setup 19 | 20 | Returns: 21 | str: location of the downlaoded and extracted csv file 22 | """ 23 | zip_destination_folder = "./data/" 24 | raw_destination_folder = os.path.join(zip_destination_folder, "raw") 25 | 26 | # Check if the Kaggle API key was created 27 | if not os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json")): 28 | raise Exception( 29 | """ 30 | Kaggle API key not found. 31 | Make sure to follow the instructions to set up your Kaggle API key. 32 | """ 33 | ) 34 | 35 | # Download the dataset into a current folder 36 | kaggle.api.dataset_download_files(dset_name, path=zip_destination_folder) 37 | 38 | # Check if the destination folder exists, and create it if it does not 39 | if not os.path.exists(raw_destination_folder): 40 | os.makedirs(raw_destination_folder) 41 | 42 | # Open the zip file in read mode 43 | zip_name = os.path.join( 44 | zip_destination_folder, 45 | f"{dset_name.split('/')[1]}.zip" 46 | ) 47 | with zipfile.ZipFile(zip_name, "r") as zip_ref: 48 | # Extract all the files to the destination folder 49 | zip_ref.extractall(raw_destination_folder) 50 | 51 | csv_location = os.path.join(raw_destination_folder, file_name) 52 | 53 | return csv_location 54 | -------------------------------------------------------------------------------- /mlflow_models/utils/eval_utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import average_precision_score 2 | import mlflow 3 | 4 | def eval_and_log_metrics(prefix, actual, pred): 5 | pr = average_precision_score(actual, pred) 6 | mlflow.log_metric("{}_PR_AUC".format(prefix), pr) 7 | return pr -------------------------------------------------------------------------------- /mlflow_project/MLproject: -------------------------------------------------------------------------------- 1 | name: fraud_detection 2 | 3 | conda_env: conda_env.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | dset: {type: str, default: sgpjesus/bank-account-fraud-dataset-neurips-2022} 9 | n_trials: {type: int, default: 10} 10 | command: "python main.py {dset} {n_trials}" 11 | -------------------------------------------------------------------------------- /mlflow_project/conda_env.yaml: -------------------------------------------------------------------------------- 1 | name: fraud 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.9 6 | - pip 7 | - pip: 8 | - click 9 | - mlflow>=2.1 10 | - kaggle 11 | - polars 12 | - catboost 13 | - optuna 14 | - pandas 15 | - sklearn -------------------------------------------------------------------------------- /mlflow_project/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import mlflow 4 | from steps.download_data import load_raw_data 5 | from steps.preprocess_data import preprocess_data 6 | from steps.tune_model import tune_model 7 | from steps.train_final_model import train_model 8 | 9 | class bcolors: 10 | HEADER = '\033[95m' 11 | OKBLUE = '\033[94m' 12 | OKCYAN = '\033[96m' 13 | OKGREEN = '\033[92m' 14 | WARNING = '\033[93m' 15 | FAIL = '\033[91m' 16 | ENDC = '\033[0m' 17 | BOLD = '\033[1m' 18 | UNDERLINE = '\033[4m' 19 | 20 | 21 | def pipeline(): 22 | mlflow.set_experiment("fraud") 23 | file_location = 
load_raw_data(sys.argv[1]) 24 | print(f"{bcolors.OKCYAN}Data is loaded{bcolors.ENDC}") 25 | 26 | file_dirs = preprocess_data(file_location, missing_thr=0.95) 27 | print(f"{bcolors.OKCYAN}Data is preprocessed{bcolors.ENDC}") 28 | best_params = tune_model( 29 | train_path=file_dirs["train-data-dir"], 30 | val_path=file_dirs["val-data-dir"], 31 | n_trials=int(sys.argv[2]), 32 | ) 33 | print(f"{bcolors.OKCYAN}HP tuning is finished{bcolors.ENDC}") 34 | best_params["n_estimators"] = 1000 35 | best_params["objective"] = "Logloss" 36 | 37 | roc, pr = train_model( 38 | best_params, 39 | train_path=file_dirs["train-data-dir"], 40 | val_path=file_dirs["val-data-dir"], 41 | test_path=file_dirs["test-data-dir"], 42 | ) 43 | print(f"{bcolors.OKGREEN}Final model is trained. \nTestset ROC AUC: {roc}\nTestset PR AUC: {pr}{bcolors.ENDC}") 44 | 45 | 46 | if __name__ == "__main__": 47 | pipeline() 48 | -------------------------------------------------------------------------------- /mlflow_project/steps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/mlflow_project/steps/__init__.py -------------------------------------------------------------------------------- /mlflow_project/steps/download_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | import kaggle 5 | 6 | 7 | def load_raw_data(dset_name): 8 | zip_destination_folder = "./data/" 9 | raw_destination_folder = os.path.join(zip_destination_folder, "raw") 10 | 11 | # Check if the Kaggle API key was created 12 | if not os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json")): 13 | raise Exception( 14 | "Kaggle API key not found. Make sure to follow the instructions to set up your Kaggle API key." 
15 | ) 16 | 17 | # Download the dataset into a current folder 18 | kaggle.api.dataset_download_files( 19 | dset_name, 20 | path=zip_destination_folder, 21 | ) 22 | 23 | # Check if the destination folder exists, and create it if it does not 24 | if not os.path.exists(raw_destination_folder): 25 | os.makedirs(raw_destination_folder) 26 | 27 | # Open the zip file in read mode 28 | zip_name = os.path.join( 29 | zip_destination_folder, "bank-account-fraud-dataset-neurips-2022.zip" 30 | ) 31 | with zipfile.ZipFile(zip_name, "r") as zip_ref: 32 | # Extract all the files to the destination folder 33 | zip_ref.extractall(raw_destination_folder) 34 | 35 | # TODO: make file name a param as well 36 | csv_location = os.path.join(raw_destination_folder, "Base.csv") 37 | 38 | return csv_location -------------------------------------------------------------------------------- /mlflow_project/steps/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import polars as pl 3 | 4 | 5 | def process_nans(df: pl.DataFrame, drop_thr: float = 0.95) -> pl.DataFrame: 6 | for col in df.get_columns(): 7 | nulls_prop = col.is_null().mean() 8 | print(f"{col.name} - {nulls_prop * 100}% missing") 9 | # drop if missing more than a threshold 10 | if nulls_prop >= drop_thr: 11 | print("Dropping", col.name) 12 | df = df.select([pl.exclude(col.name)]) 13 | # If some values are missing 14 | elif nulls_prop > 0: 15 | print("Imputing", col.name) 16 | # If numeric, impute with median 17 | if col.is_numeric(): 18 | fill_value = col.median() 19 | else: 20 | # Else, impute with mode 21 | fill_value = col.mode() 22 | df = df.select( 23 | [ 24 | # Exclude the original column 25 | pl.exclude(col.name), 26 | # Include the imputed one 27 | pl.col(col.name).fill_null(value=fill_value), 28 | ] 29 | ) 30 | 31 | return df 32 | 33 | def drop_static(df:pl.DataFrame) -> pl.DataFrame: 34 | for col in df.get_columns(): 35 | std = col.std() 36 | # drop if missing more than a threshold 37 | if std == 0: 38 | print("Dropping", col.name) 39 | df = df.select([pl.exclude(col.name)]) 40 | 41 | return df 42 | 43 | 44 | def train_val_test_split(df, test_size=0.2, val_size=0.2): 45 | df_train = df.filter( 46 | pl.col("month") < df['month'].quantile(0.8) 47 | ) 48 | 49 | df_test = df.filter( 50 | pl.col("month") >= df['month'].quantile(0.8) 51 | ) 52 | 53 | df_val = df_train.filter( 54 | pl.col("month") >= df_train['month'].quantile(0.8) 55 | ) 56 | 57 | df_train = df_train.filter( 58 | pl.col("month") < df_train['month'].quantile(0.8) 59 | ) 60 | 61 | return df_train, df_val, df_test 62 | 63 | def preprocess_data(dset_path, missing_thr): 64 | df = pl.read_csv(dset_path) 65 | # Preprocess nulls 66 | df = process_nans(df, missing_thr) 67 | # Drop static 68 | df = drop_static(df) 69 | # Train/val/test split 70 | train_df, val_df, test_df = train_val_test_split(df) 71 | # Save data 72 | split_destination_folder = './data/processed' 73 | if not os.path.exists(split_destination_folder): 74 | os.makedirs(split_destination_folder) 75 | 76 | train_df.write_parquet('./data/processed/train.parquet') 77 | val_df.write_parquet('./data/processed/validation.parquet') 78 | test_df.write_parquet('./data/processed/test.parquet') 79 | 80 | file_locations = { 81 | 'train-data-dir': './data/processed/train.parquet', 82 | 'val-data-dir': './data/processed/validation.parquet', 83 | 'test-data-dir': './data/processed/test.parquet', 84 | } 85 | 86 | return file_locations 
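# (Added note, illustrative only) This step can also be run on its own, assuming the Kaggle
# download step has already produced ./data/raw/Base.csv:
#
#   paths = preprocess_data("./data/raw/Base.csv", missing_thr=0.95)
#   print(paths["train-data-dir"])  # -> ./data/processed/train.parquet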
-------------------------------------------------------------------------------- /mlflow_project/steps/train_final_model.py: -------------------------------------------------------------------------------- 1 | import catboost as cb 2 | import click 3 | import pandas as pd 4 | from sklearn.metrics import average_precision_score, roc_auc_score 5 | from steps.tune_model import CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET, read_cb_data 6 | import mlflow 7 | 8 | def train_model(params, train_path, val_path, test_path): 9 | train_dataset = read_cb_data( 10 | train_path, 11 | numeric_features=NUMERICAL_FEATURES, 12 | categorical_features=CATEGORICAL_FEATURES, 13 | target_feature=TARGET 14 | ) 15 | val_dataset = read_cb_data( 16 | val_path, 17 | numeric_features=NUMERICAL_FEATURES, 18 | categorical_features=CATEGORICAL_FEATURES, 19 | target_feature=TARGET 20 | ) 21 | test_dataset = read_cb_data( 22 | test_path, 23 | numeric_features=NUMERICAL_FEATURES, 24 | categorical_features=CATEGORICAL_FEATURES, 25 | target_feature=TARGET 26 | ) 27 | mlflow.set_experiment("fraud") 28 | experiment = mlflow.get_experiment_by_name("fraud") 29 | client = mlflow.tracking.MlflowClient() 30 | run = client.create_run(experiment.experiment_id) 31 | with mlflow.start_run(run_id = run.info.run_id): 32 | gbm = cb.CatBoostClassifier(**params) 33 | gbm.fit(train_dataset, eval_set=val_dataset, early_stopping_rounds=50) 34 | preds = gbm.predict_proba(test_dataset) 35 | ap = average_precision_score(test_dataset.get_label(), preds[:, 1]) 36 | roc = roc_auc_score(test_dataset.get_label(), preds[:, 1]) 37 | 38 | mlflow.log_metric("Test ROC AUC", roc) 39 | mlflow.log_metric("Test PR AUC", ap) 40 | mlflow.log_params(params) 41 | mlflow.catboost.log_model(gbm, "catboost_model") 42 | 43 | return roc, ap 44 | 45 | 46 | if __name__ == "__main__": 47 | train_model() 48 | -------------------------------------------------------------------------------- /mlflow_project/steps/tune_model.py: -------------------------------------------------------------------------------- 1 | import catboost as cb 2 | import mlflow 3 | import optuna 4 | import pandas as pd 5 | from optuna.integration.mlflow import MLflowCallback 6 | from sklearn.metrics import average_precision_score, roc_auc_score 7 | 8 | 9 | TARGET = "fraud_bool" 10 | 11 | CATEGORICAL_FEATURES = [ 12 | "payment_type", 13 | "employment_status", 14 | "housing_status", 15 | "source", 16 | "device_os", 17 | ] 18 | NUMERICAL_FEATURES = [ 19 | "income", 20 | "name_email_similarity", 21 | "prev_address_months_count", 22 | "current_address_months_count", 23 | "customer_age", 24 | "days_since_request", 25 | "intended_balcon_amount", 26 | "zip_count_4w", 27 | "velocity_6h", 28 | "velocity_24h", 29 | "velocity_4w", 30 | "bank_branch_count_8w", 31 | "date_of_birth_distinct_emails_4w", 32 | "credit_risk_score", 33 | "email_is_free", 34 | "phone_home_valid", 35 | "phone_mobile_valid", 36 | "bank_months_count", 37 | "has_other_cards", 38 | "proposed_credit_limit", 39 | "foreign_request", 40 | "session_length_in_minutes", 41 | "keep_alive_session", 42 | "device_distinct_emails_8w", 43 | "month", 44 | ] 45 | 46 | 47 | def read_cb_data( 48 | path: str, numeric_features: list, categorical_features: list, target_feature: str 49 | ): 50 | data = pd.read_parquet(path) 51 | dataset = cb.Pool( 52 | data=data[numeric_features + categorical_features], 53 | label=data[target_feature], 54 | cat_features=categorical_features, 55 | ) 56 | return dataset 57 | 58 | 59 | def tune_model(train_path, val_path, n_trials): 
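    # (Added note) Each Optuna trial below trains a CatBoost model on the training Pool,
    # logs its parameters and validation PR/ROC AUC to the "fraud" MLflow experiment as a
    # separate run, and returns the validation PR AUC, which the study maximises.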
60 | train_dataset = read_cb_data( 61 | train_path, 62 | numeric_features=NUMERICAL_FEATURES, 63 | categorical_features=CATEGORICAL_FEATURES, 64 | target_feature=TARGET, 65 | ) 66 | val_dataset = read_cb_data( 67 | val_path, 68 | numeric_features=NUMERICAL_FEATURES, 69 | categorical_features=CATEGORICAL_FEATURES, 70 | target_feature=TARGET, 71 | ) 72 | 73 | def objective(trial): 74 | mlflow.set_experiment("fraud") 75 | experiment = mlflow.get_experiment_by_name("fraud") 76 | client = mlflow.tracking.MlflowClient() 77 | run = client.create_run(experiment.experiment_id) 78 | with mlflow.start_run(run_id = run.info.run_id): 79 | param = { 80 | "n_estimators": 1000, 81 | "objective": "Logloss", 82 | "subsample": trial.suggest_uniform("subsample", 0.4, 1.0), 83 | "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-3, 10.0), 84 | "learning_rate": trial.suggest_uniform("learning_rate", 0.006, 0.02), 85 | "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 0.5), 86 | "depth": trial.suggest_int("depth", 2, 12), 87 | "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300), 88 | } 89 | mlflow.log_params(param) 90 | gbm = cb.CatBoostClassifier(**param) 91 | gbm.fit(train_dataset, eval_set=val_dataset, early_stopping_rounds=50) 92 | 93 | preds = gbm.predict_proba(val_dataset) 94 | ap = average_precision_score(val_dataset.get_label(), preds[:, 1]) 95 | roc = roc_auc_score(val_dataset.get_label(), preds[:, 1]) 96 | mlflow.log_metric("Val PR AUC", ap) 97 | mlflow.log_metric("Val ROC AUC", roc) 98 | return ap 99 | 100 | study = optuna.create_study(direction="maximize") 101 | study.optimize(objective, n_trials=n_trials) 102 | return study.best_trial.params 103 | -------------------------------------------------------------------------------- /polars/data_preparation_pipeline.py: -------------------------------------------------------------------------------- 1 | """Pipeline script to prepare and save data for modelling""" 2 | import time 3 | 4 | import polars as pl 5 | import yaml 6 | 7 | from data_utils.feature_engineering import ( 8 | add_period_features, 9 | add_rolling_features, 10 | basic_feature_engineering, 11 | ) 12 | from data_utils.processing import clean_data, read_category_mappings 13 | from data_utils.transfomation import create_target_df 14 | 15 | 16 | def pipeline(): 17 | """Pipeline that reads, cleans, and transofrms data into 18 | the format we need for modelling 19 | """ 20 | # Read and unwrap the config 21 | with open("pipe_config.yaml", "r") as file: 22 | pipe_config = yaml.safe_load(file) 23 | 24 | date_column_format = pipe_config["date_column_format"] 25 | ratios_config = pipe_config["ratio_features"] 26 | diffs_config = pipe_config["difference_features"] 27 | dates_config = pipe_config["date_features"] 28 | 29 | id_to_category = read_category_mappings(pipe_config["category_map_path"]) 30 | col_mappings = {"category_id": id_to_category} 31 | 32 | output_data = ( 33 | pl.scan_csv(pipe_config["data_path"]) 34 | .pipe(clean_data, date_column_format, col_mappings) 35 | .pipe(basic_feature_engineering, ratios_config, diffs_config, dates_config) 36 | .pipe( 37 | create_target_df, 38 | time_to_trending_thr=pipe_config["max_time_to_trending"], 39 | original_join_cols=pipe_config["join_columns"], 40 | other_cols=pipe_config["base_columns"], 41 | ) 42 | .pipe( 43 | add_rolling_features, 44 | "first_day_in_trending", 45 | pipe_config["aggregate_windows"], 46 | ) 47 | .pipe( 48 | add_period_features, 49 | "first_day_in_trending", 50 | 
pipe_config["aggregate_windows"], 51 | ) 52 | ).collect() 53 | 54 | return output_data 55 | 56 | 57 | if __name__ == "__main__": 58 | t0 = time.time() 59 | output = pipeline() 60 | t1 = time.time() 61 | print("Pipeline took", t1 - t0, "seconds") 62 | print("Output shape", output.shape) 63 | print("Output columns:", output.columns) 64 | output.write_parquet("./data/modelling_data.parquet") 65 | -------------------------------------------------------------------------------- /polars/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/polars/data_utils/__init__.py -------------------------------------------------------------------------------- /polars/data_utils/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import polars as pl 4 | 5 | 6 | def ratio_features(features_config: Dict[str, List[str]]) -> List[pl.Expr]: 7 | expressions = [] 8 | for name, cols in features_config.items(): 9 | expressions.append((pl.col(cols[0]) / pl.col(cols[1])).alias(name)) 10 | 11 | return expressions 12 | 13 | 14 | def diff_features(features_config: Dict[str, List[str]]) -> List[pl.Expr]: 15 | expressions = [] 16 | for name, cols in features_config.items(): 17 | expressions.append((pl.col(cols[0]) - pl.col(cols[1])).alias(name)) 18 | 19 | return expressions 20 | 21 | 22 | def date_features(features_config: Dict[str, List[str]]) -> List[pl.Expr]: 23 | expressions = [] 24 | for col, features in features_config.items(): 25 | if "weekday" in features: 26 | expressions.append(pl.col(col).dt.weekday().alias(f"{col}_weekday")) 27 | if "month" in features: 28 | expressions.append(pl.col(col).dt.month().alias(f"{col}_month")) 29 | if "year" in features: 30 | expressions.append(pl.col(col).dt.year().alias(f"{col}_year")) 31 | 32 | return expressions 33 | 34 | 35 | def basic_feature_engineering( 36 | data: pl.LazyFrame, 37 | ratios_config: Dict[str, List[str]], 38 | diffs_config: Dict[str, List[str]], 39 | dates_config: Dict[str, List[str]], 40 | ) -> pl.LazyFrame: 41 | ratio_expressions = ratio_features(ratios_config) 42 | date_diff_expressions = diff_features(diffs_config) 43 | date_expressions = date_features(dates_config) 44 | 45 | data = data.with_columns( 46 | ratio_expressions + date_diff_expressions + date_expressions 47 | ) 48 | return data 49 | 50 | 51 | def build_channel_rolling(df: pl.LazyFrame, date_col: str, period: int) -> pl.LazyFrame: 52 | channel_aggs = ( 53 | df.sort(date_col) 54 | .groupby_rolling( 55 | index_column=date_col, 56 | period=f"{period}d", 57 | by="channel_title", 58 | closed="left", # 59 | ) 60 | .agg( 61 | pl.col("video_id") 62 | .n_unique() 63 | .alias(f"channel_num_trending_videos_last_{period}_days"), 64 | pl.col("days_in_trending") 65 | .max() 66 | .alias(f"channel_max_days_in_trending_{period}_days"), 67 | pl.col("days_in_trending") 68 | .mean() 69 | .alias(f"channel_avg_days_in_trending_{period}_days"), 70 | ) 71 | .fill_null(0) 72 | ) 73 | 74 | return channel_aggs 75 | 76 | 77 | def add_rolling_features( 78 | df: pl.LazyFrame, date_col: str, periods: List[int] 79 | ) -> pl.LazyFrame: 80 | for period in periods: 81 | rolling_features = build_channel_rolling(df, date_col, period) 82 | df = df.join(rolling_features, on=["channel_title", "first_day_in_trending"]) 83 | 84 | return df 85 | 86 | 87 | def build_period_features(df: pl.LazyFrame, 
date_col: str, period: int) -> pl.LazyFrame: 88 | general_aggs = ( 89 | df.sort(date_col) 90 | .groupby_dynamic( 91 | index_column=date_col, 92 | every="1d", 93 | period=f"{period}d", 94 | closed="left", 95 | ) 96 | .agg( 97 | pl.col("video_id") 98 | .n_unique() 99 | .alias(f"general_num_trending_videos_last_{period}_days"), 100 | pl.col("days_in_trending") 101 | .max() 102 | .alias(f"general_max_days_in_trending_{period}_days"), 103 | pl.col("days_in_trending") 104 | .mean() 105 | .alias(f"general_avg_days_in_trending_{period}_days"), 106 | ) 107 | .with_columns( 108 | # shift match values with previous period 109 | pl.col(f"general_num_trending_videos_last_{period}_days").shift(period), 110 | pl.col(f"general_max_days_in_trending_{period}_days").shift(period), 111 | pl.col(f"general_avg_days_in_trending_{period}_days").shift(period), 112 | ) 113 | .fill_null(0) 114 | ) 115 | 116 | return general_aggs 117 | 118 | 119 | def add_period_features( 120 | df: pl.LazyFrame, date_col: str, periods: List[int] 121 | ) -> pl.LazyFrame: 122 | for period in periods: 123 | rolling_features = build_period_features(df, date_col, period) 124 | df = df.join(rolling_features, on=["first_day_in_trending"]) 125 | 126 | return df 127 | -------------------------------------------------------------------------------- /polars/data_utils/processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict, List 3 | 4 | import polars as pl 5 | 6 | 7 | def read_category_mappings(path: str) -> Dict[int, str]: 8 | with open(path, "r") as f: 9 | categories = json.load(f) 10 | 11 | id_to_category = {} 12 | for c in categories["items"]: 13 | id_to_category[int(c["id"])] = c["snippet"]["title"] 14 | 15 | return id_to_category 16 | 17 | 18 | def parse_dates(date_cols: Dict[str, str]) -> List[pl.Expr]: 19 | expressions = [] 20 | for date_col, fmt in date_cols.items(): 21 | expressions.append(pl.col(date_col).str.to_date(format=fmt)) 22 | 23 | return expressions 24 | 25 | 26 | def map_dict_columns( 27 | mapping_cols: Dict[str, Dict[str | int, str | int]] 28 | ) -> List[pl.Expr]: 29 | expressions = [] 30 | for col, mapping in mapping_cols.items(): 31 | expressions.append(pl.col(col).map_dict(mapping)) 32 | return expressions 33 | 34 | 35 | def clean_data( 36 | df: pl.LazyFrame, 37 | date_cols_config: Dict[str, str], 38 | mapping_cols_config: Dict[str, Dict[str | int, str | int]], 39 | ) -> pl.LazyFrame: 40 | parse_dates_expressions = parse_dates(date_cols=date_cols_config) 41 | mapping_expressions = map_dict_columns(mapping_cols_config) 42 | 43 | df = df.with_columns(parse_dates_expressions + mapping_expressions) 44 | return df 45 | -------------------------------------------------------------------------------- /polars/data_utils/transfomation.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import polars as pl 4 | 5 | 6 | def join_original_features( 7 | main: pl.LazyFrame, 8 | original: pl.LazyFrame, 9 | main_join_cols: List[str], 10 | original_join_cols: List[str], 11 | other_cols: List[str], 12 | ) -> pl.LazyFrame: 13 | original_features = original.select(original_join_cols + other_cols).unique( 14 | original_join_cols 15 | ) # unique ensures one row per video + date 16 | main = main.join( 17 | original_features, 18 | left_on=main_join_cols, 19 | right_on=original_join_cols, 20 | how="left", 21 | ) 22 | 23 | return main 24 | 25 | 26 | def create_target_df( 27 | df: 
pl.LazyFrame, 28 | time_to_trending_thr: int, 29 | original_join_cols: List[str], 30 | other_cols: List[str], 31 | ) -> pl.LazyFrame: 32 | # Create a DF with video ID per row and corresponding days to trending and days in trending (target) 33 | target = ( 34 | df.groupby(["video_id"]) 35 | .agg( 36 | pl.col("days_to_trending").min().dt.days(), 37 | pl.col("trending_date").min().dt.date().alias("first_day_in_trending"), 38 | pl.col("trending_date").max().dt.date().alias("last_day_in_trending"), 39 | (pl.col("trending_date").max() - pl.col("trending_date").min()) 40 | .dt.days() 41 | .alias("days_in_trending"), 42 | ) 43 | .filter(pl.col("days_to_trending") <= time_to_trending_thr) 44 | ) 45 | 46 | # Join features to the aggregates 47 | target = join_original_features( 48 | main=target, 49 | original=df, 50 | main_join_cols=["video_id", "first_day_in_trending"], 51 | original_join_cols=original_join_cols, 52 | other_cols=other_cols, 53 | ) 54 | 55 | return target 56 | -------------------------------------------------------------------------------- /polars/pipe_config.yaml: -------------------------------------------------------------------------------- 1 | data_path: "./youtube/GBvideos.csv" 2 | category_map_path: "./youtube/GB_category_id.json" 3 | 4 | # Pre-processing config 5 | date_column_format: 6 | trending_date: "%y.%d.%m" 7 | publish_time: "%Y-%m-%dT%H:%M:%S%.fZ" 8 | 9 | # Feature engineering config 10 | ratio_features: 11 | likes_to_dislikes: 12 | - likes 13 | - dislikes 14 | likes_to_views: 15 | - likes 16 | - views 17 | comments_to_views: 18 | - comment_count 19 | - views 20 | 21 | difference_features: 22 | days_to_trending: 23 | - trending_date 24 | - publish_time 25 | 26 | date_features: 27 | trending_date: 28 | - weekday 29 | 30 | # Filtering config 31 | max_time_to_trending: 60 32 | 33 | # Features config 34 | join_columns: 35 | - video_id 36 | - trending_date 37 | 38 | base_columns: 39 | - views 40 | - likes 41 | - dislikes 42 | - comment_count 43 | - comments_disabled 44 | - ratings_disabled 45 | - video_error_or_removed 46 | - likes_to_dislikes 47 | - likes_to_views 48 | - comments_to_views 49 | - trending_date_weekday 50 | - channel_title 51 | - tags 52 | - description 53 | - category_id 54 | 55 | aggregate_windows: 56 | - 7 57 | - 30 58 | - 180 59 | -------------------------------------------------------------------------------- /pyspark/cleaning.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import DataFrame 3 | 4 | 5 | def get_static(data: DataFrame, cols_to_analyse): 6 | """Return the list of static columns 7 | 8 | Args: 9 | data (DataFrame): input PySpark dataframe 10 | cols_to_analyse (list[str]): list of columns to analyse 11 | 12 | Returns: 13 | list[str]: list of static columns 14 | """ 15 | unique_counts = data.agg( 16 | *(F.countDistinct(F.col(c)).alias(c) for c in cols_to_analyse) 17 | ).first() 18 | static_cols = [c for c in unique_counts.asDict() if unique_counts[c] == 1] 19 | print("Static columns:", static_cols) 20 | return static_cols 21 | 22 | 23 | def remove_rare_categories( 24 | data: DataFrame, columns, min_count: int = 100 25 | ) -> DataFrame: 26 | """Removes rare categories in categorical features by substituting 27 | them with 'Other' 28 | 29 | Args: 30 | data (DataFrame): input PySpark dataframe 31 | columns (list[str]): list of categorical features to process 32 | min_count (int, optional): minimum number of times for category 33 | to appear to not be 
considered rare. Defaults to 100. 34 | 35 | Returns: 36 | DataFrame: processed PySpark dataframe 37 | """ 38 | categorical_valid_values = {} 39 | 40 | for c in columns: 41 | # Find frequent values 42 | categorical_valid_values[c] = ( 43 | data.groupby(c) 44 | .count() 45 | .filter(F.col("count") > min_count) 46 | .select(c) 47 | .toPandas() 48 | .values.ravel() 49 | ) 50 | 51 | data = data.withColumn( 52 | c, 53 | F.when( 54 | F.col(c).isin(list(categorical_valid_values[c])), F.col(c) 55 | ).otherwise(F.lit("Other").alias(c)), 56 | ) 57 | 58 | return data 59 | -------------------------------------------------------------------------------- /pyspark/conda_env.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - python=3.10 4 | - pyyaml=6.0.1 5 | - mmh3==4.0.1 6 | - numpy==1.26.2 7 | - pandas=2.0.3 8 | - pyspark==3.5.0 9 | - hyperopt==0.2.7 10 | name: iot -------------------------------------------------------------------------------- /pyspark/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | categorical_features: 3 | - proto 4 | - service 5 | - conn_state 6 | - history 7 | filepaths: 8 | - ./iot_malware/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv 9 | model_output_path: ./pipeline 10 | na_fill_vals: 11 | conn_state: missing 12 | duration: -999999 13 | history: missing 14 | orig_bytes: -999999 15 | orig_ip_bytes: -999999 16 | orig_pkts: -999999 17 | proto: missing 18 | resp_bytes: -999999 19 | resp_ip_bytes: -999999 20 | resp_pkts: -999999 21 | service: missing 22 | numerical_features: 23 | - duration 24 | - orig_bytes 25 | - resp_bytes 26 | - missed_bytes 27 | - orig_pkts 28 | - orig_ip_bytes 29 | - resp_pkts 30 | - resp_ip_bytes 31 | random_split: true 32 | tuning_rounds: 0 33 | # - ./iot_malware/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv 34 | # - ./iot_malware/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv 35 | # - ./iot_malware/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv 36 | # - ./iot_malware/CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv 37 | # - ./iot_malware/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv 38 | # - ./iot_malware/CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv 39 | # - ./iot_malware/CTU-IoT-Malware-Capture-48-1conn.log.labeled.csv 40 | # - ./iot_malware/CTU-IoT-Malware-Capture-60-1conn.log.labeled.csv 41 | # - ./iot_malware/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv -------------------------------------------------------------------------------- /pyspark/feature_engineering.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import Column, Window, WindowSpec 3 | 4 | 5 | def mins_to_secs(mins: int) -> int: 6 | """Transforms minutes to seconds 7 | 8 | Args: 9 | mins (int): number of minutes to be transformed 10 | 11 | Returns: 12 | int: numeber of seconds 13 | """ 14 | return mins * 60 15 | 16 | 17 | def generate_window( 18 | window_in_minutes: int, partition_by: str, timestamp_col: str 19 | ) -> WindowSpec: 20 | """Generates window expressions for PySpark 21 | 22 | Args: 23 | window_in_minutes (int): Number of minutes you want in the rolling window 24 | partition_by (str): Column to partition by e.g. 
IP or user account 25 | timestamp_col (str): Column with timestamp data type 26 | 27 | Returns: 28 | WindowSpec: window specification to use with rolling aggregate expressions 29 | """ 30 | window = ( 31 | Window() 32 | .partitionBy(F.col(partition_by)) 33 | .orderBy(F.col(timestamp_col).cast("long")) 34 | .rangeBetween(-mins_to_secs(window_in_minutes), -1) 35 | ) 36 | 37 | return window 38 | 39 | 40 | def generate_rolling_aggregate( 41 | col: str, 42 | partition_by: str | None = None, 43 | operation: str = "count", 44 | timestamp_col: str = "dt", 45 | window_in_minutes: int = 1, 46 | ) -> Column: 47 | """Rolling aggregate expression constructor 48 | 49 | Args: 50 | col (str): Name of column to aggregate 51 | partition_by (str | None, optional): Column to partition by. Defaults to None. 52 | operation (str, optional): What type of aggregation should be done. Defaults to "count". 53 | timestamp_col (str, optional): Timestamp column in your PySpark DF. Defaults to "dt". 54 | window_in_minutes (int, optional): Number of minutes for the window. Defaults to 1. 55 | 56 | Raises: 57 | ValueError: if the requested operation is not one of "count", "sum" or "avg" 58 | 59 | Returns: 60 | Column: rolling aggregate column expression 61 | """ 62 | if partition_by is None: 63 | partition_by = col 64 | 65 | match operation: 66 | case "count": 67 | return F.count(col).over( 68 | generate_window( 69 | window_in_minutes=window_in_minutes, 70 | partition_by=partition_by, 71 | timestamp_col=timestamp_col, 72 | ) 73 | ) 74 | case "sum": 75 | return F.sum(col).over( 76 | generate_window( 77 | window_in_minutes=window_in_minutes, 78 | partition_by=partition_by, 79 | timestamp_col=timestamp_col, 80 | ) 81 | ) 82 | case "avg": 83 | return F.avg(col).over( 84 | generate_window( 85 | window_in_minutes=window_in_minutes, 86 | partition_by=partition_by, 87 | timestamp_col=timestamp_col, 88 | ) 89 | ) 90 | case _: 91 | raise ValueError(f"Operation {operation} is not defined") 92 | -------------------------------------------------------------------------------- /pyspark/gcs_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | categorical_features: 3 | - proto 4 | - service 5 | - conn_state 6 | - history 7 | filepaths: 8 | - gs://iot-data-demo/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv 9 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv 10 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv 11 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv 12 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv 13 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv 14 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv 15 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv 16 | model_output_path: gs://iot-data-demo/best_pipeline 17 | na_fill_vals: 18 |   conn_state: missing 19 |   duration: -999999 20 |   history: missing 21 |   orig_bytes: -999999 22 |   orig_ip_bytes: -999999 23 |   orig_pkts: -999999 24 |   proto: missing 25 |   resp_bytes: -999999 26 |   resp_ip_bytes: -999999 27 |   resp_pkts: -999999 28 |   service: missing 29 | numerical_features: 30 | - duration 31 | - orig_bytes 32 | - resp_bytes 33 | - missed_bytes 34 | - orig_pkts 35 | - orig_ip_bytes 36 | - resp_pkts 37 | - resp_ip_bytes 38 | random_split: true 39 | tuning_rounds: 0 -------------------------------------------------------------------------------- /pyspark/ml_prep.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import pyspark.sql.functions as F 3 | from
pyspark.sql import DataFrame 4 | from pyspark.sql.types import LongType 5 | 6 | 7 | @F.udf(returnType=LongType()) 8 | def hash_udf(x): 9 | return mmh3.hash64(str(x))[0] 10 | 11 | 12 | def hash_split( 13 | data: DataFrame, col: str, test_size: float = 0.2 14 | ) -> tuple[DataFrame, DataFrame]: 15 | data = data.withColumn("hash", hash_udf(F.col(col))) 16 | 17 | # 80/20 split 18 | train_thr = data.approxQuantile( 19 | "hash", probabilities=[test_size], relativeError=0.01 20 | )[0] 21 | train = data.where(F.col("hash") >= train_thr).drop("hash") 22 | test = data.where(F.col("hash") < train_thr).drop("hash") 23 | 24 | return train, test 25 | 26 | 27 | def ip_based_split( 28 | data: DataFrame, col: str, test_size: float = 0.2 29 | ) -> tuple[DataFrame, DataFrame]: 30 | # Get list of IPs with > 20% malicious activity 31 | bad_ips = ( 32 | data.groupby("source_ip") 33 | .agg(F.avg(F.col("is_bad")).alias("bad_avg")) 34 | .where(F.col("bad_avg") > 0.2) 35 | .select("source_ip") 36 | .toPandas() 37 | .values.ravel() 38 | ) 39 | bad_ips = list(bad_ips) 40 | print(bad_ips) 41 | 42 | data = data.withColumn("ip_hash", hash_udf(F.col("source_ip"))) 43 | 44 | # Split good IPs 45 | good_df = data.where(~F.col("source_ip").isin(bad_ips)) 46 | bad_df = data.where(F.col("source_ip").isin(bad_ips)) 47 | print("Original Sizes") 48 | print("Good", good_df.count()) 49 | print("Bad", bad_df.count()) 50 | 51 | # 80/20 split 52 | good_train, good_test = hash_split(good_df, col, test_size) 53 | print("Good data", good_train.count(), good_test.count()) 54 | bad_train, bad_test = hash_split(bad_df, col, test_size) 55 | print("Bad data", bad_train.count(), bad_test.count()) 56 | 57 | train = good_train.union(bad_train) 58 | test = good_test.union(bad_test) 59 | 60 | return train, test 61 | -------------------------------------------------------------------------------- /pyspark/pipe.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | import yaml 3 | from hyperopt import hp 4 | from pyspark.ml import Pipeline 5 | from pyspark.ml.classification import RandomForestClassifier 6 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 7 | from pyspark.ml.feature import StringIndexer, VectorAssembler 8 | from pyspark.sql import SparkSession 9 | 10 | from cleaning import get_static, remove_rare_categories 11 | from feature_engineering import generate_rolling_aggregate 12 | from ml_prep import ip_based_split 13 | from tuning import tune_rf 14 | 15 | # Read and set configs 16 | with open("gcs_config.yaml", "r") as file: 17 | conf = yaml.safe_load(file) 18 | 19 | numerical_features: list[str] = conf["numerical_features"] 20 | categorical_features: list[str] = conf["categorical_features"] 21 | 22 | spark = SparkSession.builder.appName("LocalTest").getOrCreate() 23 | spark.sparkContext.setLogLevel("WARN") 24 | 25 | # Read in and do some basic processing 26 | df = ( 27 | spark.read.option("delimiter", "|") 28 | .csv(conf["filepaths"], inferSchema=True, header=True) 29 | .withColumns( 30 | { 31 | "is_bad": F.when(F.col("label") != "Benign", 1).otherwise(0), 32 | "dt": F.to_timestamp(F.from_unixtime("ts")), 33 | } 34 | ) 35 | .withColumnsRenamed( 36 | { 37 | "id.orig_h": "source_ip", 38 | "id.orig_p": "source_port", 39 | "id.resp_h": "dest_ip", 40 | "id.resp_p": "dest_port", 41 | } 42 | ) 43 | .withColumns({n: F.col(n).cast("double") for n in numerical_features}) 44 | .replace("-", None) 45 | .fillna(conf["na_fill_vals"]) 46 | ) 47 | 48 | # Find and 
drop static columns 49 | static_numerical = get_static(df, numerical_features) 50 | static_categorical = get_static(df, categorical_features) 51 | numerical_features = [f for f in numerical_features if f not in static_numerical] 52 | categorical_features = [f for f in categorical_features if f not in static_categorical] 53 | categorical_features_indexed = [c + "_ind" for c in categorical_features] 54 | input_features = numerical_features + categorical_features_indexed 55 | 56 | # Process categorical 57 | df = remove_rare_categories( 58 | df.drop(*static_numerical + static_categorical), categorical_features, min_count=100 59 | ) 60 | 61 | # Feature engineering 62 | df = df.withColumns( 63 | { 64 | "source_ip_count_last_min": generate_rolling_aggregate( 65 | col="source_ip", operation="count", timestamp_col="dt", window_in_minutes=1 66 | ), 67 | "source_ip_count_last_30_mins": generate_rolling_aggregate( 68 | col="source_ip", operation="count", timestamp_col="dt", window_in_minutes=30 69 | ), 70 | "source_port_count_last_min": generate_rolling_aggregate( 71 | col="source_port", 72 | operation="count", 73 | timestamp_col="dt", 74 | window_in_minutes=1, 75 | ), 76 | "source_port_count_last_30_mins": generate_rolling_aggregate( 77 | col="source_port", 78 | operation="count", 79 | timestamp_col="dt", 80 | window_in_minutes=30, 81 | ), 82 | "source_ip_avg_pkts_last_min": generate_rolling_aggregate( 83 | col="orig_pkts", 84 | partition_by="source_ip", 85 | operation="avg", 86 | timestamp_col="dt", 87 | window_in_minutes=1, 88 | ), 89 | "source_ip_avg_pkts_last_30_mins": generate_rolling_aggregate( 90 | col="orig_pkts", 91 | partition_by="source_ip", 92 | operation="avg", 93 | timestamp_col="dt", 94 | window_in_minutes=30, 95 | ), 96 | "source_ip_avg_bytes_last_min": generate_rolling_aggregate( 97 | col="orig_ip_bytes", 98 | partition_by="source_ip", 99 | operation="avg", 100 | timestamp_col="dt", 101 | window_in_minutes=1, 102 | ), 103 | "source_ip_avg_bytes_last_30_mins": generate_rolling_aggregate( 104 | col="orig_ip_bytes", 105 | partition_by="source_ip", 106 | operation="avg", 107 | timestamp_col="dt", 108 | window_in_minutes=30, 109 | ), 110 | } 111 | ) 112 | 113 | if conf["random_split"]: 114 | df_train, df_test = df.randomSplit(weights=[0.8, 0.2], seed=200) 115 | else: 116 | df_train, df_test = ip_based_split(df, "source_ip", 0.2) 117 | 118 | roc = BinaryClassificationEvaluator(labelCol="is_bad", metricName="areaUnderROC") 119 | ind = StringIndexer( 120 | inputCols=categorical_features, 121 | outputCols=categorical_features_indexed, 122 | handleInvalid="skip", 123 | ) 124 | va = VectorAssembler( 125 | inputCols=input_features, outputCol="features", handleInvalid="skip" 126 | ) 127 | 128 | if conf["tuning_rounds"] > 0: 129 | df_train, df_val = df_train.randomSplit(weights=[0.8, 0.2], seed=200) 130 | search_space = { 131 | "numTrees": hp.uniformint("numTrees", 10, 500), 132 | "maxDepth": hp.uniformint("maxDepth", 2, 10), 133 | } 134 | print(f"Tuning the model for {conf['tuning_rounds']} rounds") 135 | best_params = tune_rf( 136 | train=df_train, 137 | val=df_val, 138 | string_indexer=ind, 139 | vector_assembler=va, 140 | evaluator=roc, 141 | param_grid=search_space, 142 | tuning_rounds=conf["tuning_rounds"], 143 | ) 144 | else: 145 | print("Skipping the tuning...") 146 | best_params = {"numTrees": 10, "maxDepth": 4} 147 | 148 | best_rf = RandomForestClassifier( 149 | featuresCol="features", 150 | labelCol="is_bad", 151 | numTrees=best_params["numTrees"], 152 | 
maxDepth=best_params["maxDepth"], 153 | ) 154 | 155 | best_pipeline = Pipeline(stages=[ind, va, best_rf]) 156 | best_pipeline = best_pipeline.fit(df_train) 157 | test_preds = best_pipeline.transform(df_test) 158 | 159 | score = roc.evaluate(test_preds) 160 | print("ROC AUC", score) 161 | best_pipeline.save(conf["model_output_path"]) 162 | -------------------------------------------------------------------------------- /pyspark/spark_feature_engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PySpark Project Step-by-Step: Part 2\n", 8 | "\n", 9 | "This notebook will walk you through 2 more steps in the ML lifecycle - **Feature Engineering** and **Model Fitting & Evaluation**.
\n", 10 | "* In the feature engineering part you'll see how to perform common aggregates using analytical functions.\n", 11 | "* In the modelling part you'll see how to prepare your data for modelling in PySpark, and how to fit a model using MLLib.\n", 12 | "* Finally, we'll see how we can evaluate the model we've built." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from pyspark.sql import SparkSession\n", 22 | "from pyspark.sql import Window\n", 23 | "import pyspark.sql.functions as F\n", 24 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", 25 | "from pyspark.ml import Pipeline\n", 26 | "from pyspark.ml.classification import RandomForestClassifier" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "spark = (\n", 36 | " SparkSession.builder.appName(\"iot\")\n", 37 | " .getOrCreate()\n", 38 | ")\n", 39 | "spark.sparkContext.setLogLevel(\"ERROR\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Read Data" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df = spark.read.parquet(\"processed.pq\").withColumn(\n", 56 | " \"is_bad\", F.when(F.col(\"label\") != \"Benign\", 1).otherwise(0)\n", 57 | ")\n", 58 | "df.show(5)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Feature Engineering\n", 66 | "\n", 67 | "Since we have a time-component to this data, we can engineer all sorts of rolling features. The ones that I'll cover here are:\n", 68 | "* Number of times we've seen this source IP in the last minute\n", 69 | "* Number of times we've seen this destination IP in the last minute\n", 70 | "* Number of times we've seen this source PORT in the last minute\n", 71 | "* Number of times we've seen this destination PORT in the last minute\n", 72 | "\n", 73 | "To calculate these features, we'll need to use analytical functions. 
" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "def mins_to_secs(mins):\n", 83 | " return mins * 60\n", 84 | "\n", 85 | "\n", 86 | "def generate_window(window_in_minutes: int, partition_by: str, timestamp_col: str):\n", 87 | " window = (\n", 88 | " Window()\n", 89 | " .partitionBy(F.col(partition_by))\n", 90 | " .orderBy(F.col(timestamp_col).cast(\"long\"))\n", 91 | " .rangeBetween(-mins_to_secs(window_in_minutes), -1)\n", 92 | " )\n", 93 | "\n", 94 | " return window\n", 95 | "\n", 96 | "\n", 97 | "def generate_rolling_aggregate(\n", 98 | " col: str,\n", 99 | " partition_by: str | None = None,\n", 100 | " operation: str = \"count\",\n", 101 | " timestamp_col: str = \"dt\",\n", 102 | " window_in_minutes: int = 1,\n", 103 | "):\n", 104 | " if partition_by is None:\n", 105 | " partition_by = col\n", 106 | "\n", 107 | " match operation:\n", 108 | " case \"count\":\n", 109 | " return F.count(col).over(\n", 110 | " generate_window(\n", 111 | " window_in_minutes=window_in_minutes,\n", 112 | " partition_by=col,\n", 113 | " timestamp_col=timestamp_col,\n", 114 | " )\n", 115 | " )\n", 116 | " case \"sum\":\n", 117 | " return F.sum(col).over(\n", 118 | " generate_window(\n", 119 | " window_in_minutes=window_in_minutes,\n", 120 | " partition_by=col,\n", 121 | " timestamp_col=timestamp_col,\n", 122 | " )\n", 123 | " )\n", 124 | " case \"avg\":\n", 125 | " return F.avg(col).over(\n", 126 | " generate_window(\n", 127 | " window_in_minutes=window_in_minutes,\n", 128 | " partition_by=col,\n", 129 | " timestamp_col=timestamp_col,\n", 130 | " )\n", 131 | " )\n", 132 | " case _:\n", 133 | " raise ValueError(f\"Operation {operation} is not defined\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Generate Rolling Count Features" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Due to the nicely defined functions above, generating rolling averages and counts is a piece of cake!" 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "df = df.withColumns({\n", 157 | " \"source_ip_count_last_min\": generate_rolling_aggregate(col=\"source_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n", 158 | " \"source_ip_count_last_30_mins\": generate_rolling_aggregate(col=\"source_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n", 159 | " \"source_port_count_last_min\": generate_rolling_aggregate(col=\"source_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n", 160 | " \"source_port_count_last_30_mins\": generate_rolling_aggregate(col=\"source_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n", 161 | " \"dest_ip_count_last_min\": generate_rolling_aggregate(col=\"dest_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n", 162 | " \"dest_ip_count_last_30_mins\": generate_rolling_aggregate(col=\"dest_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n", 163 | " \"dest_port_count_last_min\": generate_rolling_aggregate(col=\"dest_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n", 164 | " \"dest_port_count_last_30_mins\": generate_rolling_aggregate(col=\"dest_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n", 165 | " \"source_ip_avg_pkts_last_min\": generate_rolling_aggregate(col=\"orig_pkts\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=1),\n", 166 | " \"source_ip_avg_pkts_last_30_mins\": generate_rolling_aggregate(col=\"orig_pkts\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=30),\n", 167 | " \"source_ip_avg_bytes_last_min\": generate_rolling_aggregate(col=\"orig_ip_bytes\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=1),\n", 168 | " \"source_ip_avg_bytes_last_30_mins\": generate_rolling_aggregate(col=\"orig_ip_bytes\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=30),\n", 169 | "})" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "df.show(5)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Now,execute and save the resulting table into a new parquet file" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "df.write.mode(\"overwrite\").parquet(\"feature_engineered.pq\")" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "df_fe = spark.read.parquet(\"feature_engineered.pq\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "Let's compare the speed of calling the old `df` vs the new `df_fe`..." 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "df_fe.show(10)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Such a drastic difference is because when you call `df.show()` it's going to execute all of the very expensive operations we did. 
Instead, it's better to construct a new dataframe for the analysis." 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Preprocessing" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "df_fe.columns[:5]" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "numerical_features = [\n", 252 | " \"duration\",\n", 253 | " \"orig_bytes\",\n", 254 | " \"resp_bytes\",\n", 255 | " \"orig_pkts\",\n", 256 | " \"orig_ip_bytes\",\n", 257 | " \"resp_pkts\",\n", 258 | " \"resp_ip_bytes\",\n", 259 | " \"source_ip_count_last_min\",\n", 260 | " \"source_ip_count_last_30_mins\",\n", 261 | " \"source_port_count_last_min\",\n", 262 | " \"source_port_count_last_30_mins\",\n", 263 | " # \"dest_ip_count_last_min\",\n", 264 | " # \"dest_ip_count_last_30_mins\",\n", 265 | " # \"dest_port_count_last_min\",\n", 266 | " # \"dest_port_count_last_30_mins\",\n", 267 | " \"source_ip_avg_pkts_last_min\",\n", 268 | " \"source_ip_avg_pkts_last_30_mins\",\n", 269 | " \"source_ip_avg_bytes_last_min\",\n", 270 | " \"source_ip_avg_bytes_last_30_mins\",\n", 271 | "]\n", 272 | "categorical_features = [\"proto\", \"service\", \"conn_state\", \"history\"]\n", 273 | "categorical_features_indexed = [c + \"_index\" for c in categorical_features]\n", 274 | "\n", 275 | "input_features = numerical_features + categorical_features_indexed" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Remove rare categories" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "df_fe.select([F.count_distinct(c) for c in categorical_features]).show()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "categorical_valid_values = {}\n", 301 | "\n", 302 | "for c in categorical_features:\n", 303 | " # Find frequent values\n", 304 | " categorical_valid_values[c] = (\n", 305 | " df_fe.groupby(c)\n", 306 | " .count()\n", 307 | " .filter(F.col(\"count\") > 100)\n", 308 | " .select(c)\n", 309 | " .toPandas()\n", 310 | " .values.ravel()\n", 311 | " )\n", 312 | "\n", 313 | " df_fe = df_fe.withColumn(\n", 314 | " c,\n", 315 | " F.when(F.col(c).isin(list(categorical_valid_values[c])), F.col(c)).otherwise(\n", 316 | " F.lit(\"Other\").alias(c)\n", 317 | " ),\n", 318 | " )" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "df_fe.select([F.count_distinct(c) for c in categorical_features]).show()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "## Train/Test Split\n", 335 | "Train test split will need to be done using the source IP address, otherwise we risk leaking data. The best way to do this is by splitting the IP addresses at random, and then filtering the data frame according to the IP address." 
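By the way, if you want the IP-based split to be reproducible from run to run, hashing the IP instead of sampling it also works: every row from a given IP then always lands on the same side of the split. A minimal sketch along the lines of `ml_prep.hash_split` from this repo (the mmh3 hash and the 20% test fraction are illustrative choices, and `df_train_alt`/`df_test_alt` are placeholder names so they don't clash with the cells below):

import mmh3
from pyspark.sql.types import LongType

@F.udf(returnType=LongType())
def ip_hash(value):
    # Deterministic 64-bit hash of the source IP (mmh3 is already a project dependency)
    return mmh3.hash64(str(value))[0]

hashed = df_fe.withColumn("ip_hash", ip_hash(F.col("source_ip")))

# Rows whose IP hashes below the ~20% quantile go to test, the rest to train;
# the fraction is approximate because the quantile is computed over rows, not distinct IPs
thr = hashed.approxQuantile("ip_hash", probabilities=[0.2], relativeError=0.01)[0]
df_train_alt = hashed.where(F.col("ip_hash") >= thr).drop("ip_hash")
df_test_alt = hashed.where(F.col("ip_hash") < thr).drop("ip_hash")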
336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "df_fe.groupby(\"source_ip\").agg(F.sum(F.col(\"is_bad\")).alias(\"bad_sum\")).orderBy(\"bad_sum\", ascending=False).show(5)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# Training non-malicious IPs (80%)\n", 354 | "train_ips = (\n", 355 | "    df_fe.where(\n", 356 | "        ~F.col(\"source_ip\").isin([\"192.168.100.103\", \"192.168.2.5\", \"192.168.2.1\"])\n", 357 | "    )\n", 358 | "    .select(F.col(\"source_ip\"), F.lit(1).alias(\"is_train\"))\n", 359 | "    .dropDuplicates()\n", 360 | "    .sample(0.8)\n", 361 | ")\n", 362 | "\n", 363 | "\n", 364 | "df_fe = df_fe.join(train_ips, \"source_ip\", \"left\")\n", 365 | "\n", 366 | "# Add malicious IP 192.168.100.103 to training; all unmarked IPs (incl. 192.168.2.5) form the test set\n", 367 | "df_train = df_fe.where((F.col(\"is_train\") == 1) | (F.col(\"source_ip\") == \"192.168.100.103\"))\n", 368 | "df_test = df_fe.where(F.col(\"is_train\").isNull() & (F.col(\"source_ip\") != \"192.168.100.103\"))" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "## Pipeline" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "ind = StringIndexer(inputCols=categorical_features, outputCols=categorical_features_indexed, handleInvalid='skip')\n", 385 | "va = VectorAssembler(inputCols=input_features, outputCol=\"features\", handleInvalid='skip')\n", 386 | "rf = RandomForestClassifier(featuresCol=\"features\", labelCol=\"is_bad\", numTrees=100)\n", 387 | "\n", 388 | "pipeline = Pipeline(stages=[ind, va, rf])" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "## Fit and Predict" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "pipeline = pipeline.fit(df_train)\n", 405 | "test_preds = pipeline.transform(df_test)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "## Evaluate" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 422 | "\n", 423 | "roc = BinaryClassificationEvaluator(labelCol=\"is_bad\", metricName=\"areaUnderROC\")\n", 424 | "print(\"ROC AUC\", roc.evaluate(test_preds))\n", 425 | "\n", 426 | "pr = BinaryClassificationEvaluator(labelCol=\"is_bad\", metricName=\"areaUnderPR\")\n", 427 | "print(\"PR AUC\", pr.evaluate(test_preds))" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "import pandas as pd\n", 437 | "\n", 438 | "pd.DataFrame(\n", 439 | "    {\n", 440 | "        \"importance\": list(pipeline.stages[-1].featureImportances),\n", 441 | "        \"feature\": pipeline.stages[-2].getInputCols(),\n", 442 | "    }\n", 443 | ").sort_values(\"importance\", ascending=False)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "## Export" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "pipeline.stages[-1].save(\"rf_basic\")" 460 | 
] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "pipeline.save(\"pipeline_basic\")" 469 | ] 470 | } 471 | ], 472 | "metadata": { 473 | "kernelspec": { 474 | "display_name": "dev", 475 | "language": "python", 476 | "name": "python3" 477 | }, 478 | "language_info": { 479 | "codemirror_mode": { 480 | "name": "ipython", 481 | "version": 3 482 | }, 483 | "file_extension": ".py", 484 | "mimetype": "text/x-python", 485 | "name": "python", 486 | "nbconvert_exporter": "python", 487 | "pygments_lexer": "ipython3", 488 | "version": "3.10.13" 489 | } 490 | }, 491 | "nbformat": 4, 492 | "nbformat_minor": 2 493 | } 494 | -------------------------------------------------------------------------------- /pyspark/spark_hp_tuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "from pyspark.sql import Window\n", 11 | "import pyspark.sql.functions as F\n", 12 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", 13 | "from pyspark.ml import Pipeline\n", 14 | "from pyspark.ml.classification import RandomForestClassifier\n", 15 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 16 | "\n", 17 | "from hyperopt import fmin, tpe, hp, STATUS_OK, Trials" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Start Session" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "spark = (\n", 34 | " SparkSession.builder.appName(\"iot\")\n", 35 | " .getOrCreate()\n", 36 | ")\n", 37 | "spark.sparkContext.setLogLevel(\"ERROR\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Read data" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "df = spark.read.parquet(\"feature_engineered.pq\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "df.show(5)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "numerical_features = [\n", 72 | " \"duration\",\n", 73 | " \"orig_bytes\",\n", 74 | " \"resp_bytes\",\n", 75 | " \"orig_pkts\",\n", 76 | " \"orig_ip_bytes\",\n", 77 | " \"resp_pkts\",\n", 78 | " \"resp_ip_bytes\",\n", 79 | " \"source_ip_count_last_min\",\n", 80 | " \"source_ip_count_last_30_mins\",\n", 81 | " \"source_port_count_last_min\",\n", 82 | " \"source_port_count_last_30_mins\",\n", 83 | " \"source_ip_avg_pkts_last_min\",\n", 84 | " \"source_ip_avg_pkts_last_30_mins\",\n", 85 | " \"source_ip_avg_bytes_last_min\",\n", 86 | " \"source_ip_avg_bytes_last_30_mins\",\n", 87 | "]\n", 88 | "categorical_features = [\"proto\", \"service\", \"conn_state\", \"history\"]\n", 89 | "categorical_features_indexed = [c + \"_index\" for c in categorical_features]\n", 90 | "\n", 91 | "input_features = numerical_features + categorical_features_indexed" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "categorical_valid_values = {}\n", 101 | "\n", 102 
| "for c in categorical_features:\n", 103 | " # Find frequent values\n", 104 | " categorical_valid_values[c] = (\n", 105 | " df.groupby(c)\n", 106 | " .count()\n", 107 | " .filter(F.col(\"count\") > 100)\n", 108 | " .select(c)\n", 109 | " .toPandas()\n", 110 | " .values.ravel()\n", 111 | " )\n", 112 | "\n", 113 | " df_fe = df.withColumn(\n", 114 | " c,\n", 115 | " F.when(F.col(c).isin(list(categorical_valid_values[c])), F.col(c)).otherwise(\n", 116 | " F.lit(\"Other\").alias(c)\n", 117 | " ),\n", 118 | " )" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "df_train, df_test = df_fe.randomSplit(weights=[0.8, 0.2], seed=42)\n", 128 | "df_train, df_val = df_train.randomSplit(weights=[0.8, 0.2], seed=42)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## HP Tuning" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from tuning import tune_rf" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "search_space = {\n", 154 | " \"numTrees\": hp.uniformint(\"numTrees\", 10, 500),\n", 155 | " \"maxDepth\": hp.uniformint(\"maxDepth\", 2, 10),\n", 156 | "}\n", 157 | "\n", 158 | "roc = BinaryClassificationEvaluator(labelCol=\"is_bad\", metricName=\"areaUnderROC\")\n", 159 | "\n", 160 | "ind = StringIndexer(\n", 161 | " inputCols=categorical_features,\n", 162 | " outputCols=categorical_features_indexed,\n", 163 | " handleInvalid=\"skip\",\n", 164 | ")\n", 165 | "va = VectorAssembler(\n", 166 | " inputCols=input_features, outputCol=\"features\", handleInvalid=\"skip\"\n", 167 | ")\n", 168 | "\n", 169 | "best_params = tune_rf(df_train, df_val, ind, va, roc, search_space)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "best_rf = RandomForestClassifier(\n", 179 | " featuresCol=\"features\",\n", 180 | " labelCol=\"is_bad\",\n", 181 | " numTrees=best_params[\"numTrees\"],\n", 182 | " maxDepth=best_params[\"maxDepth\"],\n", 183 | ")\n", 184 | "\n", 185 | "best_pipeline = Pipeline(stages=[ind, va, best_rf])\n", 186 | "\n", 187 | "best_pipeline = best_pipeline.fit(df_train)\n", 188 | "test_preds = best_pipeline.transform(df_test)\n", 189 | "\n", 190 | "score = roc.evaluate(test_preds)\n", 191 | "score" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "best_pipeline.save(\"best_pipeline\")" 201 | ] 202 | } 203 | ], 204 | "metadata": { 205 | "kernelspec": { 206 | "display_name": "dev", 207 | "language": "python", 208 | "name": "python3" 209 | }, 210 | "language_info": { 211 | "codemirror_mode": { 212 | "name": "ipython", 213 | "version": 3 214 | }, 215 | "file_extension": ".py", 216 | "mimetype": "text/x-python", 217 | "name": "python", 218 | "nbconvert_exporter": "python", 219 | "pygments_lexer": "ipython3", 220 | "version": "3.10.13" 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 2 225 | } 226 | -------------------------------------------------------------------------------- /pyspark/spark_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"metadata": {}, 6 | "source": [ 7 | "# Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql.functions import (\n", 18 | " from_unixtime,\n", 19 | " to_timestamp,\n", 20 | " min,\n", 21 | " max,\n", 22 | " sum,\n", 23 | " avg,\n", 24 | " col,\n", 25 | " countDistinct,\n", 26 | " broadcast,\n", 27 | " date_trunc,\n", 28 | " count,\n", 29 | ")\n", 30 | "from pyspark.sql import Window\n", 31 | "import pyspark.sql.functions as F\n", 32 | "import plotly.express as px" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "# Read Files" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "filepaths = [\"./iot_malware/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv\", \"./iot_malware/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv\"]\n", 49 | "\n", 50 | "\n", 51 | "spark = (\n", 52 | " SparkSession.builder.appName(\"iot\")\n", 53 | " .getOrCreate()\n", 54 | ")\n", 55 | "spark.sparkContext.setLogLevel(\"ERROR\")\n", 56 | "spark.sparkContext.version" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "df = spark.read.option(\"delimiter\", \"|\").csv(filepaths, inferSchema = True, header = True)\n", 66 | "df.show(5)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df.printSchema()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Pre-processing" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df = df.withColumn(\"dt\", from_unixtime(\"ts\")).withColumn(\"dt\", to_timestamp(\"dt\"))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "df = df.withColumnsRenamed(\n", 101 | " {\n", 102 | " \"id.orig_h\": \"source_ip\",\n", 103 | " \"id.orig_p\": \"source_port\",\n", 104 | " \"id.resp_h\": \"dest_ip\",\n", 105 | " \"id.resp_p\": \"dest_port\",\n", 106 | " }\n", 107 | ")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Dataset Quality Checks" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Min, Max datetime" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "df.agg(\n", 131 | " min(\"dt\").alias(\"min_date\"), \n", 132 | " max(\"dt\").alias(\"max_date\")\n", 133 | ").show()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Shape" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "df.count(), len(df.columns)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Static Columns" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "to_analyse = [\n", 166 | " \"source_ip\",\n", 167 | " 
\"source_port\",\n", 168 | " \"dest_ip\",\n", 169 | " \"dest_port\",\n", 170 | " \"proto\",\n", 171 | " \"service\",\n", 172 | " \"duration\",\n", 173 | " \"orig_bytes\",\n", 174 | " \"resp_bytes\",\n", 175 | " \"conn_state\",\n", 176 | " \"local_orig\",\n", 177 | " \"local_resp\",\n", 178 | " \"missed_bytes\",\n", 179 | " \"history\",\n", 180 | " \"orig_pkts\",\n", 181 | " \"orig_ip_bytes\",\n", 182 | " \"resp_pkts\",\n", 183 | " \"resp_ip_bytes\",\n", 184 | " \"tunnel_parents\",\n", 185 | " \"label\",\n", 186 | " \"detailed-label\",\n", 187 | "]\n", 188 | "\n", 189 | "unique_counts = df.agg(*(countDistinct(col(c)).alias(c) for c in to_analyse))\n", 190 | "print(unique_counts.show())" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "unique_counts = unique_counts.first()\n", 200 | "static_cols = [c for c in unique_counts.asDict() if unique_counts[c] == 1]\n", 201 | "print(\"Dataset has\", len(static_cols), \"static columns: \", static_cols)\n", 202 | "df = df.drop(*static_cols)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### Count Distinct Values" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "source_ips = df.select(col(\"source_ip\")).distinct()\n", 219 | "dest_ips = df.select(col(\"dest_ip\")).distinct()\n", 220 | "common_ips = source_ips.join(broadcast(dest_ips), source_ips.source_ip == dest_ips.dest_ip, how='inner')\n", 221 | "\n", 222 | "\n", 223 | "print(\"Source IPs count:\", source_ips.count())\n", 224 | "print(\"Dest IPs count:\", dest_ips.count())\n", 225 | "print(\"IPs as both:\", common_ips.count())" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "source_ports = df.select(col(\"source_port\")).distinct()\n", 235 | "dest_ports = df.select(col(\"dest_port\")).distinct()\n", 236 | "common_ports = source_ports.join(broadcast(dest_ports), source_ports.source_port == dest_ports.dest_port, how='inner')\n", 237 | "\n", 238 | "\n", 239 | "print(\"Source Ports count:\", source_ports.count())\n", 240 | "print(\"Dest Ports count:\", dest_ports.count())\n", 241 | "print(\"Ports as both:\", common_ports.count())" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Count Nulls" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "df = df.replace(\"-\", None)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "remaining_cols = [f for f in to_analyse if f not in static_cols]\n", 267 | "df.select(\n", 268 | " [count(F.when(F.isnan(c) | col(c).isNull(), c)).alias(c) for c in remaining_cols]\n", 269 | ").show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## Time-Series Plots" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "df = df.withColumns(\n", 286 | " {\n", 287 | " \"day\": date_trunc(\"day\", \"dt\"),\n", 288 | " \"hour\": date_trunc(\"hour\", \"dt\"),\n", 289 | " \"minute\": date_trunc(\"minute\", \"dt\"),\n", 290 | " 
\"second\": date_trunc(\"second\", \"dt\"),\n", 291 | " }\n", 292 | ")" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "for agg in ['day', 'hour', 'minute']:\n", 302 | " plotting_table = df.groupBy([agg, \"label\"]).agg(count(\"uid\").alias(\"counts\")).orderBy(agg).toPandas()\n", 303 | " fig = px.line(plotting_table, x=agg, y=\"counts\", color=\"label\", title=f'Event Counts per {agg}')\n", 304 | " fig.show()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Univariate Data Analysis" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "def counts(df, var):\n", 321 | " var_counts = df.groupBy(var).count().orderBy(\"count\", ascending=False)\n", 322 | " var_counts = var_counts.withColumn(\n", 323 | " \"percent\", F.round(col(\"count\") / sum(col(\"count\")).over(Window.partitionBy()), 4)\n", 324 | " )\n", 325 | " var_counts.show()\n", 326 | " fig = px.bar(var_counts.toPandas(), x=var, y=\"count\")\n", 327 | " fig.show()\n", 328 | "\n", 329 | "\n", 330 | "categorical_columns = [\"proto\", \"service\", \"conn_state\", \"history\", \"label\"]\n", 331 | "\n", 332 | "for c in categorical_columns:\n", 333 | " counts(df, c)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## Prepare for Modelling" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "numerical_cols = [\n", 350 | " \"duration\",\n", 351 | " \"orig_bytes\",\n", 352 | " \"resp_bytes\",\n", 353 | " \"orig_pkts\",\n", 354 | " \"orig_ip_bytes\",\n", 355 | " \"resp_pkts\",\n", 356 | " \"resp_ip_bytes\",\n", 357 | "]\n", 358 | "categorical_cols = [\"proto\", \"service\", \"conn_state\"]\n", 359 | "label = \"label\"\n", 360 | "\n", 361 | "all_cols = numerical_cols + categorical_cols" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "recast_cols = {}\n", 371 | "fill_vals = {}\n", 372 | "for c in numerical_cols:\n", 373 | " recast_cols[c] = col(c).cast(\"double\")\n", 374 | " fill_vals[c] = -999999\n", 375 | "\n", 376 | "for c in categorical_cols:\n", 377 | " fill_vals[c] = 'missing'\n", 378 | " \n", 379 | "df = df.withColumns(recast_cols)\n", 380 | "df = df.fillna(fill_vals)\n" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "## Full Pipeline" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "static_cols = [\"local_orig\", \"local_resp\", \"missed_bytes\", \"tunnel_parents\"]\n", 397 | "\n", 398 | "recast_cols = {\n", 399 | " \"duration\": col(\"duration\").cast(\"double\"),\n", 400 | " \"orig_bytes\": col(\"orig_bytes\").cast(\"double\"),\n", 401 | " \"resp_bytes\": col(\"resp_bytes\").cast(\"double\"),\n", 402 | " \"orig_ip_bytes\": col(\"orig_ip_bytes\").cast(\"double\"),\n", 403 | " \"orig_pkts\": col(\"orig_pkts\").cast(\"double\"),\n", 404 | " \"resp_pkts\": col(\"resp_pkts\").cast(\"double\"),\n", 405 | " \"resp_ip_bytes\": col(\"resp_ip_bytes\").cast(\"double\"),\n", 406 | "}\n", 407 | "\n", 408 | "fill_vals = {\n", 409 | " \"duration\": -999999,\n", 410 | " \"orig_bytes\": 
-999999,\n", 411 | " \"resp_bytes\": -999999,\n", 412 | " \"orig_pkts\": -999999,\n", 413 | " \"orig_ip_bytes\": -999999,\n", 414 | " \"resp_pkts\": -999999,\n", 415 | " \"resp_ip_bytes\": -999999,\n", 416 | " \"history\": \"missing\",\n", 417 | " \"proto\": \"missing\",\n", 418 | " \"service\": \"missing\",\n", 419 | " \"conn_state\": \"missing\",\n", 420 | "}\n", 421 | "\n", 422 | "preprocessed_data = (\n", 423 | " spark.read.option(\"delimiter\", \"|\")\n", 424 | " .csv(filepaths, inferSchema=True, header=True)\n", 425 | " .withColumn(\"dt\", to_timestamp(from_unixtime(\"ts\")))\n", 426 | " .withColumns(\n", 427 | " {\n", 428 | " \"day\": date_trunc(\"day\", \"dt\"),\n", 429 | " \"hour\": date_trunc(\"hour\", \"dt\"),\n", 430 | " \"minute\": date_trunc(\"minute\", \"dt\"),\n", 431 | " \"second\": date_trunc(\"second\", \"dt\"),\n", 432 | " }\n", 433 | " )\n", 434 | " .withColumnsRenamed(\n", 435 | " {\n", 436 | " \"id.orig_h\": \"source_ip\",\n", 437 | " \"id.orig_p\": \"source_port\",\n", 438 | " \"id.resp_h\": \"dest_ip\",\n", 439 | " \"id.resp_p\": \"dest_port\",\n", 440 | " }\n", 441 | " )\n", 442 | " .drop(*static_cols)\n", 443 | " .replace(\"-\", None)\n", 444 | " .withColumns(recast_cols)\n", 445 | " .fillna(fill_vals)\n", 446 | ")\n", 447 | "\n", 448 | "preprocessed_data.show()" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## Write Out" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "preprocessed_data.writeparquet(\"processed.pq\")" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "read_in = spark.read.parquet(\"processed.pq\")\n", 474 | "read_in.show()" 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": "dev", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | "mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.10.13" 495 | } 496 | }, 497 | "nbformat": 4, 498 | "nbformat_minor": 2 499 | } 500 | -------------------------------------------------------------------------------- /pyspark/tuning.py: -------------------------------------------------------------------------------- 1 | from hyperopt import STATUS_OK, Trials, fmin, tpe 2 | from hyperopt.pyll.base import Apply 3 | from pyspark.ml import Pipeline 4 | from pyspark.ml.classification import RandomForestClassifier 5 | from pyspark.ml.evaluation import Evaluator 6 | from pyspark.ml.feature import StringIndexer, VectorAssembler 7 | from pyspark.sql import DataFrame 8 | 9 | 10 | def tune_rf( 11 | train: DataFrame, 12 | val: DataFrame, 13 | string_indexer: StringIndexer, 14 | vector_assembler: VectorAssembler, 15 | evaluator: Evaluator, 16 | param_grid: dict[str, Apply], 17 | tuning_rounds: int = 10, 18 | ): 19 | def objective(params): 20 | rf = RandomForestClassifier( 21 | featuresCol="features", 22 | labelCol="is_bad", 23 | numTrees=params["numTrees"], 24 | maxDepth=params["maxDepth"], 25 | ) 26 | 27 | pipeline = Pipeline(stages=[string_indexer, vector_assembler, rf]) 28 | 29 | pipeline = pipeline.fit(train) 30 | val_df = pipeline.transform(val) 31 | 32 | score = evaluator.evaluate(val_df) 33 
| return {"loss": -score, "status": STATUS_OK} 34 | 35 | rf_trials = Trials() 36 | 37 | argmin = fmin( 38 | fn=objective, 39 | space=param_grid, 40 | algo=tpe.suggest, 41 | max_evals=tuning_rounds, 42 | trials=rf_trials, 43 | ) 44 | 45 | return argmin 46 | -------------------------------------------------------------------------------- /tfdf/notebooks/data_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "2023-03-17 17:18:41.007743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", 13 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import plotly.express as px" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "data = pd.read_csv(\"SBAnational.csv\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def get_frequent(x, thr=0.005):\n", 47 | " count_norm = x.value_counts(normalize=True)\n", 48 | " frequent = count_norm[count_norm >= thr]\n", 49 | " return frequent.index\n", 50 | "\n", 51 | "\n", 52 | "def plot_numeric_boxplots(data, target, feature):\n", 53 | " fig = px.box(\n", 54 | " data,\n", 55 | " x=target,\n", 56 | " y=feature,\n", 57 | " )\n", 58 | " fig.show()\n", 59 | "\n", 60 | "\n", 61 | "def plot_category_props(data, x, target):\n", 62 | " prop = data.groupby(x)[target].mean()\n", 63 | " fig = px.bar(x=prop.index, y=prop.values, labels={\"x\": x, \"y\": target})\n", 64 | " fig.show()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "data['is_default'] = ~data['ChgOffDate'].isna()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Feature Cleaning" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "frequent_city = get_frequent(data['City'])\n", 90 | "data['City'] = data['City'].apply(lambda x: x if x in frequent_city else 'Other')\n", 91 | "\n", 92 | "frequent_banks = get_frequent(data['Bank'])\n", 93 | "data['Bank'] = data['Bank'].apply(lambda x: x if x in frequent_banks else 'Other')\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "frequent_fr_code = get_frequent(data[\"FranchiseCode\"].astype(str))\n", 103 | "data[\"FranchiseCode\"] = data[\"FranchiseCode\"].apply(\n", 104 | " lambda x: str(x) if str(x) in frequent_fr_code else \"Other\"\n", 105 | ")\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "data['RevLineCr'] = 
data['RevLineCr'].apply(lambda x: x if x in (\"Y\", 'N') else 'Other')\n", 115 | "data['LowDoc'] = data['LowDoc'].apply(lambda x: x if x in (\"Y\", 'N') else 'Other')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "data['GrAppv'] = data['GrAppv'].apply(lambda x: float(x.replace('$', '').replace('.', '').replace(',', '')))\n", 125 | "data['SBA_Appv'] = data['SBA_Appv'].apply(lambda x: float(x.replace('$', '').replace('.', '').replace(',', '')))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 9, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "1.0 644869\n", 137 | "2.0 253125\n", 138 | "0.0 1034\n", 139 | "Name: NewExist, dtype: int64" 140 | ] 141 | }, 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "data['NewExist'].value_counts()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "data['is_new'] = data['NewExist'].apply(lambda x: x == 2)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Feature Engineering" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "data['same_state'] = data['State'] == data['BankState']" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 12, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "import pgeocode\n", 183 | "\n", 184 | "zip_codes = data['Zip'].astype(str).unique()\n", 185 | "nomi = pgeocode.Nominatim('us')\n", 186 | "zip_aug = nomi.query_postal_code(zip_codes)\n", 187 | "\n", 188 | "zip_long_map = dict(zip(zip_aug['postal_code'].values, zip_aug['longitude'].values))\n", 189 | "zip_lat_map = dict(zip(zip_aug['postal_code'].values, zip_aug['latitude'].values))\n", 190 | "\n", 191 | "data['longitude'] = data['Zip'].astype(str).map(zip_long_map)\n", 192 | "data['latitude'] = data['Zip'].astype(str).map(zip_lat_map)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## Feature Selection" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 13, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "NUMERIC_FEATURES = [\n", 209 | " \"Term\",\n", 210 | " \"NoEmp\",\n", 211 | " \"CreateJob\",\n", 212 | " \"RetainedJob\",\n", 213 | " \"longitude\",\n", 214 | " \"latitude\",\n", 215 | " \"GrAppv\",\n", 216 | " \"SBA_Appv\",\n", 217 | "]\n", 218 | "\n", 219 | "CATEGORICAL_FEATURES = [\n", 220 | " \"is_new\",\n", 221 | " \"FranchiseCode\",\n", 222 | " \"UrbanRural\",\n", 223 | " \"City\",\n", 224 | " \"State\",\n", 225 | " \"Bank\",\n", 226 | " \"BankState\",\n", 227 | " \"RevLineCr\",\n", 228 | " \"naics_first_two\",\n", 229 | " \"same_state\",\n", 230 | "]\n", 231 | "\n", 232 | "TARGET = \"is_default\"\n" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 14, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "clean_data = data[['ApprovalFY'] + NUMERIC_FEATURES + CATEGORICAL_FEATURES + [TARGET]]" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 15, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "clean_data = clean_data[clean_data['ApprovalFY'] != 
'1976A']\n", 251 | "clean_data['ApprovalFY'] = clean_data['ApprovalFY'].astype(int)\n", 252 | "clean_data.to_parquet(\"loan_data_clean.parquet\")" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## Data Split" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 16, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "test_thr = np.quantile(clean_data['ApprovalFY'], 0.90)\n", 269 | "train_data = clean_data[clean_data['ApprovalFY'] <= test_thr]\n", 270 | "test_data = clean_data[clean_data['ApprovalFY'] > test_thr]" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 17, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "val_thr = np.quantile(train_data['ApprovalFY'], 0.90)\n", 280 | "val_data = train_data[train_data['ApprovalFY'] > val_thr]\n", 281 | "train_data = train_data[train_data['ApprovalFY'] <= val_thr]" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 104, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "((802301, 20), (39540, 20), (57305, 20))" 293 | ] 294 | }, 295 | "execution_count": 104, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "train_data.shape, val_data.shape, test_data.shape" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 19, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "train_data.to_parquet('train_data.parquet', index=False)\n", 311 | "val_data.to_parquet('val_data.parquet', index=False)\n", 312 | "test_data.to_parquet('test_data.parquet', index=False)" 313 | ] 314 | }, 315 | { 316 | "attachments": {}, 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "[data split](#)" 321 | ] 322 | } 323 | ], 324 | "metadata": { 325 | "interpreter": { 326 | "hash": "a2df742b932880654a3f6652148a9c802dc0dfad475f6beda4797814052023f2" 327 | }, 328 | "kernelspec": { 329 | "display_name": "Python 3.9.13", 330 | "language": "python", 331 | "name": "python3" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.10.9" 344 | }, 345 | "orig_nbformat": 4 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 2 349 | } 350 | -------------------------------------------------------------------------------- /tfdf/notebooks/plot.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 426 | --------------------------------------------------------------------------------
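A note on "naics_first_two" in tfdf/notebooks/data_preprocessing.ipynb: the column is listed in CATEGORICAL_FEATURES, but none of the cells shown in that notebook derive it, so the feature-selection cell would fail with a KeyError on a fresh run. Below is a minimal sketch of how it could be built; the "NAICS" column name and the use of "0" as a missing industry code are assumptions about SBAnational.csv, not something taken from the notebook.

    import pandas as pd

    data = pd.read_csv("SBAnational.csv", low_memory=False)

    # The first two NAICS digits identify the industry sector; "0" (assumed to
    # mark a missing code) is folded into the same "Other" bucket used for the
    # other rare categories in the notebook.
    naics_str = data["NAICS"].astype(str)
    data["naics_first_two"] = naics_str.str[:2].where(naics_str != "0", "Other")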
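A note on the GrAppv / SBA_Appv parsing in the same notebook: stripping the decimal point along with "$" and "," turns a value formatted as "$50,000.00 " into 5000000.0, a constant 100x scaling for two-decimal amounts. That is harmless for tree-based models but easy to misread. A hedged alternative that keeps dollar units, assuming the usual "$1,234.00 " formatting of the raw file:

    def parse_dollar(value: str) -> float:
        """Convert a string like '$50,000.00 ' into 50000.0 (dollars, cents kept)."""
        return float(value.replace("$", "").replace(",", "").strip())

    assert parse_dollar("$50,000.00 ") == 50000.0
    # In the notebook this could replace the lambdas, e.g.:
    # data["GrAppv"] = data["GrAppv"].apply(parse_dollar)
    # data["SBA_Appv"] = data["SBA_Appv"].apply(parse_dollar)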