├── .gitignore
├── LICENSE
├── README.md
├── api_anomaly_project
│   ├── .gitignore
│   ├── Dockerfile
│   ├── app
│   │   ├── app.py
│   │   ├── measure_response.py
│   │   └── ping.py
│   ├── assets
│   │   ├── header.png
│   │   ├── roc_pr_curves.png
│   │   ├── shap.png
│   │   └── thresholds.png
│   ├── data
│   │   ├── supervised_clean_data.parquet
│   │   └── supervised_clean_data_w_features.parquet
│   ├── models
│   │   └── hgbt_final.joblib
│   ├── notebooks
│   │   ├── cleaning.ipynb
│   │   ├── eda.ipynb
│   │   ├── feature_engineering.ipynb
│   │   ├── htmls
│   │   │   ├── eda.html
│   │   │   ├── feature_engineering.html
│   │   │   └── modelling.html
│   │   └── modelling.ipynb
│   ├── readme.md
│   ├── requirements.txt
│   └── utils
│       ├── __init__.py
│       ├── cleaning.py
│       ├── feature_engineering.py
│       ├── ml.py
│       └── visualisations.py
├── deployment
│   ├── fastapi
│   │   ├── Dockerfile
│   │   ├── app.py
│   │   ├── app_test.py
│   │   ├── loan_catboost_model.cbm
│   │   ├── measure_response.py
│   │   └── requirements.txt
│   └── flask
│       ├── Dockerfile
│       ├── app.py
│       ├── app_test.py
│       ├── loan_catboost_model.cbm
│       ├── measure_response.py
│       └── requirements.txt
├── hp_tuning
│   ├── bv_tradeoff.png
│   └── hp_tuning_rf_gbt.ipynb
├── metaflow
│   └── fraud_email
│       ├── email_eda.ipynb
│       ├── fradulent_emails.txt
│       ├── readme.md
│       ├── requirements.txt
│       └── utils
│           ├── __init__.py
│           ├── feature_generation.py
│           ├── plots.py
│           ├── preprocess.py
│           └── read_data.py
├── mlflow
│   ├── mlflow_experiment_tracking.ipynb
│   └── old_notebook.ipynb
├── mlflow_models
│   ├── .DS_Store
│   ├── MLProject
│   ├── model_search.ipynb
│   ├── python_env.yaml
│   ├── search_params.py
│   ├── train_hgbt.py
│   ├── train_rf.py
│   └── utils
│       ├── __init__.py
│       ├── columns.py
│       ├── data_utils.py
│       └── eval_utils.py
├── mlflow_project
│   ├── MLproject
│   ├── conda_env.yaml
│   ├── main.py
│   └── steps
│       ├── __init__.py
│       ├── download_data.py
│       ├── preprocess_data.py
│       ├── train_final_model.py
│       └── tune_model.py
├── polars
│   ├── basics.ipynb
│   ├── data_preparation_pipeline.py
│   ├── data_utils
│   │   ├── __init__.py
│   │   ├── feature_engineering.py
│   │   ├── processing.py
│   │   └── transfomation.py
│   ├── model.ipynb
│   ├── pipe_config.yaml
│   └── time_analysis.ipynb
├── pyspark
│   ├── cleaning.py
│   ├── conda_env.yaml
│   ├── config.yaml
│   ├── feature_engineering.py
│   ├── gcs_config.yaml
│   ├── ml_prep.py
│   ├── pipe.py
│   ├── spark_feature_engineering.ipynb
│   ├── spark_hp_tuning.ipynb
│   ├── spark_intro.ipynb
│   └── tuning.py
└── tfdf
    └── notebooks
        ├── data_preprocessing.ipynb
        ├── model_training.ipynb
        └── plot.html
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
133 | # Files
134 | *.csv
135 | *.json
136 | *.pq
137 | *.parquet
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tutorials
2 | Notebooks/scripts for YouTube tutorials
3 |
--------------------------------------------------------------------------------
/api_anomaly_project/.gitignore:
--------------------------------------------------------------------------------
1 | ## The .gitignore file specifies things that git should ignore.
2 | ## This default template includes entries for R, Python and Visual Studio
3 |
4 | ##
5 | ## Add custom entries below here.
6 | ##
7 | dst-env/
8 | .cache/v/cache/lastfailed
9 | tests/.cache/v/cache/lastfailed
10 | .vscode/settings.json
11 | .DS_Store
12 | *.db
13 | mlruns
14 |
15 | # datasets
16 | *.csv
17 | *.json
18 |
19 | ## Python Section - See https://github.com/github/gitignore/blob/master/Python.gitignore
20 | ##
21 |
22 | # Byte-compiled / optimized / DLL files
23 | __pycache__/
24 | *.py[cod]
25 | *$py.class
26 |
27 | # C extensions
28 | *.so
29 |
30 | # Distribution / packaging
31 | .Python
32 | env/
33 | build/
34 | develop-eggs/
35 | dist/
36 | downloads/
37 | eggs/
38 | .eggs/
39 | lib/
40 | lib64/
41 | parts/
42 | sdist/
43 | var/
44 | wheels/
45 | *.egg-info/
46 | .installed.cfg
47 | *.egg
48 |
49 | # PyInstaller
50 | # Usually these files are written by a python script from a template
51 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
52 | *.manifest
53 | *.spec
54 |
55 | # Installer logs
56 | pip-log.txt
57 | pip-delete-this-directory.txt
58 |
59 | # Unit test / coverage reports
60 | htmlcov/
61 | .tox/
62 | .coverage
63 | .coverage.*
64 | .cache
65 | nosetests.xml
66 | coverage.xml
67 | *.cover
68 | .hypothesis/
69 |
70 | # Translations
71 | *.mo
72 | *.pot
73 |
74 | # Django stuff:
75 | *.log
76 | local_settings.py
77 |
78 | # Flask stuff:
79 | instance/
80 | .webassets-cache
81 |
82 | # Scrapy stuff:
83 | .scrapy
84 |
85 | # Sphinx documentation
86 | docs/_build/
87 |
88 | # PyBuilder
89 | target/
90 |
91 | # Jupyter Notebook
92 | .ipynb_checkpoints
93 |
94 | # pyenv
95 | .python-version
96 |
97 | # celery beat schedule file
98 | celerybeat-schedule
99 |
100 | # SageMath parsed files
101 | *.sage.py
102 |
103 | # dotenv
104 | .env
105 |
106 | # virtualenv
107 | .venv
108 | venv/
109 | ENV/
110 |
111 | # Spyder project settings
112 | .spyderproject
113 | .spyproject
114 |
115 | # Rope project settings
116 | .ropeproject
117 |
118 | # mkdocs documentation
119 | /site
120 |
121 | # mypy
122 | .mypy_cache/
123 |
124 | ## Ignore Visual Studio temporary files, build results, and
125 | ## files generated by popular Visual Studio add-ons.
126 | ##
127 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
128 |
129 | # User-specific files
130 | *.suo
131 | *.user
132 | *.userosscache
133 | *.sln.docstates
134 |
135 | # User-specific files (MonoDevelop/Xamarin Studio)
136 | *.userprefs
137 |
138 | # Build results
139 | [Dd]ebug/
140 | [Dd]ebugPublic/
141 | [Rr]elease/
142 | [Rr]eleases/
143 | x64/
144 | x86/
145 | bld/
146 | [Bb]in/
147 | [Oo]bj/
148 | [Ll]og/
149 |
150 | # Visual Studio 2015 cache/options directory
151 | .vs/
152 | # Uncomment if you have tasks that create the project's static files in wwwroot
153 | #wwwroot/
154 |
155 | # MSTest test Results
156 | [Tt]est[Rr]esult*/
157 | [Bb]uild[Ll]og.*
158 |
159 | # NUNIT
160 | *.VisualState.xml
161 | TestResult.xml
162 |
163 | # Build Results of an ATL Project
164 | [Dd]ebugPS/
165 | [Rr]eleasePS/
166 | dlldata.c
167 |
168 | # Benchmark Results
169 | BenchmarkDotNet.Artifacts/
170 |
171 | # .NET Core
172 | project.lock.json
173 | project.fragment.lock.json
174 | artifacts/
175 | **/Properties/launchSettings.json
176 |
177 | *_i.c
178 | *_p.c
179 | *_i.h
180 | *.ilk
181 | *.meta
182 | *.obj
183 | *.pch
184 | *.pdb
185 | *.pgc
186 | *.pgd
187 | *.rsp
188 | *.sbr
189 | *.tlb
190 | *.tli
191 | *.tlh
192 | *.tmp
193 | *.tmp_proj
194 | *.log
195 | *.vspscc
196 | *.vssscc
197 | .builds
198 | *.pidb
199 | *.svclog
200 | *.scc
201 |
202 | # Chutzpah Test files
203 | _Chutzpah*
204 |
205 | # Visual C++ cache files
206 | ipch/
207 | *.aps
208 | *.ncb
209 | *.opendb
210 | *.opensdf
211 | *.sdf
212 | *.cachefile
213 | *.VC.db
214 | *.VC.VC.opendb
215 |
216 | # Visual Studio profiler
217 | *.psess
218 | *.vsp
219 | *.vspx
220 | *.sap
221 |
222 | # Visual Studio Trace Files
223 | *.e2e
224 |
225 | # TFS 2012 Local Workspace
226 | $tf/
227 |
228 | # Guidance Automation Toolkit
229 | *.gpState
230 |
231 | # ReSharper is a .NET coding add-in
232 | _ReSharper*/
233 | *.[Rr]e[Ss]harper
234 | *.DotSettings.user
235 |
236 | # JustCode is a .NET coding add-in
237 | .JustCode
238 |
239 | # TeamCity is a build add-in
240 | _TeamCity*
241 |
242 | # DotCover is a Code Coverage Tool
243 | *.dotCover
244 |
245 | # AxoCover is a Code Coverage Tool
246 | .axoCover/*
247 | !.axoCover/settings.json
248 |
249 | # Visual Studio code coverage results
250 | *.coverage
251 | *.coveragexml
252 |
253 | # NCrunch
254 | _NCrunch_*
255 | .*crunch*.local.xml
256 | nCrunchTemp_*
257 |
258 | # MightyMoose
259 | *.mm.*
260 | AutoTest.Net/
261 |
262 | # Web workbench (sass)
263 | .sass-cache/
264 |
265 | # Installshield output folder
266 | [Ee]xpress/
267 |
268 | # DocProject is a documentation generator add-in
269 | DocProject/buildhelp/
270 | DocProject/Help/*.HxT
271 | DocProject/Help/*.HxC
272 | DocProject/Help/*.hhc
273 | DocProject/Help/*.hhk
274 | DocProject/Help/*.hhp
275 | DocProject/Help/Html2
276 | DocProject/Help/html
277 |
278 | # Click-Once directory
279 | publish/
280 |
281 | # Publish Web Output
282 | *.[Pp]ublish.xml
283 | *.azurePubxml
284 | # Note: Comment the next line if you want to checkin your web deploy settings,
285 | # but database connection strings (with potential passwords) will be unencrypted
286 | *.pubxml
287 | *.publishproj
288 |
289 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
290 | # checkin your Azure Web App publish settings, but sensitive information contained
291 | # in these scripts will be unencrypted
292 | PublishScripts/
293 |
294 | # NuGet Packages
295 | *.nupkg
296 | # The packages folder can be ignored because of Package Restore
297 | **/[Pp]ackages/*
298 | # except build/, which is used as an MSBuild target.
299 | !**/[Pp]ackages/build/
300 | # Uncomment if necessary however generally it will be regenerated when needed
301 | #!**/[Pp]ackages/repositories.config
302 | # NuGet v3's project.json files produces more ignorable files
303 | *.nuget.props
304 | *.nuget.targets
305 |
306 | # Microsoft Azure Build Output
307 | csx/
308 | *.build.csdef
309 |
310 | # Microsoft Azure Emulator
311 | ecf/
312 | rcf/
313 |
314 | # Windows Store app package directories and files
315 | AppPackages/
316 | BundleArtifacts/
317 | Package.StoreAssociation.xml
318 | _pkginfo.txt
319 | *.appx
320 |
321 | # Visual Studio cache files
322 | # files ending in .cache can be ignored
323 | *.[Cc]ache
324 | # but keep track of directories ending in .cache
325 | !*.[Cc]ache/
326 |
327 | # Others
328 | ClientBin/
329 | ~$*
330 | *~
331 | *.dbmdl
332 | *.dbproj.schemaview
333 | *.jfm
334 | *.pfx
335 | *.publishsettings
336 | orleans.codegen.cs
337 |
338 | # Since there are multiple workflows, uncomment next line to ignore bower_components
339 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
340 | #bower_components/
341 |
342 | # RIA/Silverlight projects
343 | Generated_Code/
344 |
345 | # Backup & report files from converting an old project file
346 | # to a newer Visual Studio version. Backup files are not needed,
347 | # because we have git ;-)
348 | _UpgradeReport_Files/
349 | Backup*/
350 | UpgradeLog*.XML
351 | UpgradeLog*.htm
352 |
353 | # SQL Server files
354 | *.mdf
355 | *.ldf
356 | *.ndf
357 |
358 | # Business Intelligence projects
359 | *.rdl.data
360 | *.bim.layout
361 | *.bim_*.settings
362 |
363 | # Microsoft Fakes
364 | FakesAssemblies/
365 |
366 | # GhostDoc plugin setting file
367 | *.GhostDoc.xml
368 |
369 | # Node.js Tools for Visual Studio
370 | .ntvs_analysis.dat
371 | node_modules/
372 |
373 | # Typescript v1 declaration files
374 | typings/
375 |
376 | # Visual Studio 6 build log
377 | *.plg
378 |
379 | # Visual Studio 6 workspace options file
380 | *.opt
381 |
382 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
383 | *.vbw
384 |
385 | # Visual Studio LightSwitch build output
386 | **/*.HTMLClient/GeneratedArtifacts
387 | **/*.DesktopClient/GeneratedArtifacts
388 | **/*.DesktopClient/ModelManifest.xml
389 | **/*.Server/GeneratedArtifacts
390 | **/*.Server/ModelManifest.xml
391 | _Pvt_Extensions
392 |
393 | # Paket dependency manager
394 | .paket/paket.exe
395 | paket-files/
396 |
397 | # FAKE - F# Make
398 | .fake/
399 |
400 | # JetBrains Rider
401 | .idea/
402 | *.sln.iml
403 |
404 | # CodeRush
405 | .cr/
406 |
407 | # Python Tools for Visual Studio (PTVS)
408 | __pycache__/
409 | *.pyc
410 |
411 | # Cake - Uncomment if you are using it
412 | # tools/**
413 | # !tools/packages.config
414 |
415 | # Tabs Studio
416 | *.tss
417 |
418 | # Telerik's JustMock configuration file
419 | *.jmconfig
420 |
421 | # BizTalk build output
422 | *.btp.cs
423 | *.btm.cs
424 | *.odx.cs
425 | *.xsd.cs
426 |
427 | # OpenCover UI analysis results
428 | OpenCover/
429 | junit/
--------------------------------------------------------------------------------
/api_anomaly_project/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | # Start from a base image
3 | FROM python:3.11-slim
4 |
5 | # Set the working directory
6 | WORKDIR /app
7 |
8 | # Copy the requirements file into the container
9 | COPY requirements.txt requirements.txt
10 |
11 | # Install the required packages
12 | RUN pip install --upgrade pip
13 | RUN pip install -r requirements.txt
14 |
15 | # Copy the application code into the container
16 | # YOUR FILES HERE
17 |
18 | # Expose the app port
19 | EXPOSE 80
20 |
21 | # Run command
22 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
--------------------------------------------------------------------------------
/api_anomaly_project/app/app.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/app/app.py
--------------------------------------------------------------------------------
/api_anomaly_project/app/measure_response.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/app/measure_response.py
--------------------------------------------------------------------------------
/api_anomaly_project/app/ping.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/app/ping.py
--------------------------------------------------------------------------------
/api_anomaly_project/assets/header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/header.png
--------------------------------------------------------------------------------
/api_anomaly_project/assets/roc_pr_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/roc_pr_curves.png
--------------------------------------------------------------------------------
/api_anomaly_project/assets/shap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/shap.png
--------------------------------------------------------------------------------
/api_anomaly_project/assets/thresholds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/assets/thresholds.png
--------------------------------------------------------------------------------
/api_anomaly_project/data/supervised_clean_data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/data/supervised_clean_data.parquet
--------------------------------------------------------------------------------
/api_anomaly_project/data/supervised_clean_data_w_features.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/data/supervised_clean_data_w_features.parquet
--------------------------------------------------------------------------------
/api_anomaly_project/models/hgbt_final.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/models/hgbt_final.joblib
--------------------------------------------------------------------------------
/api_anomaly_project/readme.md:
--------------------------------------------------------------------------------
1 | # API Security: Anomaly Detection App
2 |
3 | > [!WARNING]
4 | > All the metrics, plots, and insights are made up and taken from the internet
5 |
6 | 
7 |
8 | ## Dataset
9 |
10 | The dataset for this project can be found on [Kaggle](https://www.kaggle.com/datasets/tangodelta/api-access-behaviour-anomaly-dataset/data) (licensed under GPL-2).
11 |
12 | Distributed, microservices-based applications are typically accessed via APIs. The authors of this dataset have collected sequences of API calls from an application and put them into a graph format. For this graph, they've generated common API access patterns (i.e. sequences of API calls) and have calculated user access metrics that can be used to classify these behaviours. They've also manually labelled a set of these behaviour patterns (our training set) and have provided the remaining sequences for us to classify.
13 |
14 | ## Objectives
15 |
16 | The main objective of this project is:
17 |
18 | > **To develop a system that will be able to detect anomalous behaviour from the API calls for the remaining sequences**
19 |
20 | To achieve this objective, it was further broken down into the following 5 technical sub-objectives:
21 |
22 | 1. To perform in-depth exploratory data analysis of both datasets (tabular and graph)
23 | 2. To engineer new predictive features from the available graphs
24 | 3. To develop a supervised model to classify behaviour into normal and anomalous
25 | 4. To recommend a threshold that will perform better than the present baseline (ALGO-X) in terms of F1 score
26 | 5. To create an API endpoint for the trained model and deploy it
27 |
28 | ## Main Insights
29 |
30 | From the exploratory data analysis, we found that anomalous behaviour patterns are characterised by:
31 |
32 | * Insight about anomaly vs normal #1
33 | * Insight about anomaly vs normal #2
34 | * Insight about anomaly vs normal #3
35 |
36 | ## Engineered Features
37 |
38 | From the provided networks, the following features were extracted:
39 |
40 | * Feature 1 - this feature helps us to measure *X* activity and is expected to be much higher in anomalies/normal behaviour
41 | * Feature 2 - this feature helps us to measure *X* activity and is expected to be much higher in anomalies/normal behaviour
42 | * Feature 3 - this feature helps us to measure *X* activity and is expected to be much higher in anomalies/normal behaviour
43 |
44 | As a result of this feature engineering work, the ROC AUC of the final model increased by 30%, and the F1 score uplift over the baseline model improved from 1.5 to 1.8.
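    |
    | A minimal usage sketch of the graph-feature pipeline implemented in `utils/feature_engineering.py` (the `_id`, `from`, `to` and node-degree column names follow that function's expectations; the input file and its exact schema are assumptions):
    |
    | ```python
    | import polars as pl
    |
    | from utils.feature_engineering import get_graph_features
    |
    | # Edge-level data: one row per API call with a graph id and node-degree columns
    | edges = pl.read_parquet("data/supervised_clean_data.parquet")
    |
    | # One row of aggregated graph/node features per `_id`
    | graph_features = get_graph_features(edges, node_features=True)
    | print(graph_features.head())
    | ```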
45 |
46 | ## Model Selection
47 |
48 | Models were compared using ROC AUC, since we're dealing with a binary classification task and the label distribution is relatively balanced.
49 | Two models (XGBoost and LightGBM) were tuned for 50 iterations each. The best-performing model is LightGBM with the following parameters:
50 |
51 | ```json
52 | {
53 |   "colsample_by_tree": 0.2,
54 |   "num_trees": 2454,
55 |   "learning_rate": 0.02,
56 |   "subsample": 0.5
57 | }
58 | ```
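    |
    | A minimal sketch of refitting LightGBM with these values (assuming the `lightgbm` scikit-learn API; `colsample_by_tree`/`num_trees` map to `colsample_bytree`/`n_estimators`, and the label column name is an assumption):
    |
    | ```python
    | import pandas as pd
    | from lightgbm import LGBMClassifier
    | from sklearn.model_selection import train_test_split
    |
    | # Engineered-feature table from the repo; the "label" column name is illustrative
    | data = pd.read_parquet("data/supervised_clean_data_w_features.parquet")
    | X, y = data.drop(columns=["label"]), data["label"]
    | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    |
    | # Best hyperparameters reported above, mapped to sklearn-style argument names
    | model = LGBMClassifier(colsample_bytree=0.2, n_estimators=2454, learning_rate=0.02, subsample=0.5)
    | model.fit(X_train, y_train)
    | anomaly_proba = model.predict_proba(X_test)[:, 1]
    | ```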
59 |
60 | 
61 |
62 | LightGBM has outperformed XGBoost by *X%* in terms of ROC AUC. From the PR curves, we can also see that it gives us a higher level of recall at the same precision across most thresholds, so this model is selected for deployment.
63 |
64 | ### Model Explainability
65 |
66 | 
67 |
68 | The selected model has a well-balanced feature importance distribution, with the top 3 features being *X, Y, and Z*. The directions of the SHAP values are intuitive, since we expect anomalies to have a larger rate of *X* and *Y* and a smaller number of *Z*.
69 | Notably, the engineered features are also considered important (4th, 5th and 7th place), which means that the feature engineering effort was successful.
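    |
    | A minimal sketch of how a summary plot like the one above can be produced (assuming the `shap` package; `model` and `X_test` refer to the training sketch in the Model Selection section):
    |
    | ```python
    | import shap
    |
    | # TreeExplainer handles LightGBM/XGBoost tree ensembles
    | explainer = shap.TreeExplainer(model)
    | shap_values = explainer.shap_values(X_test)
    |
    | # Global view: per-feature importance and direction of effect
    | shap.summary_plot(shap_values, X_test)
    | ```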
70 |
71 | ## Business Metrics
72 |
73 | To determine the achieved business metrics, we first need to set the threshold for our classifier.
74 |
75 | 
76 |
77 | From the threshold analysis, we can see that the maximum F1 score we can achieve is *X* across a variety of thresholds. For the purpose of this project, we can assume that the business is more interested in obtaining higher recall than precision, so we'll set the threshold at *X* which gives us the following metrics *(numbers are made up)*:
78 |
79 | | Threshold | 0.25 |
80 | |------------|------|
81 | | Precision | 0.7 |
82 | | Recall | 0.9 |
83 | | F1 Score | 0.85 |
84 | | Alert Rate | 0.02 |
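    |
    | A minimal sketch of how the numbers in this table can be computed for a candidate threshold (assuming scikit-learn; `y_test` and `anomaly_proba` refer to the training sketch above, and 0.25 is the chosen cut-off):
    |
    | ```python
    | import numpy as np
    | from sklearn.metrics import f1_score, precision_score, recall_score
    |
    | threshold = 0.25
    | y_pred = anomaly_proba >= threshold           # binarise probabilities at the chosen cut-off
    |
    | print("Precision: ", precision_score(y_test, y_pred))
    | print("Recall:    ", recall_score(y_test, y_pred))
    | print("F1 Score:  ", f1_score(y_test, y_pred))
    | print("Alert Rate:", y_pred.mean())           # share of requests flagged as anomalous
    | ```
    |
    | The `evaluate_thresholds` helper in `utils/ml.py` sweeps a range of thresholds and plots these metrics per threshold.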
85 |
86 | ## Prediction Service
87 |
88 | For this project, the assumption is that feature engineering will be handled by another service, so the deployment part is responsible purely for model inference.
89 | To create the API locally, you'll need to use Docker.
90 |
91 | ### Step 1: Build Docker Image
92 |
93 | Clone the repository and go to the folder with the Dockerfile. Then run the following command to build the image.
94 |
95 | ```shell
96 | docker build -t prediction-service:latest .
97 | ```
98 |
99 | To check if the image was created successfully, run `docker images` in your CLI and you should see `prediction-service` listed.
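    |
    | Before sending any requests, the container needs to be running. A minimal sketch, assuming the defaults from the Dockerfile (app served on port 80):
    |
    | ```shell
    | docker run -p 80:80 prediction-service:latest
    | ```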
100 |
101 | ### Step 2: Send the Request
102 |
103 | To test if the API is working, you can run the `ping.py` file in the `app` folder. You'll need Python installed on your computer.
104 |
105 | ```shell
106 | python app/ping.py
107 | ```
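    |
    | `ping.py` itself isn't reproduced in this listing; conceptually it just sends a request to the running container and prints the response, along these lines (the endpoint and payload are assumptions, not the script's actual contents):
    |
    | ```python
    | import requests
    |
    | # Assumed local address, based on the Dockerfile exposing port 80
    | resp = requests.get("http://localhost:80/")
    | print(resp.status_code, resp.text)
    | ```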
108 |
109 | ### Step 3: Measuring Response Time
110 |
111 | The following response times were measured locally by sending 100 requests per second from 1 user:
112 |
113 | | Response Time | Measure |
114 | |-------------------------------|--------------|
115 | | Median Response Time | 0.1 seconds |
116 | | 99th Percentile Response Time | 0.9 seconds |
117 | | Max Response Time | 0.95 seconds |
118 |
119 | To run these tests on your machine, you'll need to run the `measure_response.py` script:
120 |
121 | ```shell
122 | python app/measure_response.py
123 | ```
124 |
125 | ## Authors
126 |
127 | * [Antons Tocilins-Ruberts](https://github.com/aruberts)
128 |
--------------------------------------------------------------------------------
/api_anomaly_project/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==2.3.1
2 | uvicorn==0.12.2
3 | fastapi==0.63.0
--------------------------------------------------------------------------------
/api_anomaly_project/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/api_anomaly_project/utils/__init__.py
--------------------------------------------------------------------------------
/api_anomaly_project/utils/cleaning.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 |
3 |
4 | def count_missing(data: pl.DataFrame) -> pl.DataFrame:
5 | """Return a polars dataframe with missing counts per columns
6 |
7 | Args:
8 | data (pl.DataFrame): input dataframe to be analysed
9 |
10 | Returns:
11 | pl.DataFrame: dataframe with missing counts
12 | """
13 | missing = data.select(
14 | pl.col(c).is_null().sum().alias(f"{c}_missing") for c in data.columns
15 | )
16 |
17 | return missing
18 |
--------------------------------------------------------------------------------
/api_anomaly_project/utils/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import plotly.express as px
3 | import polars as pl
4 | import ppscore as pps
5 |
6 |
7 | def aggregate_node_features(
8 | data: pl.DataFrame, node_features: list[str], by: str = "_id"
9 | ) -> pl.DataFrame:
10 | """Utility function to generate basic aggregation statistics features for node level features
11 |
12 | Args:
13 | data (pl.DataFrame): input dataframe
14 | node_features (list[str]): list of node features to aggregate
15 | by (str, optional): the graph ID column. Defaults to "_id".
16 |
17 | Returns:
18 | pl.DataFrame: dataframe with aggregated features
19 | """
20 | aggs = []
21 | for f in node_features:
22 | avg = pl.col(f).mean().alias(f"avg_{f}")
23 | min_val = pl.col(f).min().alias(f"min_{f}")
24 | max_val = pl.col(f).max().alias(f"max_{f}")
25 | std = pl.col(f).std().alias(f"std_{f}")
26 | aggs += [avg, min_val, max_val, std]
27 | agg_data = data.group_by(by).agg(aggs)
28 |
29 | return agg_data
30 |
31 |
32 | def feature_predictive_power(
33 | data: pl.DataFrame, x: str, y: str, plot: bool = True
34 | ) -> np.float32:
35 | """Utility to calcualte predictive power of a feature and plot its relationship with the target
36 | Args:
37 | data (pl.DataFrame): input dataframe
38 | x (str): name of the feature
39 | y (str): name of the target
40 | plot (bool, optional): indicator whether you want to plot the relationship. Defaults to True.
41 |
42 | Returns:
43 | np.float32: predictive power score
44 | """
45 | data_pd = data.select([x, y]).to_pandas()
46 | score = np.float32(pps.score(data_pd, x, y)["ppscore"]).round(4)
47 |
48 | if plot:
49 | print(f"Predictive Power Score: {score}")
50 | fig = px.histogram(
51 | x=data_pd[x],
52 | color=data_pd[y],
53 | marginal="box",
54 | histnorm="probability",
55 | title=f"{x} distribution by {y}",
56 | )
57 | fig.show()
58 |
59 | return score
60 |
61 |
62 | def get_graph_features(data: pl.DataFrame, node_features: bool = True) -> pl.DataFrame:
63 | """Pipeline function to generate graph features
64 |
65 | Args:
66 | data (pl.DataFrame): dataframe with edges 'from' and 'to'
67 | node_features (bool, optional): Indicator whether you want to create node level features. Defaults to True.
68 |
69 | Returns:
70 | pl.DataFrame: dataframe with engineered features
71 | """
72 | graph_features = (
73 | data.groupby("_id")
74 | .agg(pl.count().alias("n_connections"), pl.col("from"), pl.col("to"))
75 | .with_columns(
76 | pl.concat_list("from", "to")
77 | .list.unique()
78 | .list.lengths()
79 | .alias("n_unique_nodes")
80 | )
81 | .select(["_id", "n_connections", "n_unique_nodes"])
82 | )
83 |
84 | if node_features:
85 | node_features_agg = aggregate_node_features(
86 | data,
87 | node_features=[
88 | "global_source_degrees",
89 | "global_dest_degrees",
90 | "local_source_degrees",
91 | "local_dest_degrees",
92 | ],
93 | by="_id",
94 | )
95 |
96 | graph_features = graph_features.join(node_features_agg, on="_id")
97 |
98 | return graph_features
99 |
--------------------------------------------------------------------------------
/api_anomaly_project/utils/ml.py:
--------------------------------------------------------------------------------
1 | import mlflow
2 | import numpy as np
3 | import numpy.typing as npt
4 | import pandas as pd
5 | import plotly.express as px
6 | from optuna import create_study
7 | from optuna.integration.mlflow import MLflowCallback
8 | from optuna.trial import FrozenTrial
9 | from sklearn.ensemble import HistGradientBoostingClassifier
10 | from sklearn.metrics import f1_score, precision_score, recall_score
11 | from sklearn.model_selection import cross_val_score
12 |
13 |
14 | def evaluate_thresholds(
15 | thresholds: npt.NDArray[np.float32],
16 | y_true: npt.NDArray[np.float32],
17 | y_pred_proba: npt.NDArray[np.float32],
18 | plot: bool = True,
19 | ) -> tuple[list[float], list[float], list[float]]:
20 | rcs = []
21 | prs = []
22 | f1s = []
23 |
24 | for t in thresholds:
25 | test_binary_pred = y_pred_proba[:, 1] >= t
26 | prs.append(precision_score(y_true, test_binary_pred))
27 | rcs.append(recall_score(y_true, test_binary_pred))
28 | f1s.append(f1_score(y_true, test_binary_pred))
29 |
30 | metrics_df = pd.DataFrame({"threshold": thresholds, "score": f1s, "metric": "F1"})
31 | metrics_df = pd.concat(
32 | (
33 | metrics_df,
34 | pd.DataFrame({"threshold": thresholds, "score": rcs, "metric": "Recall"}),
35 | )
36 | )
37 | metrics_df = pd.concat(
38 | (
39 | metrics_df,
40 | pd.DataFrame(
41 | {"threshold": thresholds, "score": prs, "metric": "Precision"}
42 | ),
43 | )
44 | )
45 |
46 | optimal_thr = thresholds[np.argmax(f1s)]
47 | optimal_f1 = f1s[np.argmax(f1s)]
48 | optimal_rc = rcs[np.argmax(f1s)]
49 | optimal_pr = prs[np.argmax(f1s)]
50 |
51 | print("Threshold with Max F1 Score: ", optimal_thr)
52 | print(f"F1 at threshold {optimal_thr}: {optimal_f1}")
53 | print(f"Recall at threshold {optimal_thr}: {optimal_rc}")
54 | print(f"Precision at threshold {optimal_thr}: {optimal_pr} ")
55 |
56 | if plot:
57 | fig = px.line(
58 | metrics_df,
59 | x="threshold",
60 | y="score",
61 | color="metric",
62 | title="Metrics per Threshold",
63 | )
64 | fig.show()
65 |
66 | return rcs, prs, f1s
67 |
68 |
69 | def tune_hgbt(
70 | n_trials: int, mlflc: MLflowCallback, X_train: pd.DataFrame, y_train: pd.Series
71 | ) -> FrozenTrial:
72 | @mlflc.track_in_mlflow()
73 | def objective(trial):
74 | params = {
75 | "learning_rate": 0.1,
76 | "max_iter": trial.suggest_int("max_iter", 10, 100),
77 | "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 31),
78 | "max_depth": trial.suggest_int("max_depth", 2, 10),
79 | "l2_regularization": trial.suggest_float("l2_regularization", 0, 10),
80 | }
81 | mlflow.set_tag("model_name", "HGBT")
82 | mlflow.log_params(params)
83 |
84 | gbt = HistGradientBoostingClassifier(**params)
85 |
86 | roc_auc = cross_val_score(gbt, X_train, y_train, cv=5, scoring="roc_auc").mean()
87 | print("ROC AUC (avg 5-fold):", roc_auc)
88 |
89 | return roc_auc
90 |
91 | study = create_study(direction="maximize", study_name="hgbt_tuning")
92 | study.optimize(objective, n_trials=n_trials, callbacks=[mlflc])
93 | return study.best_trial
94 |
--------------------------------------------------------------------------------
/api_anomaly_project/utils/visualisations.py:
--------------------------------------------------------------------------------
1 | import plotly.express as px
2 | import plotly.graph_objects as go
3 |
4 | import polars as pl
5 |
6 |
7 | def bar_plot(data: pl.DataFrame, column: str, title: str) -> go.Figure:
8 | """Creates a plotly barplot from Polars column
9 |
10 | Args:
11 | data (pl.DataFrame): input dataframe
12 | column (str): column to plot
13 | title (str): title for the plot
14 |
15 | Returns:
16 | go.Figure: resulting barplot as plotly Figure
17 | """
18 | counts = data[column].value_counts(sort=True)
19 | fig = px.bar(
20 | x=counts[column].to_list(),
21 | y=counts["counts"].to_list(),
22 | text_auto=True,
23 | title=title,
24 | color_discrete_sequence=px.colors.qualitative.Antique,
25 | labels={
26 | "x": column,
27 | "y": "Counts",
28 | },
29 | )
30 | fig.update_traces(
31 | textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
32 | )
33 |
34 | return fig
35 |
36 |
37 | def proportion_plot(
38 | data: pl.DataFrame, column: str, target: str, title: str
39 | ) -> go.Figure:
40 | """Creates a plotly barplot with proportions
41 |
42 | Args:
43 | data (pl.DataFrame): input dataframe
44 | column (str): column to analyse
45 | target (str): a discrete target
46 | title (str): title for the plot
47 |
48 | Returns:
49 | go.Figure: resulting barplot as plotly Figure
50 | """
51 | counts = data.groupby(column, target).agg(pl.count())
52 | target_counts = counts.groupby(column).agg(pl.col("count").sum().alias("total"))
53 | proportions = counts.join(target_counts, on=column)
54 | proportions = proportions.with_columns(
55 | proportion=pl.col("count") / pl.col("total")
56 | ).sort((column, target))
57 | fig = px.bar(
58 | x=proportions[column].to_list(),
59 | y=proportions["proportion"].to_list(),
60 | color=proportions[target].to_list(),
61 | color_discrete_sequence=px.colors.qualitative.Antique,
62 | labels={
63 | "x": column,
64 | "y": f"{target} proportion",
65 | },
66 | title=title,
67 | )
68 | fig.update_traces(
69 | textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
70 | )
71 |
72 | return fig
73 |
74 |
75 | def boxplot_by_bin_with_target(
76 | data: pl.DataFrame,
77 | column_to_bin: str,
78 | numeric_column: str,
79 | target: str,
80 | number_bins: int = 10,
81 | ) -> go.Figure:
82 | """Creates a plotly boxplot
83 |
84 | Args:
85 | data (pl.DataFrame): input dataframe
86 | column_to_bin (str): numeric column to bin
87 | numeric_column (str): numeric column to create a box plot from
88 | target (str): target column to colour a boxplot
89 | number_bins (int, optional): number of quantile bins to create. Defaults to 10.
90 |
91 | Returns:
92 | go.Figure: resulting boxplot as plotly Figure
93 | """
94 |
95 | temp = data.select(
96 | pl.col(column_to_bin)
97 | .qcut(number_bins, allow_duplicates=True)
98 | .alias(f"{column_to_bin}_binned"),
99 | pl.col(column_to_bin),
100 | pl.col(numeric_column),
101 | pl.col(target),
102 | )
103 |
104 | order = (
105 | temp.groupby(f"{column_to_bin}_binned")
106 | .agg(pl.col(column_to_bin).min().alias("min"))
107 | .sort("min")[f"{column_to_bin}_binned"]
108 | .to_list()
109 | )
110 |
111 | fig = px.box(
112 | x=temp[f"{column_to_bin}_binned"].to_list(),
113 | y=temp[numeric_column].to_list(),
114 | color=temp[target].to_list(),
115 | color_discrete_sequence=px.colors.qualitative.Antique,
116 | log_y=True,
117 | category_orders={"x": order},
118 | labels={
119 | "x": "",
120 | "y": numeric_column,
121 | },
122 | )
123 |
124 | return fig
125 |
--------------------------------------------------------------------------------
/deployment/fastapi/Dockerfile:
--------------------------------------------------------------------------------
1 | # Start from a base image
2 | FROM python:3.9-slim
3 |
4 | # Set the working directory
5 | WORKDIR /app
6 |
7 | # Copy the requirements file into the container
8 | COPY requirements.txt requirements.txt
9 |
10 | # Install the required packages
11 | RUN pip install --upgrade pip
12 | RUN pip install -r requirements.txt
13 |
14 | # Copy the application code into the container
15 | COPY ["loan_catboost_model.cbm", "app.py", "./"]
16 |
17 | # Expose the app port
18 | EXPOSE 80
19 |
20 | # Run command
21 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
--------------------------------------------------------------------------------
/deployment/fastapi/app.py:
--------------------------------------------------------------------------------
1 | import catboost as cb
2 | import pandas as pd
3 | from pydantic import BaseModel
4 |
5 | from fastapi import FastAPI
6 |
7 |
8 | # Pydantic classes for input and output
9 | class LoanApplication(BaseModel):
10 | Term: int
11 | NoEmp: int
12 | CreateJob: int
13 | RetainedJob: int
14 | longitude: float
15 | latitude: float
16 | GrAppv: float
17 | SBA_Appv: float
18 | is_new: str
19 | FranchiseCode: str
20 | UrbanRural: int
21 | City: str
22 | State: str
23 | Bank: str
24 | BankState: str
25 | RevLineCr: str
26 | naics_first_two: str
27 | same_state: str
28 |
29 |
30 | class PredictionOut(BaseModel):
31 | default_proba: float
32 |
33 |
34 | # Load the model
35 | model = cb.CatBoostClassifier()
36 | model.load_model("loan_catboost_model.cbm")
37 |
38 | # Start the app
39 | app = FastAPI()
40 |
41 | # Home page
42 | @app.get("/")
43 | def home():
44 | return {"message": "Loan Default Prediction App", "model_version": 0.1}
45 |
46 |
47 | # Inference endpoint
48 | @app.post("/predict", response_model=PredictionOut)
49 | def predict(payload: LoanApplication):
50 | cust_df = pd.DataFrame([payload.dict()])
51 | preds = model.predict_proba(cust_df)[0, 1]
52 | result = {"default_proba": preds}
53 | return result
54 |
--------------------------------------------------------------------------------
/deployment/fastapi/app_test.py:
--------------------------------------------------------------------------------
1 | import random
2 | from locust import HttpUser, task, constant_throughput
3 |
4 | test_applications = [
5 | {
6 | "Term": 84,
7 | "NoEmp": 5,
8 | "CreateJob": 0,
9 | "RetainedJob": 5,
10 | "longitude": -77.9221,
11 | "latitude": 35.3664,
12 | "GrAppv": 1500000.0,
13 | "SBA_Appv": 1275000.0,
14 | "is_new": True,
15 | "FranchiseCode": "0",
16 | "UrbanRural": 1,
17 | "City": "Other",
18 | "State": "NC",
19 | "Bank": "BBCN BANK",
20 | "BankState": "CA",
21 | "RevLineCr": "N",
22 | "naics_first_two": "45",
23 | "same_state": False,
24 | },
25 | {
26 | "Term": 19,
27 | "NoEmp": 10,
28 | "CreateJob": 0,
29 | "RetainedJob": 10,
30 | "longitude": -85.0117,
31 | "latitude": 41.0699,
32 | "GrAppv": 3500000.0,
33 | "SBA_Appv": 1750000.0,
34 | "is_new": False,
35 | "FranchiseCode": "1",
36 | "UrbanRural": 2,
37 | "City": "Other",
38 | "State": "IN",
39 | "Bank": "WELLS FARGO BANK NATL ASSOC",
40 | "BankState": "SD",
41 | "RevLineCr": "Y",
42 | "naics_first_two": "81",
43 | "same_state": False,
44 | },
45 | ]
46 |
47 |
48 | class BankLoan(HttpUser):
49 | wait_time = constant_throughput(1)
50 |
51 | @task
52 | def predict(self):
53 | self.client.post(
54 | "/predict",
55 | json=random.choice(test_applications),
56 | timeout=1,
57 | )
58 |
--------------------------------------------------------------------------------
/deployment/fastapi/loan_catboost_model.cbm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/deployment/fastapi/loan_catboost_model.cbm
--------------------------------------------------------------------------------
/deployment/fastapi/measure_response.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import numpy as np
4 | from tqdm import tqdm
5 |
6 | if __name__ == "__main__":
7 | # Example loan application
8 | application = {
9 | "Term": 84,
10 | "NoEmp": 5,
11 | "CreateJob": 0,
12 | "RetainedJob": 5,
13 | "longitude": -77.9221,
14 | "latitude": 35.3664,
15 | "GrAppv": 1500000.0,
16 | "SBA_Appv": 1275000.0,
17 | "is_new": "True",
18 | "FranchiseCode": "0",
19 | "UrbanRural": 1,
20 | "City": "Other",
21 | "State": "NC",
22 | "Bank": "BBCN BANK",
23 | "BankState": "CA",
24 | "RevLineCr": "N",
25 | "naics_first_two": "45",
26 | "same_state": "False"
27 | }
28 |
29 | # Location of my server
30 | url = "https://default-service-ni4eqbkvca-nw.a.run.app/predict"
31 |
32 | # Measure the response time
33 | all_times = []
34 | # Send 100 requests
35 | for i in tqdm(range(100)):
36 | t0 = time.time_ns() // 1_000_000
37 | # Send a request
38 | resp = requests.post(url, json=application)
39 | t1 = time.time_ns() // 1_000_000
40 | # Measure how much time it took to get a response in ms
41 | time_taken = t1 - t0
42 | all_times.append(time_taken)
43 |
44 | # Print out the results
45 | print("Response time in ms:")
46 | print("Median:", np.quantile(all_times, 0.5))
47 | print("95th precentile:", np.quantile(all_times, 0.95))
48 | print("Max:", np.max(all_times))
49 |
--------------------------------------------------------------------------------
/deployment/fastapi/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.92.0
2 | pydantic==1.10.5
3 | uvicorn==0.20.0
4 | catboost==1.1.1
5 | numpy==1.21.5
6 | pandas==1.5.2
7 | gunicorn==20.1.0
--------------------------------------------------------------------------------
/deployment/flask/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base image is Python 3.9
2 | FROM python:3.9-slim
3 |
4 | # Set the working directory
5 | WORKDIR /app
6 |
7 | # Copy the requirements file into the container
8 | COPY requirements.txt requirements.txt
9 |
10 | # Install the required packages
11 | RUN pip install --upgrade pip
12 | RUN pip install -r requirements.txt
13 |
14 | # Copy the model and application code into the container
15 | COPY ["loan_catboost_model.cbm", "app.py", "./"]
16 |
17 | # Run the app using gunicorn
18 | ENTRYPOINT [ "gunicorn", "--bind=0.0.0.0:8989", "app:app" ]
19 |
--------------------------------------------------------------------------------
/deployment/flask/app.py:
--------------------------------------------------------------------------------
1 | import catboost as cb
2 | import pandas as pd
3 |
4 | from flask import Flask, jsonify, request
5 |
6 | # Load the model
7 | model = cb.CatBoostClassifier()
8 | model.load_model("loan_catboost_model.cbm")
9 |
10 | # Init the app
11 | app = Flask("default")
12 |
13 |
14 | # Setup prediction endpoint
15 | @app.route("/predict", methods=["POST"])
16 | def predict():
17 | # Get the provided JSON
18 | X = request.get_json()
19 | # Perform a prediction
20 | preds = model.predict_proba(pd.DataFrame(X, index=[0]))[0, 1]
21 | # Output json with prediction
22 | result = {"default_proba": preds}
23 | return jsonify(result)
24 |
25 |
26 | if __name__ == "__main__":
27 | # Run the app on local host and port 8989
28 | app.run(debug=True, host="0.0.0.0", port=8989)
29 |
--------------------------------------------------------------------------------
/deployment/flask/app_test.py:
--------------------------------------------------------------------------------
1 | import random
2 | from locust import HttpUser, task, constant_throughput
3 |
4 | test_applications = [
5 | {
6 | "Term": 84,
7 | "NoEmp": 5,
8 | "CreateJob": 0,
9 | "RetainedJob": 5,
10 | "longitude": -77.9221,
11 | "latitude": 35.3664,
12 | "GrAppv": 1500000.0,
13 | "SBA_Appv": 1275000.0,
14 | "is_new": True,
15 | "FranchiseCode": "0",
16 | "UrbanRural": 1,
17 | "City": "Other",
18 | "State": "NC",
19 | "Bank": "BBCN BANK",
20 | "BankState": "CA",
21 | "RevLineCr": "N",
22 | "naics_first_two": "45",
23 | "same_state": False,
24 | },
25 | {
26 | "Term": 19,
27 | "NoEmp": 10,
28 | "CreateJob": 0,
29 | "RetainedJob": 10,
30 | "longitude": -85.0117,
31 | "latitude": 41.0699,
32 | "GrAppv": 3500000.0,
33 | "SBA_Appv": 1750000.0,
34 | "is_new": False,
35 | "FranchiseCode": "1",
36 | "UrbanRural": 2,
37 | "City": "Other",
38 | "State": "IN",
39 | "Bank": "WELLS FARGO BANK NATL ASSOC",
40 | "BankState": "SD",
41 | "RevLineCr": "Y",
42 | "naics_first_two": "81",
43 | "same_state": False,
44 | },
45 | ]
46 |
47 |
48 | class BankLoan(HttpUser):
49 | wait_time = constant_throughput(1)
50 |
51 | @task
52 | def predict(self):
53 | self.client.post(
54 | "/predict",
55 | json=random.choice(test_applications),
56 | timeout=1,
57 | )
58 |
--------------------------------------------------------------------------------
/deployment/flask/loan_catboost_model.cbm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/deployment/flask/loan_catboost_model.cbm
--------------------------------------------------------------------------------
/deployment/flask/measure_response.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import numpy as np
4 | from tqdm import tqdm
5 |
6 | if __name__ == "__main__":
7 | # Example loan application
8 | application = {
9 | "Term": 84,
10 | "NoEmp": 5,
11 | "CreateJob": 0,
12 | "RetainedJob": 5,
13 | "longitude": -77.9221,
14 | "latitude": 35.3664,
15 | "GrAppv": 1500000.0,
16 | "SBA_Appv": 1275000.0,
17 | "is_new": True,
18 | "FranchiseCode": "0",
19 | "UrbanRural": 1,
20 | "City": "Other",
21 | "State": "NC",
22 | "Bank": "BBCN BANK",
23 | "BankState": "CA",
24 | "RevLineCr": "N",
25 | "naics_first_two": "45",
26 | "same_state": False,
27 | }
28 |
29 | # Location of my server
30 | url = "http://0.0.0.0:8989/predict"
31 |
32 | # Measure the response time
33 | all_times = []
34 | # For 1000 times
35 | for i in tqdm(range(1000)):
36 | t0 = time.time_ns() // 1_000_000
37 | # Send a request
38 | resp = requests.post(url, json=application)
39 | t1 = time.time_ns() // 1_000_000
40 | # Measure how much time it took to get a response in ms
41 | time_taken = t1 - t0
42 | all_times.append(time_taken)
43 |
44 | # Print out the results
45 | print("Response time in ms:")
46 | print("Median:", np.quantile(all_times, 0.5))
47 |     print("95th percentile:", np.quantile(all_times, 0.95))
48 | print("Max:", np.max(all_times))
49 |
--------------------------------------------------------------------------------
/deployment/flask/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==2.2.3
2 | catboost==1.1.1
3 | numpy==1.21.5
4 | pandas==1.5.2
5 | gunicorn==20.1.0
--------------------------------------------------------------------------------
/hp_tuning/bv_tradeoff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/hp_tuning/bv_tradeoff.png
--------------------------------------------------------------------------------
/metaflow/fraud_email/fradulent_emails.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/metaflow/fraud_email/fradulent_emails.txt
--------------------------------------------------------------------------------
/metaflow/fraud_email/readme.md:
--------------------------------------------------------------------------------
1 | # Text Processing Pipeline using Polars
2 |
3 | This repository contains the code for my medium post **Fast String Processing with Polars - Scam Emails Dataset**.
4 |
5 | The project implements a text processing pipeline using the Polars library for efficient data manipulation and analysis. The pipeline is designed to handle text data, perform various preprocessing tasks, and extract useful features from the text.
6 |
7 | ## Dataset
8 |
9 | The dataset used in this project is the CLAIR collection of fraud emails by Radev, D. (2008). The dataset can be accessed from the ACL Data and Code Repository under the identifier ADCR2008T001. More information about the dataset can be found at [ACL Data and Code Repository](http://aclweb.org/aclwiki).
10 |
11 | ## Dependencies
12 |
13 | The following dependencies are required to run the text processing pipeline:
14 |
15 | ```
16 | numpy==1.23.5
17 | pandas==1.5.3
18 | polars==0.17.14
19 | nltk==3.8.1
20 | scikit-learn==1.2.2
21 | matplotlib==3.7.1
22 | wordcloud==1.9.2
23 | ```
24 |
25 | ## Run in a Notebook
26 |
27 | 1. Install the required dependencies using `pip`
28 | `pip install -r requirements.txt`
29 |
30 | 2. Navigate to `email_eda.ipynb` and run the code to load, pre-process, clean, and tokenise the emails. The notebook also clusters the texts and generates a word cloud for each cluster. A script-style sketch using the same helper functions is shown in the example section below.
31 |
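32 | ## Example Usage
33 | 
34 | The snippet below is a minimal sketch (not part of the original notebook) of how the helper functions in `utils/` fit together. The intermediate column names are illustrative; `email_eda.ipynb` remains the reference workflow.
35 | 
36 | ```python
37 | import nltk
38 | import polars as pl
39 | from nltk.corpus import stopwords
40 | 
41 | from utils.feature_generation import email_features, extract_fields
42 | from utils.preprocess import email_clean, remove_stopwords, tokenise_text
43 | from utils.read_data import load_emails_txt
44 | 
45 | # Download the NLTK stopword list (only needed once)
46 | nltk.download("stopwords")
47 | stop_words = stopwords.words("english")
48 | 
49 | # Load the raw file and split it into individual emails
50 | emails = pl.DataFrame({"emails": load_emails_txt("fradulent_emails.txt")})
51 | 
52 | # Extract sender, subject and body fields, then add simple text statistics
53 | emails = extract_fields(emails)
54 | emails = email_features(emails, "email_text")
55 | 
56 | # Clean, tokenise and filter out stopwords from the email body
57 | emails = email_clean(emails, "email_text", new_col_name="email_text_clean")
58 | emails = tokenise_text(emails, "email_text_clean")
59 | emails = remove_stopwords(emails, stop_words, "email_text_clean_tokenised")
60 | ```
61 | 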
--------------------------------------------------------------------------------
/metaflow/fraud_email/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.23.5
2 | pandas==1.5.3
3 | polars==0.17.14
4 | nltk==3.8.1
5 | scikit-learn==1.2.2
6 | matplotlib==3.7.1
7 | wordcloud==1.9.2
--------------------------------------------------------------------------------
/metaflow/fraud_email/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/metaflow/fraud_email/utils/__init__.py
--------------------------------------------------------------------------------
/metaflow/fraud_email/utils/feature_generation.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 |
3 |
4 | def extract_fields(emails: pl.DataFrame) -> pl.DataFrame:
5 | """
6 | Extracts specific fields from a DataFrame containing email data and
7 | returns a modified DataFrame.
8 |
9 | Args:
10 | emails (pl.DataFrame): A DataFrame containing email data.
11 |
12 | Returns:
13 | pl.DataFrame: A modified DataFrame with extracted fields.
14 | """
15 | email_pattern = r"From:\s*([^<\n\s]+)"
16 | subject_pattern = r"Subject:\s*(.*)"
17 | name_email_pattern = r'From:\s*"?([^"<]+)"?\s*<([^>]+)>'
18 |
19 | emails = (
20 | emails.with_columns(
21 | pl.col("emails").str.extract(name_email_pattern, 2).alias("sender_email"),
22 | pl.col("emails").str.extract(name_email_pattern, 1).alias("sender_name"),
23 | pl.col("emails").str.extract(subject_pattern, 1).alias("subject"),
24 | )
25 | .with_columns(
26 | pl.when(pl.col("sender_email").is_null())
27 | .then(pl.col("emails").str.extract(email_pattern, 1))
28 | .otherwise(pl.col("sender_email"))
29 | .alias("sender_email")
30 | )
31 | .with_columns(
32 | pl.col("emails")
33 | .str.replace("Status: RO", "Status: O", literal=True)
34 | .str.split("Status: O")
35 | .arr.get(1)
36 | .alias("email_text")
37 | )
38 | )
39 |
40 | return emails
41 |
42 |
43 | def email_features(data: pl.DataFrame, col: str) -> pl.DataFrame:
44 | """
45 | Computes additional features for a specified column in a DataFrame
46 | containing email data and returns a modified DataFrame.
47 |
48 | Args:
49 | data (pl.DataFrame): A DataFrame containing email data.
50 | col (str): The name of the column in the DataFrame to compute features for.
51 |
52 | Returns:
53 | pl.DataFrame: A modified DataFrame with additional computed features.
54 | """
55 | data = data.with_columns(
56 | pl.col(col).str.n_chars().alias(f"{col}_length"),
57 | ).with_columns(
58 | (pl.col(col).str.count_match(r"[A-Z]") / pl.col(f"{col}_length")).alias(
59 | f"{col}_percent_capital"
60 | ),
61 | (pl.col(col).str.count_match(r"[^A-Za-z ]") / pl.col(f"{col}_length")).alias(
62 | f"{col}_percent_digits"
63 | ),
64 | )
65 |
66 | return data
67 |
--------------------------------------------------------------------------------
/metaflow/fraud_email/utils/plots.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from wordcloud import WordCloud
3 |
4 |
5 | def generate_word_cloud(text: str):
6 | """
7 | Generate and display a word cloud image based on the provided text.
8 |
9 | Args:
10 | text (str): The input text to generate the word cloud from.
11 |
12 | Returns:
13 | None
14 | """
15 | # Generate a word cloud image
16 | wordcloud = WordCloud(
17 | max_words=100, background_color="white", width=1600, height=800
18 | ).generate(text)
19 |
20 | plt.figure(figsize=(20, 10), facecolor="k")
21 | plt.imshow(wordcloud)
22 | plt.axis("off")
23 | plt.tight_layout(pad=0)
24 | plt.show()
25 |
--------------------------------------------------------------------------------
/metaflow/fraud_email/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 |
3 |
4 | def email_clean(
5 | data: pl.DataFrame, col: str, new_col_name: str | None = None
6 | ) -> pl.DataFrame:
7 | """
8 | Cleans and preprocesses the text in a specified column of a DataFrame containing
9 | email data, and returns a modified DataFrame.
10 |
11 | Args:
12 | data (pl.DataFrame): A DataFrame containing email data.
13 | col (str): The name of the column in the DataFrame to clean and preprocess.
14 | new_col_name (str | None, optional): The name for the new column with cleaned data. Defaults to None.
15 |
16 | Returns:
17 | pl.DataFrame: A modified DataFrame with the cleaned and preprocessed text.
18 |
19 | """
20 | data = data.with_columns(
21 | pl.col(col)
22 | .str.replace_all(r"<.*?>", " ")
23 | .str.replace_all(r"[^a-zA-Z\s]+", " ")
24 | .str.replace_all(r"\s+", " ")
25 | .str.to_lowercase()
26 | .alias(new_col_name if new_col_name is not None else col)
27 | )
28 |
29 | return data
30 |
31 |
32 | def tokenise_text(data: pl.DataFrame, col: str, split_token: str = " ") -> pl.DataFrame:
33 | """
34 | Tokenizes the text in a specified column of a DataFrame containing email data and returns a modified DataFrame.
35 |
36 | Args:
37 | data (pl.DataFrame): A DataFrame containing email data.
38 | col (str): The name of the column in the DataFrame to tokenize.
39 | split_token (str, optional): The token used to split the text into tokens. Defaults to " ".
40 |
41 | Returns:
42 | pl.DataFrame: A modified DataFrame with tokenized text.
43 | """
44 | data = data.with_columns(
45 | pl.col(col).str.split(split_token).alias(f"{col}_tokenised")
46 | )
47 |
48 | return data
49 |
50 |
51 | def remove_stopwords(
52 | data: pl.DataFrame, stopwords: set | list, col: str
53 | ) -> pl.DataFrame:
54 | """Removes stopwords from the text in a specified column of a DataFrame containing email data and returns a modified DataFrame.
55 |
56 | Args:
57 | data (pl.DataFrame): A DataFrame containing email data.
58 | stopwords (set | list): A set or list of stopwords to be removed from the text.
59 | col (str): The name of the column in the DataFrame to remove stopwords from.
60 |
61 | Returns:
62 | pl.DataFrame: A modified DataFrame with stopwords removed from the text.
63 | """
64 | data = data.with_columns(
65 | pl.col(col)
66 | .arr.eval(
67 | pl.when(
68 | (~pl.element().is_in(stopwords)) & (pl.element().str.n_chars() > 2)
69 | ).then(pl.element())
70 | )
71 | .arr.eval(pl.element().drop_nulls())
72 | )
73 | return data
74 |
--------------------------------------------------------------------------------
/metaflow/fraud_email/utils/read_data.py:
--------------------------------------------------------------------------------
1 | def load_emails_txt(path: str, split_str: str = "From r ") -> list[str]:
2 | """
3 | Loads emails from a text file and returns them as a list.
4 |
5 | Args:
6 | path (str): The file path to the text file.
7 | split_str (str, optional): The string used to split the text file into
8 | individual emails. Defaults to "From r ".
9 |
10 | Returns:
11 | list[str]: A list of emails extracted from the text file.
12 | """
13 | with open(path, "r", encoding="utf-8", errors="ignore") as file:
14 | text = file.read()
15 |
16 | emails = text.split(split_str)
17 |
18 | return emails
19 |
--------------------------------------------------------------------------------
/mlflow_models/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/mlflow_models/.DS_Store
--------------------------------------------------------------------------------
/mlflow_models/MLProject:
--------------------------------------------------------------------------------
1 | name: hp_search
2 |
3 | python_env: python_env.yaml
4 |
5 | entry_points:
6 | # Use Hyperopt to optimize hyperparams of the train entry_point.
7 | search_params:
8 | parameters:
9 | training_data: {type: string, default: "hemanthsai7/loandefault"}
10 | max_runs: {type: int, default: 10}
11 | model_type: {type: str, default: "hgbt"}
12 | command: "python -O search_params.py {training_data} --max-runs {max_runs} --model-type {model_type}"
13 |
14 | # train Random Forest model with default HPs
15 | train_rf:
16 | parameters:
17 | dset_name: {type: string, default: "sgpjesus/bank-account-fraud-dataset-neurips-2022"}
18 | max_depth: {type: int, default: 5}
19 | max_features: {type: float, default: 0.1}
20 | class_weight: {type: str, default: "balanced"}
21 | min_samples_leaf: {type: int, default: 10}
22 | command: "python train_rf.py {dset_name}
23 | --max-depth {max_depth}
24 | --max-features {max_features}
25 | --class-weight {class_weight}
26 | --min-samples-leaf {min_samples_leaf}"
27 |
28 | # train HistGradientBoosted model with default parameters
29 | train_hgbt:
30 | parameters:
31 | dset_name: {type: string, default: "sgpjesus/bank-account-fraud-dataset-neurips-2022"}
32 | max_depth: {type: int, default: 20}
33 | learning_rate: {type: float, default: 0.1}
34 | class_weight: {type: str, default: "balanced"}
35 | max_leaf_nodes: {type: int, default: 31}
36 |       l2_regularization: {type: float, default: 1.0}
37 | command: "python train_hgbt.py {dset_name}
38 | --max-depth {max_depth}
39 | --learning-rate {learning_rate}
40 | --class-weight {class_weight}
41 | --max-leaf-nodes {max_leaf_nodes}
42 | --l2-regularization {l2_regularization}"
--------------------------------------------------------------------------------
/mlflow_models/model_search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import json\n",
10 | "\n",
11 | "import mlflow\n",
12 | "import numpy as np\n",
13 | "import pandas as pd\n",
14 | "import requests\n",
15 | "from mlflow.tracking import MlflowClient\n",
16 | "\n",
17 | "from train_rf import CATEGORICAL_FEATURES, NUMERICAL_FEATURES\n"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "current_experiment=dict(mlflow.get_experiment_by_name(\"loan\"))\n",
27 | "experiment_id=current_experiment['experiment_id']\n",
28 | "\n",
29 | "# Get this from UI or CLI\n",
30 | "rf_parent_run = \"03046a89d08346a5bda301cc7c745885\""
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Find the best model"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 |       "Experiment had 10 HP tuning rounds\n",
50 | "Best run - 0 with PR AUC of 0.104\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "# To access MLFlow stuff we need to work with MlflowClient\n",
56 | "client = MlflowClient()\n",
57 | "\n",
58 | "# Searches runs for a specific attribute and filters them by Parent Run ID\n",
59 | "runs = client.search_runs(\n",
60 | " [experiment_id], \n",
61 | " f\"tags.mlflow.parentRunId = '{rf_parent_run}'\", \n",
62 | " order_by=[\"metrics.test_PR_AUC DESC\"]\n",
63 | ")\n",
64 | "\n",
65 | "# Select the best run according to test_PR_AUC metric\n",
66 | "best_run = np.argmax([f.data.metrics['test_PR_AUC'] for f in runs])\n",
67 | "best_pr_auc = np.round(runs[best_run].data.metrics['test_PR_AUC'], 4)\n",
68 | "\n",
69 |     "print(f\"Experiment had {len(runs)} HP tuning rounds\")\n",
70 | "print(f\"Best run - {best_run} with PR AUC of {best_pr_auc}\")"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "Best model URI - runs:/1d2537d89cb04760b3b9bc501ee0854f/sklearn_models\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "# log-model history is stored as string, so we need to \"jsonify\" it first\n",
88 | "log_model_info = json.loads(runs[best_run].data.tags['mlflow.log-model.history'])[0]\n",
89 | "\n",
90 | "# Construct a valid model URI\n",
91 | "model_uri = 'runs:/' + log_model_info['run_id'] + '/' + log_model_info['artifact_path']\n",
92 | "print(f\"Best model URI - {model_uri}\")\n"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Load the best model"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 16,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "# Data sample to test the model\n",
109 | "data = pd.read_csv(\"./data/raw/train.csv\", nrows=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 17,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stderr",
119 | "output_type": "stream",
120 | "text": [
121 | "2023/02/14 11:46:29 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:\n",
122 | " - category-encoders (current: 2.6.0, required: category-encoders==2.3.0)\n",
123 | "To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.\n"
124 | ]
125 | },
126 | {
127 | "data": {
128 | "text/plain": [
129 | "array([[0.4980769, 0.5019231]])"
130 | ]
131 | },
132 | "execution_count": 17,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "# Load the model as pyfunc\n",
139 | "sklearn_pyfunc = mlflow.pyfunc.load_model(model_uri=model_uri)\n",
140 | "sklearn_pyfunc.predict(data)"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "## Register and Promote"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 19,
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "name": "stderr",
157 | "output_type": "stream",
158 | "text": [
159 | "Successfully registered model 'loan_model'.\n",
160 | "2023/02/14 11:51:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: loan_model, version 1\n",
161 | "Created version '1' of model 'loan_model'.\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "model_name = 'loan_model'\n",
167 | "model_version = 1\n",
168 | "\n",
169 | "# Register model\n",
170 | "mlflow.register_model(model_uri, model_name)\n",
171 | "\n",
172 | "# Promote to Production\n",
173 | "logs = client.transition_model_version_stage(name=model_name, version=model_version, stage=\"Production\")"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "## Load from Production Model Registry"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 20,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stderr",
190 | "output_type": "stream",
191 | "text": [
192 | "2023/02/14 11:54:25 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:\n",
193 | " - category-encoders (current: 2.6.0, required: category-encoders==2.3.0)\n",
194 | "To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.\n"
195 | ]
196 | },
197 | {
198 | "data": {
199 | "text/plain": [
200 | "array([[0.4980769, 0.5019231]])"
201 | ]
202 | },
203 | "execution_count": 20,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "stage = 'Production'\n",
210 | "\n",
211 | "# Since it's a registered model in Production, we can load it like this now!\n",
212 | "# No need for model URIs\n",
213 | "model_registry_path = f'models:/{model_name}/{stage}'\n",
214 | "production_model = mlflow.pyfunc.load_model(model_registry_path)\n",
215 | "\n",
216 | "production_model.predict(data)"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "## Serve models"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "Run this command in the terminal: `mlflow models serve --model-uri models:/loan_model/Production -p 5001`"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "### Call from server"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 22,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "# Prepare the data to be sent to API\n",
247 | "example = data[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]\n",
248 | "to_send = example.to_dict(orient='split')\n",
249 | "to_send.pop(\"index\", None)"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 21,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "# Prediction endpoint\n",
259 | "url = 'http://127.0.0.1:5001/invocations'\n",
260 | "\n",
261 |     "# Send the example to the prediction endpoint\n",
262 | "response = requests.post(url=url, data=json.dumps({\"dataframe_split\" :to_send}), headers={\"Content-type\": \"application/json\"})\n",
263 | "\n",
264 | "# Load the response\n",
265 | "response_json = json.loads(response.text)\n",
266 | "print(response_json)"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": []
275 | }
276 | ],
277 | "metadata": {
278 | "interpreter": {
279 | "hash": "a2df742b932880654a3f6652148a9c802dc0dfad475f6beda4797814052023f2"
280 | },
281 | "kernelspec": {
282 | "display_name": "Python 3.9.13 ('base')",
283 | "language": "python",
284 | "name": "python3"
285 | },
286 | "language_info": {
287 | "codemirror_mode": {
288 | "name": "ipython",
289 | "version": 3
290 | },
291 | "file_extension": ".py",
292 | "mimetype": "text/x-python",
293 | "name": "python",
294 | "nbconvert_exporter": "python",
295 | "pygments_lexer": "ipython3",
296 | "version": "3.9.13"
297 | },
298 | "orig_nbformat": 4
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 2
302 | }
303 |
--------------------------------------------------------------------------------
/mlflow_models/python_env.yaml:
--------------------------------------------------------------------------------
1 | python: "3.9"
2 | build_dependencies:
3 | - pip
4 | dependencies:
5 | - numpy>=1.21
6 | - click>=8.0
7 | - pandas>=1.5
8 | - scipy>=1.7
9 | - scikit-learn==1.2.1
10 | - mlflow>=2.1
11 | - hyperopt==0.2.7
12 | - protobuf
13 | - kaggle
14 | - category-encoders==2.3.0
--------------------------------------------------------------------------------
/mlflow_models/search_params.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from hyperopt import fmin, hp, tpe
4 | from hyperopt.pyll import scope
5 |
6 | import mlflow.projects
7 | from mlflow.tracking import MlflowClient
8 |
9 |
10 | @click.command(
11 | help="""
12 | Perform hyperparameter search with Hyperopt library.
13 | Optimize PR AUC.
14 | """
15 | )
16 | @click.option(
17 | "--max-runs",
18 | type=click.INT,
19 | default=10,
20 | help="Maximum number of runs to evaluate."
21 | )
22 | @click.option(
23 | "--model-type",
24 | type=click.STRING,
25 | default="hgbt",
26 | help="Model type to tune"
27 | )
28 | @click.argument("training_data")
29 | def train(training_data, max_runs, model_type):
30 | """
31 | Run hyperparameter optimization.
32 | """
33 |     # Client used to read back metrics from the training sub-runs
34 | tracking_client = MlflowClient()
35 |
36 | def new_eval(experiment_id):
37 | """
38 | Create a new eval function
39 | :experiment_id: Experiment id for the training run
40 | :return: new eval function.
41 | """
42 |
43 | def eval(params):
44 | """
45 | Train sklearn model with given parameters by invoking MLflow run.
46 | :param params: Parameters to the train script we optimize over
47 | :return: The metric value evaluated on the validation data.
48 | """
49 | with mlflow.start_run(nested=True) as child_run:
50 | if model_type == "rf":
51 | # Params used to train RF
52 | (
53 | max_depth, max_features,
54 | class_weight, min_samples_leaf
55 | ) = params
56 | # Run the training script as MLflow sub-run
57 | p = mlflow.projects.run(
58 | uri=".",
59 | entry_point="train_rf",
60 | run_id=child_run.info.run_id,
61 | parameters={
62 | "dset_name": training_data,
63 | "max_depth": str(max_depth),
64 | "max_features": str(max_features),
65 | "class_weight": str(class_weight),
66 | "min_samples_leaf": str(min_samples_leaf),
67 | },
68 | experiment_id=experiment_id,
69 | synchronous=False,
70 | )
71 |                     # Wait for the sub-run to finish before reading its metrics
72 | succeeded = p.wait()
73 | # Log params
74 | mlflow.log_params(
75 | {
76 | "max_depth": max_depth,
77 | "max_features": max_features,
78 | "class_weight": class_weight,
79 | "min_samples_leaf": min_samples_leaf,
80 | }
81 | )
82 | elif model_type == "hgbt":
83 | # Params used to train HGBT
84 | (
85 | max_depth,
86 | max_leaf_nodes,
87 | class_weight,
88 | l2_regularization,
89 | learning_rate,
90 | ) = params
91 | # Run the train_hgbt as sub-run
92 | p = mlflow.projects.run(
93 | uri=".",
94 | entry_point="train_hgbt",
95 | run_id=child_run.info.run_id,
96 | parameters={
97 | "dset_name": training_data,
98 | "learning_rate": str(learning_rate),
99 | "max_leaf_nodes": str(max_leaf_nodes),
100 | "max_depth": str(max_depth),
101 | "class_weight": str(class_weight),
102 | "l2_regularization": str(l2_regularization),
103 | },
104 | experiment_id=experiment_id,
105 | synchronous=False,
106 | )
107 | succeeded = p.wait()
108 | mlflow.log_params(
109 | {
110 | "learning_rate": learning_rate,
111 | "max_leaf_nodes": max_leaf_nodes,
112 | "max_depth": max_depth,
113 | "class_weight": class_weight,
114 | "l2_regularization": l2_regularization,
115 | }
116 | )
117 | print(succeeded)
118 |
119 | # Grab the test metrics from the MLflow run
120 | training_run = tracking_client.get_run(p.run_id)
121 | metrics = training_run.data.metrics
122 | test_prauc = metrics["test_PR_AUC"]
123 |
124 | return -test_prauc
125 |
126 | return eval
127 |
128 | if model_type == "rf":
129 | # Search space for RF
130 | space = [
131 | scope.int(hp.quniform("max_depth", 1, 30, q=1)),
132 | hp.uniform("max_features", 0.05, 0.8),
133 | hp.choice("class_weight", ["balanced", None]),
134 | scope.int(hp.quniform("min_samples_leaf", 5, 100, q=5)),
135 | ]
136 | elif model_type == "hgbt":
137 | # Search space for HGBT
138 | space = [
139 | scope.int(hp.quniform("max_depth", 1, 30, q=1)),
140 | scope.int(hp.quniform("max_leaf_nodes", 5, 100, q=5)),
141 | hp.choice("class_weight", ["balanced", None]),
142 | hp.uniform("l2_regularization", 0.0, 20.0),
143 | hp.uniform("learning_rate", 0.01, 0.1),
144 | ]
145 | else:
146 | raise ValueError(f"Model type {model_type} is not supported")
147 |
148 |     # Start the parent run that wraps the whole hyperparameter search
149 | with mlflow.start_run() as run:
150 |         # Get the experiment ID that the child runs are logged under
151 | experiment_id = run.info.experiment_id
152 |
153 | # Optimisation function that takes parent id and search params as input
154 | best = fmin(
155 | fn=new_eval(experiment_id),
156 | space=space,
157 | algo=tpe.suggest,
158 | max_evals=max_runs,
159 | )
160 | mlflow.set_tag("best params", str(best))
161 |
162 |
163 | if __name__ == "__main__":
164 | train()
165 |
--------------------------------------------------------------------------------
/mlflow_models/train_hgbt.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import click
4 | import mlflow
5 | import pandas as pd
6 | from sklearn.compose import ColumnTransformer
7 | from sklearn.ensemble import HistGradientBoostingClassifier
8 | from sklearn.model_selection import train_test_split
9 | from sklearn.pipeline import Pipeline
10 | from sklearn.preprocessing import OrdinalEncoder
11 |
12 | from utils.columns import CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET
13 | from utils.data_utils import load_raw_data
14 | from utils.eval_utils import eval_and_log_metrics
15 |
16 |
17 | @click.command(
18 |     help="Trains HGBT Model. "
19 |     "The input is expected in csv format. "
20 |     "The model and its metrics are logged with mlflow."
21 | )
22 | @click.option("--max-depth", type=click.INT, default=20, help="Depth of the trees")
23 | @click.option(
24 | "--max-leaf-nodes",
25 | type=click.INT,
26 | default=31,
27 | help="The maximum number of leaves for each tree",
28 | )
29 | @click.option(
30 | "--class-weight", type=click.STRING, default="balanced", help="Weight of labels"
31 | )
32 | @click.option(
33 | "--l2-regularization",
34 | type=click.FLOAT,
35 | default=1.0,
36 | help="The L2 regularization parameter",
37 | )
38 | @click.option(
39 | "--learning-rate",
40 | type=click.FLOAT,
41 | default=0.1,
42 | help="The learning rate, also known as shrinkage",
43 | )
44 | @click.argument("dset_name")
45 | def run(
46 | dset_name, max_depth, max_leaf_nodes, class_weight, l2_regularization, learning_rate
47 | ):
48 | """
49 | This function trains and logs an HistGradientBoostingClassifier model on a dataset.
50 |
51 | :param dset_name: The name of the dataset to be used. (str)
52 | :param max_depth: The maximum depth of the decision tree. (int)
53 | :param max_leaf_nodes: The maximum number of leaf nodes in the decision tree. (int)
54 | :param class_weight: The weight to be given to different classes in the target column. (str or None)
55 | :param l2_regularization: The L2 regularization value to be used by the model. (float)
56 | :param learning_rate: The learning rate to be used by the model. (float)
57 |
58 | :returns: None
59 |
60 | The function starts an MLflow run and logs various metrics such as accuracy, precision, and recall.
61 | It also logs the trained model using the mlflow.sklearn.log_model function.
62 | """
63 | warnings.filterwarnings("ignore")
64 | # Read data
65 | csv_loc = load_raw_data(dset_name, file_name="train.csv")
66 | data = pd.read_csv(csv_loc)
67 |
68 | # Transform categoricals into category type
69 | data[CATEGORICAL_FEATURES] = data[CATEGORICAL_FEATURES].astype("category")
70 |
71 | # Train/test split
72 | train, test = train_test_split(data, random_state=42)
73 |
74 | # Separate X and y
75 | train_x = train[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
76 | train_y = train[[TARGET]]
77 | test_x = test[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
78 | test_y = test[[TARGET]]
79 |
80 |     # Start the experiment
81 | with mlflow.start_run():
82 | # Pass the params into dictionary
83 | hgbt_params = {
84 | "learning_rate": learning_rate,
85 | "max_leaf_nodes": max_leaf_nodes,
86 | "max_depth": max_depth,
87 | "class_weight": class_weight if class_weight != "None" else None,
88 | "l2_regularization": l2_regularization,
89 | }
90 | # Define model
91 | hgbt = HistGradientBoostingClassifier(
92 | **hgbt_params,
93 | categorical_features=CATEGORICAL_FEATURES,
94 | max_iter=10000,
95 | early_stopping=True,
96 | validation_fraction=10
97 | )
98 | # Define transform
99 | transformer = ColumnTransformer(
100 | transformers=[
101 | ("categorical", OrdinalEncoder(), CATEGORICAL_FEATURES)
102 |             ], # HGBT needs the categories encoded as integers
103 | verbose_feature_names_out=False, # to not alter categorical names
104 | remainder="passthrough",
105 | )
106 | # Define pipeline
107 | pipeline = Pipeline(steps=[("prep", transformer), ("model", hgbt)]).set_output(
108 | transform="pandas"
109 | )
110 | # Fit the pipeline
111 | pipeline.fit(train_x, train_y)
112 | # Evaluate on testset
113 | test_preds = pipeline.predict_proba(test_x)
114 | eval_and_log_metrics("test", test_y, test_preds[:, 1])
115 | # Save the pipeline
116 | mlflow.sklearn.log_model(
117 | pipeline, "sklearn_models", pyfunc_predict_fn="predict_proba"
118 | )
119 |
120 |
121 | if __name__ == "__main__":
122 | run()
123 |
--------------------------------------------------------------------------------
/mlflow_models/train_rf.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import click
4 | import mlflow
5 | import pandas as pd
6 | from category_encoders import WOEEncoder
7 | from sklearn.compose import ColumnTransformer
8 | from sklearn.ensemble import RandomForestClassifier
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.pipeline import Pipeline
11 |
12 | from utils.columns import CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET
13 | from utils.data_utils import load_raw_data
14 | from utils.eval_utils import eval_and_log_metrics
15 |
16 |
17 | @click.command(
18 |     help="Trains RF Model. "
19 |     "The input is expected in csv format. "
20 |     "The model and its metrics are logged with mlflow."
21 | )
22 | @click.option("--max-depth", type=click.INT, default=5, help="Depth of the trees")
23 | @click.option(
24 | "--max-features", type=click.FLOAT, default=0.1, help="Fraction of features to use"
25 | )
26 | @click.option(
27 | "--class-weight", type=click.STRING, default="balanced", help="Weight of labels"
28 | )
29 | @click.option(
30 | "--min-samples-leaf",
31 | type=click.INT,
32 | default=10,
33 | help="Minimum number of samples required to be at a leaf node.",
34 | )
35 | @click.argument("dset_name")
36 | def run(
37 | dset_name: str,
38 | max_depth: int,
39 | max_features: float,
40 | class_weight: str,
41 | min_samples_leaf: int,
42 | ):
43 | """
44 | This function trains and logs a Random Forest Classifier pipeline with mlflow.
45 |
46 | :param dset_name: The name of the dataset to use for training.
47 | :param max_depth: The maximum depth of the tree in the Random Forest Classifier.
48 | :param max_features: The maximum number of features to consider when looking for the best split.
49 | :param class_weight: The weighting of the classes. Can be None, 'balanced', or a dictionary.
50 | :param min_samples_leaf: The minimum number of samples required to be at a leaf node.
51 |
52 | :return: None
53 | """
54 | warnings.filterwarnings("ignore")
55 | # Read data
56 | csv_loc = load_raw_data(dset_name, file_name="train.csv")
57 | data = pd.read_csv(csv_loc)
58 |
59 | # Split data into train/test
60 | train, test = train_test_split(data, random_state=42)
61 |
62 | # Separate X and y
63 | train_x = train[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
64 | train_y = train[[TARGET]]
65 |
66 | test_x = test[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
67 | test_y = test[[TARGET]]
68 |
69 |     # Start the experiment
70 | with mlflow.start_run():
71 | # Pass the parameters into dictionary
72 | rf_params = {
73 | "max_depth": max_depth,
74 | "max_features": max_features,
75 | "class_weight": class_weight if class_weight != "None" else None,
76 | "min_samples_leaf": min_samples_leaf,
77 | }
78 | # Define model
79 | rf = RandomForestClassifier(**rf_params)
80 | # Define transform
81 | transformer = ColumnTransformer(
82 | transformers=[("categorical", WOEEncoder(), CATEGORICAL_FEATURES)],
83 | remainder="passthrough",
84 | )
85 | # Define pipeline
86 | pipeline = Pipeline(steps=[("prep", transformer), ("model", rf)])
87 | # Fit the model
88 | pipeline.fit(train_x, train_y)
89 | # Evaluate and log metrics
90 | test_preds = pipeline.predict_proba(test_x)
91 | eval_and_log_metrics("test", test_y, test_preds[:, 1])
92 | # Save the model
93 | mlflow.sklearn.log_model(
94 | pipeline, "sklearn_models", pyfunc_predict_fn="predict_proba"
95 |         ) # predict_proba because we want to predict a probability when deployed
96 |
97 |
98 | if __name__ == "__main__":
99 | run()
100 |
--------------------------------------------------------------------------------
/mlflow_models/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/mlflow_models/utils/__init__.py
--------------------------------------------------------------------------------
/mlflow_models/utils/columns.py:
--------------------------------------------------------------------------------
1 | TARGET = 'Loan Status'
2 |
3 | NUMERICAL_FEATURES = [
4 | "Loan Amount",
5 | "Funded Amount",
6 | "Funded Amount Investor",
7 | "Term",
8 | "Interest Rate",
9 | "Home Ownership",
10 | "Debit to Income",
11 | "Delinquency - two years",
12 | "Inquires - six months",
13 | "Open Account",
14 | "Public Record",
15 | "Revolving Balance",
16 | "Revolving Utilities",
17 | "Total Accounts",
18 | "Total Received Interest",
19 | "Total Received Late Fee",
20 | "Recoveries",
21 | "Collection Recovery Fee",
22 | "Collection 12 months Medical",
23 | "Last week Pay",
24 | "Accounts Delinquent",
25 | "Total Collection Amount",
26 | "Total Current Balance",
27 | "Total Revolving Credit Limit",
28 | ]
29 |
30 | CATEGORICAL_FEATURES = [
31 | "Batch Enrolled",
32 | "Grade",
33 | "Sub Grade",
34 | "Employment Duration",
35 | "Verification Status",
36 | "Payment Plan",
37 | "Loan Title",
38 | "Initial List Status",
39 | "Application Type",
40 | ]
41 |
--------------------------------------------------------------------------------
/mlflow_models/utils/data_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 |
4 | import kaggle
5 |
6 |
7 | def load_raw_data(dset_name: str, file_name: str,):
8 | """Downloads and unpacks Kaggle data
9 |
10 | Args:
11 |         dset_name (str): name of the Kaggle dataset.
12 |             Follows the format - username/dataset-name.
13 |             For example - "sgpjesus/bank-account-fraud-dataset-neurips-2022".
14 |         file_name (str): name of the extracted file.
15 |             Should be specified in case there are many files in the zip archive
16 | 
17 |     Raises:
18 |         Exception: if the Kaggle API key is not set up
19 | 
20 |     Returns:
21 |         str: location of the downloaded and extracted csv file
22 | """
23 | zip_destination_folder = "./data/"
24 | raw_destination_folder = os.path.join(zip_destination_folder, "raw")
25 |
26 | # Check if the Kaggle API key was created
27 | if not os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json")):
28 | raise Exception(
29 | """
30 | Kaggle API key not found.
31 | Make sure to follow the instructions to set up your Kaggle API key.
32 | """
33 | )
34 |
35 | # Download the dataset into a current folder
36 | kaggle.api.dataset_download_files(dset_name, path=zip_destination_folder)
37 |
38 | # Check if the destination folder exists, and create it if it does not
39 | if not os.path.exists(raw_destination_folder):
40 | os.makedirs(raw_destination_folder)
41 |
42 | # Open the zip file in read mode
43 | zip_name = os.path.join(
44 | zip_destination_folder,
45 | f"{dset_name.split('/')[1]}.zip"
46 | )
47 | with zipfile.ZipFile(zip_name, "r") as zip_ref:
48 | # Extract all the files to the destination folder
49 | zip_ref.extractall(raw_destination_folder)
50 |
51 | csv_location = os.path.join(raw_destination_folder, file_name)
52 |
53 | return csv_location
54 |
--------------------------------------------------------------------------------
/mlflow_models/utils/eval_utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import average_precision_score
2 | import mlflow
3 |
4 | def eval_and_log_metrics(prefix, actual, pred):
5 | pr = average_precision_score(actual, pred)
6 | mlflow.log_metric("{}_PR_AUC".format(prefix), pr)
7 | return pr
--------------------------------------------------------------------------------
/mlflow_project/MLproject:
--------------------------------------------------------------------------------
1 | name: fraud_detection
2 |
3 | conda_env: conda_env.yaml
4 |
5 | entry_points:
6 | main:
7 | parameters:
8 | dset: {type: str, default: sgpjesus/bank-account-fraud-dataset-neurips-2022}
9 | n_trials: {type: int, default: 10}
10 | command: "python main.py {dset} {n_trials}"
11 |
--------------------------------------------------------------------------------
/mlflow_project/conda_env.yaml:
--------------------------------------------------------------------------------
1 | name: fraud
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python=3.9
6 | - pip
7 | - pip:
8 | - click
9 | - mlflow>=2.1
10 | - kaggle
11 | - polars
12 | - catboost
13 | - optuna
14 | - pandas
15 |       - scikit-learn
--------------------------------------------------------------------------------
/mlflow_project/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import mlflow
4 | from steps.download_data import load_raw_data
5 | from steps.preprocess_data import preprocess_data
6 | from steps.tune_model import tune_model
7 | from steps.train_final_model import train_model
8 |
9 | class bcolors:
10 | HEADER = '\033[95m'
11 | OKBLUE = '\033[94m'
12 | OKCYAN = '\033[96m'
13 | OKGREEN = '\033[92m'
14 | WARNING = '\033[93m'
15 | FAIL = '\033[91m'
16 | ENDC = '\033[0m'
17 | BOLD = '\033[1m'
18 | UNDERLINE = '\033[4m'
19 |
20 |
21 | def pipeline():
22 | mlflow.set_experiment("fraud")
23 | file_location = load_raw_data(sys.argv[1])
24 | print(f"{bcolors.OKCYAN}Data is loaded{bcolors.ENDC}")
25 |
26 | file_dirs = preprocess_data(file_location, missing_thr=0.95)
27 | print(f"{bcolors.OKCYAN}Data is preprocessed{bcolors.ENDC}")
28 | best_params = tune_model(
29 | train_path=file_dirs["train-data-dir"],
30 | val_path=file_dirs["val-data-dir"],
31 | n_trials=int(sys.argv[2]),
32 | )
33 | print(f"{bcolors.OKCYAN}HP tuning is finished{bcolors.ENDC}")
34 | best_params["n_estimators"] = 1000
35 | best_params["objective"] = "Logloss"
36 |
37 | roc, pr = train_model(
38 | best_params,
39 | train_path=file_dirs["train-data-dir"],
40 | val_path=file_dirs["val-data-dir"],
41 | test_path=file_dirs["test-data-dir"],
42 | )
43 | print(f"{bcolors.OKGREEN}Final model is trained. \nTestset ROC AUC: {roc}\nTestset PR AUC: {pr}{bcolors.ENDC}")
44 |
45 |
46 | if __name__ == "__main__":
47 | pipeline()
48 |
--------------------------------------------------------------------------------
/mlflow_project/steps/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/mlflow_project/steps/__init__.py
--------------------------------------------------------------------------------
/mlflow_project/steps/download_data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 |
4 | import kaggle
5 |
6 |
7 | def load_raw_data(dset_name):
8 | zip_destination_folder = "./data/"
9 | raw_destination_folder = os.path.join(zip_destination_folder, "raw")
10 |
11 | # Check if the Kaggle API key was created
12 | if not os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json")):
13 | raise Exception(
14 | "Kaggle API key not found. Make sure to follow the instructions to set up your Kaggle API key."
15 | )
16 |
17 | # Download the dataset into a current folder
18 | kaggle.api.dataset_download_files(
19 | dset_name,
20 | path=zip_destination_folder,
21 | )
22 |
23 | # Check if the destination folder exists, and create it if it does not
24 | if not os.path.exists(raw_destination_folder):
25 | os.makedirs(raw_destination_folder)
26 |
27 | # Open the zip file in read mode
28 | zip_name = os.path.join(
29 | zip_destination_folder, "bank-account-fraud-dataset-neurips-2022.zip"
30 | )
31 | with zipfile.ZipFile(zip_name, "r") as zip_ref:
32 | # Extract all the files to the destination folder
33 | zip_ref.extractall(raw_destination_folder)
34 |
35 | # TODO: make file name a param as well
36 | csv_location = os.path.join(raw_destination_folder, "Base.csv")
37 |
38 | return csv_location
--------------------------------------------------------------------------------
/mlflow_project/steps/preprocess_data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import polars as pl
3 |
4 |
5 | def process_nans(df: pl.DataFrame, drop_thr: float = 0.95) -> pl.DataFrame:
6 | for col in df.get_columns():
7 | nulls_prop = col.is_null().mean()
8 | print(f"{col.name} - {nulls_prop * 100}% missing")
9 | # drop if missing more than a threshold
10 | if nulls_prop >= drop_thr:
11 | print("Dropping", col.name)
12 | df = df.select([pl.exclude(col.name)])
13 | # If some values are missing
14 | elif nulls_prop > 0:
15 | print("Imputing", col.name)
16 | # If numeric, impute with median
17 | if col.is_numeric():
18 | fill_value = col.median()
19 | else:
20 | # Else, impute with mode
21 |                 fill_value = col.mode()[0]
22 | df = df.select(
23 | [
24 | # Exclude the original column
25 | pl.exclude(col.name),
26 | # Include the imputed one
27 | pl.col(col.name).fill_null(value=fill_value),
28 | ]
29 | )
30 |
31 | return df
32 |
33 | def drop_static(df:pl.DataFrame) -> pl.DataFrame:
34 | for col in df.get_columns():
35 | std = col.std()
36 |         # drop the column if it is constant (zero variance)
37 | if std == 0:
38 | print("Dropping", col.name)
39 | df = df.select([pl.exclude(col.name)])
40 |
41 | return df
42 |
43 |
44 | def train_val_test_split(df, test_size=0.2, val_size=0.2):
45 | df_train = df.filter(
46 |         pl.col("month") < df['month'].quantile(1 - test_size)
47 | )
48 |
49 | df_test = df.filter(
50 |         pl.col("month") >= df['month'].quantile(1 - test_size)
51 | )
52 |
53 | df_val = df_train.filter(
54 |         pl.col("month") >= df_train['month'].quantile(1 - val_size)
55 | )
56 |
57 | df_train = df_train.filter(
58 |         pl.col("month") < df_train['month'].quantile(1 - val_size)
59 | )
60 |
61 | return df_train, df_val, df_test
62 |
63 | def preprocess_data(dset_path, missing_thr):
64 | df = pl.read_csv(dset_path)
65 | # Preprocess nulls
66 | df = process_nans(df, missing_thr)
67 | # Drop static
68 | df = drop_static(df)
69 | # Train/val/test split
70 | train_df, val_df, test_df = train_val_test_split(df)
71 | # Save data
72 | split_destination_folder = './data/processed'
73 | if not os.path.exists(split_destination_folder):
74 | os.makedirs(split_destination_folder)
75 |
76 | train_df.write_parquet('./data/processed/train.parquet')
77 | val_df.write_parquet('./data/processed/validation.parquet')
78 | test_df.write_parquet('./data/processed/test.parquet')
79 |
80 | file_locations = {
81 | 'train-data-dir': './data/processed/train.parquet',
82 | 'val-data-dir': './data/processed/validation.parquet',
83 | 'test-data-dir': './data/processed/test.parquet',
84 | }
85 |
86 | return file_locations
--------------------------------------------------------------------------------
/mlflow_project/steps/train_final_model.py:
--------------------------------------------------------------------------------
1 | import catboost as cb
2 | import click
3 | import pandas as pd
4 | from sklearn.metrics import average_precision_score, roc_auc_score
5 | from steps.tune_model import CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET, read_cb_data
6 | import mlflow
7 |
8 | def train_model(params, train_path, val_path, test_path):
9 | train_dataset = read_cb_data(
10 | train_path,
11 | numeric_features=NUMERICAL_FEATURES,
12 | categorical_features=CATEGORICAL_FEATURES,
13 | target_feature=TARGET
14 | )
15 | val_dataset = read_cb_data(
16 | val_path,
17 | numeric_features=NUMERICAL_FEATURES,
18 | categorical_features=CATEGORICAL_FEATURES,
19 | target_feature=TARGET
20 | )
21 | test_dataset = read_cb_data(
22 | test_path,
23 | numeric_features=NUMERICAL_FEATURES,
24 | categorical_features=CATEGORICAL_FEATURES,
25 | target_feature=TARGET
26 | )
27 | mlflow.set_experiment("fraud")
28 | experiment = mlflow.get_experiment_by_name("fraud")
29 | client = mlflow.tracking.MlflowClient()
30 | run = client.create_run(experiment.experiment_id)
31 | with mlflow.start_run(run_id = run.info.run_id):
32 | gbm = cb.CatBoostClassifier(**params)
33 | gbm.fit(train_dataset, eval_set=val_dataset, early_stopping_rounds=50)
34 | preds = gbm.predict_proba(test_dataset)
35 | ap = average_precision_score(test_dataset.get_label(), preds[:, 1])
36 | roc = roc_auc_score(test_dataset.get_label(), preds[:, 1])
37 |
38 | mlflow.log_metric("Test ROC AUC", roc)
39 | mlflow.log_metric("Test PR AUC", ap)
40 | mlflow.log_params(params)
41 | mlflow.catboost.log_model(gbm, "catboost_model")
42 |
43 | return roc, ap
44 |
45 |
46 | if __name__ == "__main__":
47 | train_model()
48 |
--------------------------------------------------------------------------------
/mlflow_project/steps/tune_model.py:
--------------------------------------------------------------------------------
1 | import catboost as cb
2 | import mlflow
3 | import optuna
4 | import pandas as pd
5 | from optuna.integration.mlflow import MLflowCallback
6 | from sklearn.metrics import average_precision_score, roc_auc_score
7 |
8 |
9 | TARGET = "fraud_bool"
10 |
11 | CATEGORICAL_FEATURES = [
12 | "payment_type",
13 | "employment_status",
14 | "housing_status",
15 | "source",
16 | "device_os",
17 | ]
18 | NUMERICAL_FEATURES = [
19 | "income",
20 | "name_email_similarity",
21 | "prev_address_months_count",
22 | "current_address_months_count",
23 | "customer_age",
24 | "days_since_request",
25 | "intended_balcon_amount",
26 | "zip_count_4w",
27 | "velocity_6h",
28 | "velocity_24h",
29 | "velocity_4w",
30 | "bank_branch_count_8w",
31 | "date_of_birth_distinct_emails_4w",
32 | "credit_risk_score",
33 | "email_is_free",
34 | "phone_home_valid",
35 | "phone_mobile_valid",
36 | "bank_months_count",
37 | "has_other_cards",
38 | "proposed_credit_limit",
39 | "foreign_request",
40 | "session_length_in_minutes",
41 | "keep_alive_session",
42 | "device_distinct_emails_8w",
43 | "month",
44 | ]
45 |
46 |
47 | def read_cb_data(
48 | path: str, numeric_features: list, categorical_features: list, target_feature: str
49 | ):
50 | data = pd.read_parquet(path)
51 | dataset = cb.Pool(
52 | data=data[numeric_features + categorical_features],
53 | label=data[target_feature],
54 | cat_features=categorical_features,
55 | )
56 | return dataset
57 |
58 |
59 | def tune_model(train_path, val_path, n_trials):
60 | train_dataset = read_cb_data(
61 | train_path,
62 | numeric_features=NUMERICAL_FEATURES,
63 | categorical_features=CATEGORICAL_FEATURES,
64 | target_feature=TARGET,
65 | )
66 | val_dataset = read_cb_data(
67 | val_path,
68 | numeric_features=NUMERICAL_FEATURES,
69 | categorical_features=CATEGORICAL_FEATURES,
70 | target_feature=TARGET,
71 | )
72 |
73 | def objective(trial):
74 | mlflow.set_experiment("fraud")
75 | experiment = mlflow.get_experiment_by_name("fraud")
76 | client = mlflow.tracking.MlflowClient()
77 | run = client.create_run(experiment.experiment_id)
78 | with mlflow.start_run(run_id = run.info.run_id):
79 | param = {
80 | "n_estimators": 1000,
81 | "objective": "Logloss",
82 | "subsample": trial.suggest_uniform("subsample", 0.4, 1.0),
83 | "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-3, 10.0),
84 | "learning_rate": trial.suggest_uniform("learning_rate", 0.006, 0.02),
85 | "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 0.5),
86 | "depth": trial.suggest_int("depth", 2, 12),
87 | "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
88 | }
89 | mlflow.log_params(param)
90 | gbm = cb.CatBoostClassifier(**param)
91 | gbm.fit(train_dataset, eval_set=val_dataset, early_stopping_rounds=50)
92 |
93 | preds = gbm.predict_proba(val_dataset)
94 | ap = average_precision_score(val_dataset.get_label(), preds[:, 1])
95 | roc = roc_auc_score(val_dataset.get_label(), preds[:, 1])
96 | mlflow.log_metric("Val PR AUC", ap)
97 | mlflow.log_metric("Val ROC AUC", roc)
98 | return ap
99 |
100 | study = optuna.create_study(direction="maximize")
101 | study.optimize(objective, n_trials=n_trials)
102 | return study.best_trial.params
103 |
--------------------------------------------------------------------------------
/polars/data_preparation_pipeline.py:
--------------------------------------------------------------------------------
1 | """Pipeline script to prepare and save data for modelling"""
2 | import time
3 |
4 | import polars as pl
5 | import yaml
6 |
7 | from data_utils.feature_engineering import (
8 | add_period_features,
9 | add_rolling_features,
10 | basic_feature_engineering,
11 | )
12 | from data_utils.processing import clean_data, read_category_mappings
13 | from data_utils.transfomation import create_target_df
14 |
15 |
16 | def pipeline():
17 |     """Pipeline that reads, cleans, and transforms data into
18 | the format we need for modelling
19 | """
20 | # Read and unwrap the config
21 | with open("pipe_config.yaml", "r") as file:
22 | pipe_config = yaml.safe_load(file)
23 |
24 | date_column_format = pipe_config["date_column_format"]
25 | ratios_config = pipe_config["ratio_features"]
26 | diffs_config = pipe_config["difference_features"]
27 | dates_config = pipe_config["date_features"]
28 |
29 | id_to_category = read_category_mappings(pipe_config["category_map_path"])
30 | col_mappings = {"category_id": id_to_category}
31 |
32 | output_data = (
33 | pl.scan_csv(pipe_config["data_path"])
34 | .pipe(clean_data, date_column_format, col_mappings)
35 | .pipe(basic_feature_engineering, ratios_config, diffs_config, dates_config)
36 | .pipe(
37 | create_target_df,
38 | time_to_trending_thr=pipe_config["max_time_to_trending"],
39 | original_join_cols=pipe_config["join_columns"],
40 | other_cols=pipe_config["base_columns"],
41 | )
42 | .pipe(
43 | add_rolling_features,
44 | "first_day_in_trending",
45 | pipe_config["aggregate_windows"],
46 | )
47 | .pipe(
48 | add_period_features,
49 | "first_day_in_trending",
50 | pipe_config["aggregate_windows"],
51 | )
52 | ).collect()
53 |
54 | return output_data
55 |
56 |
57 | if __name__ == "__main__":
58 | t0 = time.time()
59 | output = pipeline()
60 | t1 = time.time()
61 | print("Pipeline took", t1 - t0, "seconds")
62 | print("Output shape", output.shape)
63 | print("Output columns:", output.columns)
64 | output.write_parquet("./data/modelling_data.parquet")
65 |
--------------------------------------------------------------------------------
/polars/data_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aruberts/tutorials/f81eb7a971914a254d3ab7d9b290a021db8f416b/polars/data_utils/__init__.py
--------------------------------------------------------------------------------
/polars/data_utils/feature_engineering.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | import polars as pl
4 |
5 |
6 | def ratio_features(features_config: Dict[str, List[str]]) -> List[pl.Expr]:
7 | expressions = []
8 | for name, cols in features_config.items():
9 | expressions.append((pl.col(cols[0]) / pl.col(cols[1])).alias(name))
10 |
11 | return expressions
12 |
13 |
14 | def diff_features(features_config: Dict[str, List[str]]) -> List[pl.Expr]:
15 | expressions = []
16 | for name, cols in features_config.items():
17 | expressions.append((pl.col(cols[0]) - pl.col(cols[1])).alias(name))
18 |
19 | return expressions
20 |
21 |
22 | def date_features(features_config: Dict[str, List[str]]) -> List[pl.Expr]:
23 | expressions = []
24 | for col, features in features_config.items():
25 | if "weekday" in features:
26 | expressions.append(pl.col(col).dt.weekday().alias(f"{col}_weekday"))
27 | if "month" in features:
28 | expressions.append(pl.col(col).dt.month().alias(f"{col}_month"))
29 | if "year" in features:
30 | expressions.append(pl.col(col).dt.year().alias(f"{col}_year"))
31 |
32 | return expressions
33 |
34 |
35 | def basic_feature_engineering(
36 | data: pl.LazyFrame,
37 | ratios_config: Dict[str, List[str]],
38 | diffs_config: Dict[str, List[str]],
39 | dates_config: Dict[str, List[str]],
40 | ) -> pl.LazyFrame:
41 | ratio_expressions = ratio_features(ratios_config)
42 | date_diff_expressions = diff_features(diffs_config)
43 | date_expressions = date_features(dates_config)
44 |
45 | data = data.with_columns(
46 | ratio_expressions + date_diff_expressions + date_expressions
47 | )
48 | return data
49 |
50 |
51 | def build_channel_rolling(df: pl.LazyFrame, date_col: str, period: int) -> pl.LazyFrame:
52 | channel_aggs = (
53 | df.sort(date_col)
54 | .groupby_rolling(
55 | index_column=date_col,
56 | period=f"{period}d",
57 | by="channel_title",
58 |             closed="left",
59 | )
60 | .agg(
61 | pl.col("video_id")
62 | .n_unique()
63 | .alias(f"channel_num_trending_videos_last_{period}_days"),
64 | pl.col("days_in_trending")
65 | .max()
66 | .alias(f"channel_max_days_in_trending_{period}_days"),
67 | pl.col("days_in_trending")
68 | .mean()
69 | .alias(f"channel_avg_days_in_trending_{period}_days"),
70 | )
71 | .fill_null(0)
72 | )
73 |
74 | return channel_aggs
75 |
76 |
77 | def add_rolling_features(
78 | df: pl.LazyFrame, date_col: str, periods: List[int]
79 | ) -> pl.LazyFrame:
80 | for period in periods:
81 | rolling_features = build_channel_rolling(df, date_col, period)
82 | df = df.join(rolling_features, on=["channel_title", "first_day_in_trending"])
83 |
84 | return df
85 |
86 |
87 | def build_period_features(df: pl.LazyFrame, date_col: str, period: int) -> pl.LazyFrame:
88 | general_aggs = (
89 | df.sort(date_col)
90 | .groupby_dynamic(
91 | index_column=date_col,
92 | every="1d",
93 | period=f"{period}d",
94 | closed="left",
95 | )
96 | .agg(
97 | pl.col("video_id")
98 | .n_unique()
99 | .alias(f"general_num_trending_videos_last_{period}_days"),
100 | pl.col("days_in_trending")
101 | .max()
102 | .alias(f"general_max_days_in_trending_{period}_days"),
103 | pl.col("days_in_trending")
104 | .mean()
105 | .alias(f"general_avg_days_in_trending_{period}_days"),
106 | )
107 | .with_columns(
108 |         # shift so each date gets the previous period's aggregates (no look-ahead)
109 | pl.col(f"general_num_trending_videos_last_{period}_days").shift(period),
110 | pl.col(f"general_max_days_in_trending_{period}_days").shift(period),
111 | pl.col(f"general_avg_days_in_trending_{period}_days").shift(period),
112 | )
113 | .fill_null(0)
114 | )
115 |
116 | return general_aggs
117 |
118 |
119 | def add_period_features(
120 | df: pl.LazyFrame, date_col: str, periods: List[int]
121 | ) -> pl.LazyFrame:
122 | for period in periods:
123 | rolling_features = build_period_features(df, date_col, period)
124 | df = df.join(rolling_features, on=["first_day_in_trending"])
125 |
126 | return df
127 |
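As a quick illustration of how these expression builders compose, here is a minimal sketch on a tiny hand-made frame. The toy data is an assumption made for the example; the real wiring lives in data_preparation_pipeline.py, and the sketch assumes the functions above are in scope (e.g. imported from data_utils.feature_engineering).

```python
# Toy sketch only - illustrates the expression builders above on made-up data.
import datetime

import polars as pl

lf = pl.DataFrame(
    {
        "likes": [10, 40],
        "views": [100, 200],
        "trending_date": [datetime.date(2018, 1, 1), datetime.date(2018, 1, 2)],
    }
).lazy()

exprs = ratio_features({"likes_to_views": ["likes", "views"]}) + date_features(
    {"trending_date": ["weekday"]}
)

# Adds a float ratio column and an integer weekday column in one projection
print(lf.with_columns(exprs).collect())
```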
--------------------------------------------------------------------------------
/polars/data_utils/processing.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Dict, List
3 |
4 | import polars as pl
5 |
6 |
7 | def read_category_mappings(path: str) -> Dict[int, str]:
8 | with open(path, "r") as f:
9 | categories = json.load(f)
10 |
11 | id_to_category = {}
12 | for c in categories["items"]:
13 | id_to_category[int(c["id"])] = c["snippet"]["title"]
14 |
15 | return id_to_category
16 |
17 |
18 | def parse_dates(date_cols: Dict[str, str]) -> List[pl.Expr]:
19 | expressions = []
20 | for date_col, fmt in date_cols.items():
21 | expressions.append(pl.col(date_col).str.to_date(format=fmt))
22 |
23 | return expressions
24 |
25 |
26 | def map_dict_columns(
27 | mapping_cols: Dict[str, Dict[str | int, str | int]]
28 | ) -> List[pl.Expr]:
29 | expressions = []
30 | for col, mapping in mapping_cols.items():
31 | expressions.append(pl.col(col).map_dict(mapping))
32 | return expressions
33 |
34 |
35 | def clean_data(
36 | df: pl.LazyFrame,
37 | date_cols_config: Dict[str, str],
38 | mapping_cols_config: Dict[str, Dict[str | int, str | int]],
39 | ) -> pl.LazyFrame:
40 | parse_dates_expressions = parse_dates(date_cols=date_cols_config)
41 | mapping_expressions = map_dict_columns(mapping_cols_config)
42 |
43 | df = df.with_columns(parse_dates_expressions + mapping_expressions)
44 | return df
45 |
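A minimal sketch of how these helpers are combined on a lazily-scanned CSV. The paths and date formats are taken from pipe_config.yaml; treat this as illustrative rather than part of the pipeline script, and note it assumes the functions above are in scope.

```python
# Illustrative wiring of the cleaning helpers; paths/formats match pipe_config.yaml.
import polars as pl

id_to_category = read_category_mappings("./youtube/GB_category_id.json")

cleaned = pl.scan_csv("./youtube/GBvideos.csv").pipe(
    clean_data,
    date_cols_config={
        "trending_date": "%y.%d.%m",
        "publish_time": "%Y-%m-%dT%H:%M:%S%.fZ",
    },
    mapping_cols_config={"category_id": id_to_category},
)

# Materialise only a small preview of the lazy plan
print(cleaned.head(5).collect())
```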
--------------------------------------------------------------------------------
/polars/data_utils/transfomation.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | import polars as pl
4 |
5 |
6 | def join_original_features(
7 | main: pl.LazyFrame,
8 | original: pl.LazyFrame,
9 | main_join_cols: List[str],
10 | original_join_cols: List[str],
11 | other_cols: List[str],
12 | ) -> pl.LazyFrame:
13 | original_features = original.select(original_join_cols + other_cols).unique(
14 | original_join_cols
15 | ) # unique ensures one row per video + date
16 | main = main.join(
17 | original_features,
18 | left_on=main_join_cols,
19 | right_on=original_join_cols,
20 | how="left",
21 | )
22 |
23 | return main
24 |
25 |
26 | def create_target_df(
27 | df: pl.LazyFrame,
28 | time_to_trending_thr: int,
29 | original_join_cols: List[str],
30 | other_cols: List[str],
31 | ) -> pl.LazyFrame:
32 | # Create a DF with video ID per row and corresponding days to trending and days in trending (target)
33 | target = (
34 | df.groupby(["video_id"])
35 | .agg(
36 | pl.col("days_to_trending").min().dt.days(),
37 | pl.col("trending_date").min().dt.date().alias("first_day_in_trending"),
38 | pl.col("trending_date").max().dt.date().alias("last_day_in_trending"),
39 | (pl.col("trending_date").max() - pl.col("trending_date").min())
40 | .dt.days()
41 | .alias("days_in_trending"),
42 | )
43 | .filter(pl.col("days_to_trending") <= time_to_trending_thr)
44 | )
45 |
46 | # Join features to the aggregates
47 | target = join_original_features(
48 | main=target,
49 | original=df,
50 | main_join_cols=["video_id", "first_day_in_trending"],
51 | original_join_cols=original_join_cols,
52 | other_cols=other_cols,
53 | )
54 |
55 | return target
56 |
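The less obvious piece here is `join_original_features`, which deduplicates the original frame before joining so that each video/date pair contributes exactly one row of features. A small self-contained sketch with made-up rows (the column names mirror the pipeline's, the values are assumptions):

```python
# Toy demonstration of join_original_features; the rows below are made up.
import polars as pl

main = pl.DataFrame(
    {"video_id": ["a", "b"], "first_day_in_trending": ["2018-01-02", "2018-01-05"]}
).lazy()
original = pl.DataFrame(
    {
        "video_id": ["a", "a", "b"],
        "trending_date": ["2018-01-02", "2018-01-03", "2018-01-05"],
        "views": [100, 150, 50],
    }
).lazy()

joined = join_original_features(
    main=main,
    original=original,
    main_join_cols=["video_id", "first_day_in_trending"],
    original_join_cols=["video_id", "trending_date"],
    other_cols=["views"],
)
# One row per video, with the views recorded on its first trending day attached
print(joined.collect())
```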
--------------------------------------------------------------------------------
/polars/pipe_config.yaml:
--------------------------------------------------------------------------------
1 | data_path: "./youtube/GBvideos.csv"
2 | category_map_path: "./youtube/GB_category_id.json"
3 |
4 | # Pre-processing config
5 | date_column_format:
6 | trending_date: "%y.%d.%m"
7 | publish_time: "%Y-%m-%dT%H:%M:%S%.fZ"
8 |
9 | # Feature engineering config
10 | ratio_features:
11 | likes_to_dislikes:
12 | - likes
13 | - dislikes
14 | likes_to_views:
15 | - likes
16 | - views
17 | comments_to_views:
18 | - comment_count
19 | - views
20 |
21 | difference_features:
22 | days_to_trending:
23 | - trending_date
24 | - publish_time
25 |
26 | date_features:
27 | trending_date:
28 | - weekday
29 |
30 | # Filtering config
31 | max_time_to_trending: 60
32 |
33 | # Features config
34 | join_columns:
35 | - video_id
36 | - trending_date
37 |
38 | base_columns:
39 | - views
40 | - likes
41 | - dislikes
42 | - comment_count
43 | - comments_disabled
44 | - ratings_disabled
45 | - video_error_or_removed
46 | - likes_to_dislikes
47 | - likes_to_views
48 | - comments_to_views
49 | - trending_date_weekday
50 | - channel_title
51 | - tags
52 | - description
53 | - category_id
54 |
55 | aggregate_windows:
56 | - 7
57 | - 30
58 | - 180
59 |
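The pipeline script reads this file with PyYAML and passes the sub-dicts straight into the helpers above. A minimal sketch of that loading step (the relative path is an assumption; adjust it to wherever the config lives):

```python
# Sketch: loading the pipeline configuration with PyYAML.
import yaml

with open("pipe_config.yaml", "r") as f:
    pipe_config = yaml.safe_load(f)

print(pipe_config["ratio_features"]["likes_to_views"])  # ['likes', 'views']
print(pipe_config["aggregate_windows"])                 # [7, 30, 180]
```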
--------------------------------------------------------------------------------
/pyspark/cleaning.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 | from pyspark.sql import DataFrame
3 |
4 |
5 | def get_static(data: DataFrame, cols_to_analyse: list[str]) -> list[str]:
6 | """Return the list of static columns
7 |
8 | Args:
9 | data (DataFrame): input PySpark dataframe
10 | cols_to_analyse (list[str]): list of columns to analyse
11 |
12 | Returns:
13 | list[str]: list of static columns
14 | """
15 | unique_counts = data.agg(
16 | *(F.countDistinct(F.col(c)).alias(c) for c in cols_to_analyse)
17 | ).first()
18 | static_cols = [c for c in unique_counts.asDict() if unique_counts[c] == 1]
19 | print("Static columns:", static_cols)
20 | return static_cols
21 |
22 |
23 | def remove_rare_categories(
24 |     data: DataFrame, columns: list[str], min_count: int = 100
25 | ) -> DataFrame:
26 | """Removes rare categories in categorical features by substituting
27 | them with 'Other'
28 |
29 | Args:
30 | data (DataFrame): input PySpark dataframe
31 | columns (list[str]): list of categorical features to process
32 | min_count (int, optional): minimum number of times for category
33 | to appear to not be considered rare. Defaults to 100.
34 |
35 | Returns:
36 | DataFrame: processed PySpark dataframe
37 | """
38 | categorical_valid_values = {}
39 |
40 | for c in columns:
41 | # Find frequent values
42 | categorical_valid_values[c] = (
43 | data.groupby(c)
44 | .count()
45 | .filter(F.col("count") > min_count)
46 | .select(c)
47 | .toPandas()
48 | .values.ravel()
49 | )
50 |
51 | data = data.withColumn(
52 | c,
53 | F.when(
54 | F.col(c).isin(list(categorical_valid_values[c])), F.col(c)
55 | ).otherwise(F.lit("Other").alias(c)),
56 | )
57 |
58 | return data
59 |
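A quick, self-contained check of these two helpers on a toy DataFrame. The rows and the local SparkSession are assumptions made for the example; they are not part of the pipeline.

```python
# Toy sanity check for get_static / remove_rare_categories; data is made up.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("cleaning-demo").getOrCreate()

toy = spark.createDataFrame(
    [("tcp", "a"), ("tcp", "b"), ("udp", "c"), ("icmp", "d")],
    ["proto", "uid"],
)

print(get_static(toy, ["proto", "uid"]))  # [] - neither column is constant

# With min_count=1, categories seen only once ("udp", "icmp") become "Other"
remove_rare_categories(toy, ["proto"], min_count=1).show()
```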
--------------------------------------------------------------------------------
/pyspark/conda_env.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | dependencies:
3 | - python=3.10
4 | - pyyaml=6.0.1
5 |   - mmh3=4.0.1
6 |   - numpy=1.26.2
7 |   - pandas=2.0.3
8 |   - pyspark=3.5.0
9 |   - hyperopt=0.2.7
10 | name: iot
--------------------------------------------------------------------------------
/pyspark/config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | categorical_features:
3 | - proto
4 | - service
5 | - conn_state
6 | - history
7 | filepaths:
8 | - ./iot_malware/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv
9 | model_output_path: ./pipeline
10 | na_fill_vals:
11 | conn_state: missing
12 | duration: -999999
13 | history: missing
14 | orig_bytes: -999999
15 | orig_ip_bytes: -999999
16 | orig_pkts: -999999
17 | proto: missing
18 | resp_bytes: -999999
19 | resp_ip_bytes: -999999
20 | resp_pkts: -999999
21 | service: missing
22 | numerical_features:
23 | - duration
24 | - orig_bytes
25 | - resp_bytes
26 | - missed_bytes
27 | - orig_pkts
28 | - orig_ip_bytes
29 | - resp_pkts
30 | - resp_ip_bytes
31 | random_split: true
32 | tuning_rounds: 0
33 | # - ./iot_malware/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv
34 | # - ./iot_malware/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv
35 | # - ./iot_malware/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv
36 | # - ./iot_malware/CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv
37 | # - ./iot_malware/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv
38 | # - ./iot_malware/CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv
39 | # - ./iot_malware/CTU-IoT-Malware-Capture-48-1conn.log.labeled.csv
40 | # - ./iot_malware/CTU-IoT-Malware-Capture-60-1conn.log.labeled.csv
41 | # - ./iot_malware/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv
--------------------------------------------------------------------------------
/pyspark/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 | from pyspark.sql import Column, Window, WindowSpec
3 |
4 |
5 | def mins_to_secs(mins: int) -> int:
6 | """Transforms minutes to seconds
7 |
8 | Args:
9 | mins (int): number of minutes to be transformed
10 |
11 | Returns:
12 |         int: number of seconds
13 | """
14 | return mins * 60
15 |
16 |
17 | def generate_window(
18 | window_in_minutes: int, partition_by: str, timestamp_col: str
19 | ) -> WindowSpec:
20 | """Generates window expressions for PySpark
21 |
22 | Args:
23 | window_in_minutes (int): Number of minutes you want in the rolling window
24 | partition_by (str): Column to partition by e.g. IP or user account
25 | timestamp_col (str): Column with timestamp data type
26 |
27 | Returns:
28 |         WindowSpec: range-based window covering the preceding window_in_minutes minutes
29 | """
30 | window = (
31 | Window()
32 | .partitionBy(F.col(partition_by))
33 | .orderBy(F.col(timestamp_col).cast("long"))
34 | .rangeBetween(-mins_to_secs(window_in_minutes), -1)
35 | )
36 |
37 | return window
38 |
39 |
40 | def generate_rolling_aggregate(
41 | col: str,
42 | partition_by: str | None = None,
43 | operation: str = "count",
44 | timestamp_col: str = "dt",
45 | window_in_minutes: int = 1,
46 | ) -> Column:
47 |     """Rolling aggregate expression constructor
48 |
49 | Args:
50 | col (str): Name of column to aggregate
51 | partition_by (str | None, optional): Column to partition by. Defaults to None.
52 | operation (str, optional): What type of aggregation should be done. Defaults to "count".
53 | timestamp_col (str, optional): Timestamp column in your PySpark DF. Defaults to "dt".
54 | window_in_minutes (int, optional): Number of minutes for the window. Defaults to 1.
55 |
56 | Raises:
57 |         ValueError: if the requested operation is not supported
58 | 
59 |     Returns:
60 |         Column: rolling aggregate expression over the requested window
61 | """
62 | if partition_by is None:
63 | partition_by = col
64 |
65 | match operation:
66 | case "count":
67 | return F.count(col).over(
68 | generate_window(
69 | window_in_minutes=window_in_minutes,
70 |                     partition_by=partition_by,
71 | timestamp_col=timestamp_col,
72 | )
73 | )
74 | case "sum":
75 | return F.sum(col).over(
76 | generate_window(
77 | window_in_minutes=window_in_minutes,
78 |                     partition_by=partition_by,
79 | timestamp_col=timestamp_col,
80 | )
81 | )
82 | case "avg":
83 | return F.avg(col).over(
84 | generate_window(
85 | window_in_minutes=window_in_minutes,
86 |                     partition_by=partition_by,
87 | timestamp_col=timestamp_col,
88 | )
89 | )
90 | case _:
91 | raise ValueError(f"Operation {operation} is not defined")
92 |
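A short sketch of how `generate_rolling_aggregate` is applied. The toy events and the local session are assumptions for illustration; the real wiring lives in pipe.py.

```python
# Toy example of the rolling window expressions; rows and timestamps are made up.
from datetime import datetime

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rolling-demo").getOrCreate()

events = spark.createDataFrame(
    [
        ("10.0.0.1", datetime(2023, 1, 1, 0, 0, 10), 5),
        ("10.0.0.1", datetime(2023, 1, 1, 0, 0, 40), 7),
        ("10.0.0.2", datetime(2023, 1, 1, 0, 0, 50), 1),
    ],
    ["source_ip", "dt", "orig_pkts"],
)

events.withColumns(
    {
        # How many events this IP produced in the preceding minute
        "source_ip_count_last_min": generate_rolling_aggregate(
            col="source_ip", operation="count", timestamp_col="dt", window_in_minutes=1
        ),
        # Average packets per event for this IP in the preceding minute
        "source_ip_avg_pkts_last_min": generate_rolling_aggregate(
            col="orig_pkts",
            partition_by="source_ip",
            operation="avg",
            timestamp_col="dt",
            window_in_minutes=1,
        ),
    }
).show()
```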
--------------------------------------------------------------------------------
/pyspark/gcs_config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | categorical_features:
3 | - proto
4 | - service
5 | - conn_state
6 | - history
7 | filepaths:
8 | - gs://iot-data-demo/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv
9 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv
10 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv
11 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv
12 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv
13 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv
14 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv
15 | # - gs://iot-data-demo/CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv
16 | model_output_path: gs://iot-data-demo/best_pipeline
17 | na_fill_vals:
18 | conn_state: missing
19 | duration: -999999
20 | history: missing
21 | orig_bytes: -999999
22 | orig_ip_bytes: -999999
23 | orig_pkts: -999999
24 | proto: missing
25 | resp_bytes: -999999
26 | resp_ip_bytes: -999999
27 | resp_pkts: -999999
28 | service: missing
29 | numerical_features:
30 | - duration
31 | - orig_bytes
32 | - resp_bytes
33 | - missed_bytes
34 | - orig_pkts
35 | - orig_ip_bytes
36 | - resp_pkts
37 | - resp_ip_bytes
38 | random_split: true
39 | tuning_rounds: 0
--------------------------------------------------------------------------------
/pyspark/ml_prep.py:
--------------------------------------------------------------------------------
1 | import mmh3
2 | import pyspark.sql.functions as F
3 | from pyspark.sql import DataFrame
4 | from pyspark.sql.types import LongType
5 |
6 |
7 | @F.udf(returnType=LongType())
8 | def hash_udf(x):
9 | return mmh3.hash64(str(x))[0]
10 |
11 |
12 | def hash_split(
13 | data: DataFrame, col: str, test_size: float = 0.2
14 | ) -> tuple[DataFrame, DataFrame]:
15 | data = data.withColumn("hash", hash_udf(F.col(col)))
16 |
17 |     # Split at the test_size quantile of the hash (80/20 by default)
18 | train_thr = data.approxQuantile(
19 | "hash", probabilities=[test_size], relativeError=0.01
20 | )[0]
21 | train = data.where(F.col("hash") >= train_thr).drop("hash")
22 | test = data.where(F.col("hash") < train_thr).drop("hash")
23 |
24 | return train, test
25 |
26 |
27 | def ip_based_split(
28 | data: DataFrame, col: str, test_size: float = 0.2
29 | ) -> tuple[DataFrame, DataFrame]:
30 | # Get list of IPs with > 20% malicious activity
31 | bad_ips = (
32 | data.groupby("source_ip")
33 | .agg(F.avg(F.col("is_bad")).alias("bad_avg"))
34 | .where(F.col("bad_avg") > 0.2)
35 | .select("source_ip")
36 | .toPandas()
37 | .values.ravel()
38 | )
39 | bad_ips = list(bad_ips)
40 | print(bad_ips)
41 |
42 | data = data.withColumn("ip_hash", hash_udf(F.col("source_ip")))
43 |
44 |     # Separate good and bad IPs
45 | good_df = data.where(~F.col("source_ip").isin(bad_ips))
46 | bad_df = data.where(F.col("source_ip").isin(bad_ips))
47 | print("Original Sizes")
48 | print("Good", good_df.count())
49 | print("Bad", bad_df.count())
50 |
51 | # 80/20 split
52 | good_train, good_test = hash_split(good_df, col, test_size)
53 | print("Good data", good_train.count(), good_test.count())
54 | bad_train, bad_test = hash_split(bad_df, col, test_size)
55 | print("Bad data", bad_train.count(), bad_test.count())
56 |
57 | train = good_train.union(bad_train)
58 | test = good_test.union(bad_test)
59 |
60 | return train, test
61 |
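A minimal sketch of the hash-based split on a stand-in key column. The synthetic data and local session are assumptions for the example; mmh3 must be installed for the UDF to work.

```python
# Toy demonstration of hash_split; "source_ip" is just a stand-in key column here.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("split-demo").getOrCreate()

df = spark.range(1000).withColumn("source_ip", F.col("id").cast("string")).drop("id")

train, test = hash_split(df, col="source_ip", test_size=0.2)
# Roughly an 80/20 split; deterministic across reruns because the hash is stable
print(train.count(), test.count())
```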
--------------------------------------------------------------------------------
/pyspark/pipe.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 | import yaml
3 | from hyperopt import hp
4 | from pyspark.ml import Pipeline
5 | from pyspark.ml.classification import RandomForestClassifier
6 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
7 | from pyspark.ml.feature import StringIndexer, VectorAssembler
8 | from pyspark.sql import SparkSession
9 |
10 | from cleaning import get_static, remove_rare_categories
11 | from feature_engineering import generate_rolling_aggregate
12 | from ml_prep import ip_based_split
13 | from tuning import tune_rf
14 |
15 | # Read and set configs
16 | with open("gcs_config.yaml", "r") as file:
17 | conf = yaml.safe_load(file)
18 |
19 | numerical_features: list[str] = conf["numerical_features"]
20 | categorical_features: list[str] = conf["categorical_features"]
21 |
22 | spark = SparkSession.builder.appName("LocalTest").getOrCreate()
23 | spark.sparkContext.setLogLevel("WARN")
24 |
25 | # Read in and do some basic processing
26 | df = (
27 | spark.read.option("delimiter", "|")
28 | .csv(conf["filepaths"], inferSchema=True, header=True)
29 | .withColumns(
30 | {
31 | "is_bad": F.when(F.col("label") != "Benign", 1).otherwise(0),
32 | "dt": F.to_timestamp(F.from_unixtime("ts")),
33 | }
34 | )
35 | .withColumnsRenamed(
36 | {
37 | "id.orig_h": "source_ip",
38 | "id.orig_p": "source_port",
39 | "id.resp_h": "dest_ip",
40 | "id.resp_p": "dest_port",
41 | }
42 | )
43 | .withColumns({n: F.col(n).cast("double") for n in numerical_features})
44 | .replace("-", None)
45 | .fillna(conf["na_fill_vals"])
46 | )
47 |
48 | # Find and drop static columns
49 | static_numerical = get_static(df, numerical_features)
50 | static_categorical = get_static(df, categorical_features)
51 | numerical_features = [f for f in numerical_features if f not in static_numerical]
52 | categorical_features = [f for f in categorical_features if f not in static_categorical]
53 | categorical_features_indexed = [c + "_ind" for c in categorical_features]
54 | input_features = numerical_features + categorical_features_indexed
55 |
56 | # Process categorical
57 | df = remove_rare_categories(
58 | df.drop(*static_numerical + static_categorical), categorical_features, min_count=100
59 | )
60 |
61 | # Feature engineering
62 | df = df.withColumns(
63 | {
64 | "source_ip_count_last_min": generate_rolling_aggregate(
65 | col="source_ip", operation="count", timestamp_col="dt", window_in_minutes=1
66 | ),
67 | "source_ip_count_last_30_mins": generate_rolling_aggregate(
68 | col="source_ip", operation="count", timestamp_col="dt", window_in_minutes=30
69 | ),
70 | "source_port_count_last_min": generate_rolling_aggregate(
71 | col="source_port",
72 | operation="count",
73 | timestamp_col="dt",
74 | window_in_minutes=1,
75 | ),
76 | "source_port_count_last_30_mins": generate_rolling_aggregate(
77 | col="source_port",
78 | operation="count",
79 | timestamp_col="dt",
80 | window_in_minutes=30,
81 | ),
82 | "source_ip_avg_pkts_last_min": generate_rolling_aggregate(
83 | col="orig_pkts",
84 | partition_by="source_ip",
85 | operation="avg",
86 | timestamp_col="dt",
87 | window_in_minutes=1,
88 | ),
89 | "source_ip_avg_pkts_last_30_mins": generate_rolling_aggregate(
90 | col="orig_pkts",
91 | partition_by="source_ip",
92 | operation="avg",
93 | timestamp_col="dt",
94 | window_in_minutes=30,
95 | ),
96 | "source_ip_avg_bytes_last_min": generate_rolling_aggregate(
97 | col="orig_ip_bytes",
98 | partition_by="source_ip",
99 | operation="avg",
100 | timestamp_col="dt",
101 | window_in_minutes=1,
102 | ),
103 | "source_ip_avg_bytes_last_30_mins": generate_rolling_aggregate(
104 | col="orig_ip_bytes",
105 | partition_by="source_ip",
106 | operation="avg",
107 | timestamp_col="dt",
108 | window_in_minutes=30,
109 | ),
110 | }
111 | )
112 |
113 | if conf["random_split"]:
114 | df_train, df_test = df.randomSplit(weights=[0.8, 0.2], seed=200)
115 | else:
116 | df_train, df_test = ip_based_split(df, "source_ip", 0.2)
117 |
118 | roc = BinaryClassificationEvaluator(labelCol="is_bad", metricName="areaUnderROC")
119 | ind = StringIndexer(
120 | inputCols=categorical_features,
121 | outputCols=categorical_features_indexed,
122 | handleInvalid="skip",
123 | )
124 | va = VectorAssembler(
125 | inputCols=input_features, outputCol="features", handleInvalid="skip"
126 | )
127 |
128 | if conf["tuning_rounds"] > 0:
129 | df_train, df_val = df_train.randomSplit(weights=[0.8, 0.2], seed=200)
130 | search_space = {
131 | "numTrees": hp.uniformint("numTrees", 10, 500),
132 | "maxDepth": hp.uniformint("maxDepth", 2, 10),
133 | }
134 | print(f"Tuning the model for {conf['tuning_rounds']} rounds")
135 | best_params = tune_rf(
136 | train=df_train,
137 | val=df_val,
138 | string_indexer=ind,
139 | vector_assembler=va,
140 | evaluator=roc,
141 | param_grid=search_space,
142 | tuning_rounds=conf["tuning_rounds"],
143 | )
144 | else:
145 | print("Skipping the tuning...")
146 | best_params = {"numTrees": 10, "maxDepth": 4}
147 |
148 | best_rf = RandomForestClassifier(
149 | featuresCol="features",
150 | labelCol="is_bad",
151 | numTrees=best_params["numTrees"],
152 | maxDepth=best_params["maxDepth"],
153 | )
154 |
155 | best_pipeline = Pipeline(stages=[ind, va, best_rf])
156 | best_pipeline = best_pipeline.fit(df_train)
157 | test_preds = best_pipeline.transform(df_test)
158 |
159 | score = roc.evaluate(test_preds)
160 | print("ROC AUC", score)
161 | best_pipeline.save(conf["model_output_path"])
162 |
--------------------------------------------------------------------------------
/pyspark/spark_feature_engineering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# PySpark Project Step-by-Step: Part 2\n",
8 | "\n",
9 | "This notebook will walk you through 2 more steps in the ML lifecycle - **Feature Engineering** and **Model Fitting & Evaluation**.\n",
10 | "* In the feature engineering part you'll see how to perform common aggregates using analytical functions.\n",
11 | "* In the modelling part you'll see how to prepare your data for modelling in PySpark, and how to fit a model using MLLib.\n",
12 | "* Finally, we'll see how we can evaluate the model we've built."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "from pyspark.sql import SparkSession\n",
22 | "from pyspark.sql import Window\n",
23 | "import pyspark.sql.functions as F\n",
24 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
25 | "from pyspark.ml import Pipeline\n",
26 | "from pyspark.ml.classification import RandomForestClassifier"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "spark = (\n",
36 | " SparkSession.builder.appName(\"iot\")\n",
37 | " .getOrCreate()\n",
38 | ")\n",
39 | "spark.sparkContext.setLogLevel(\"ERROR\")"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "## Read Data"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "df = spark.read.parquet(\"processed.pq\").withColumn(\n",
56 | " \"is_bad\", F.when(F.col(\"label\") != \"Benign\", 1).otherwise(0)\n",
57 | ")\n",
58 | "df.show(5)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Feature Engineering\n",
66 | "\n",
67 | "Since we have a time-component to this data, we can engineer all sorts of rolling features. The ones that I'll cover here are:\n",
68 | "* Number of times we've seen this source IP in the last minute\n",
69 | "* Number of times we've seen this destination IP in the last minute\n",
70 | "* Number of times we've seen this source PORT in the last minute\n",
71 | "* Number of times we've seen this destination PORT in the last minute\n",
72 | "\n",
73 | "To calculate these features, we'll need to use analytical functions. "
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "def mins_to_secs(mins):\n",
83 | " return mins * 60\n",
84 | "\n",
85 | "\n",
86 | "def generate_window(window_in_minutes: int, partition_by: str, timestamp_col: str):\n",
87 | " window = (\n",
88 | " Window()\n",
89 | " .partitionBy(F.col(partition_by))\n",
90 | " .orderBy(F.col(timestamp_col).cast(\"long\"))\n",
91 | " .rangeBetween(-mins_to_secs(window_in_minutes), -1)\n",
92 | " )\n",
93 | "\n",
94 | " return window\n",
95 | "\n",
96 | "\n",
97 | "def generate_rolling_aggregate(\n",
98 | " col: str,\n",
99 | " partition_by: str | None = None,\n",
100 | " operation: str = \"count\",\n",
101 | " timestamp_col: str = \"dt\",\n",
102 | " window_in_minutes: int = 1,\n",
103 | "):\n",
104 | " if partition_by is None:\n",
105 | " partition_by = col\n",
106 | "\n",
107 | " match operation:\n",
108 | " case \"count\":\n",
109 | " return F.count(col).over(\n",
110 | " generate_window(\n",
111 | " window_in_minutes=window_in_minutes,\n",
112 | "                    partition_by=partition_by,\n",
113 | " timestamp_col=timestamp_col,\n",
114 | " )\n",
115 | " )\n",
116 | " case \"sum\":\n",
117 | " return F.sum(col).over(\n",
118 | " generate_window(\n",
119 | " window_in_minutes=window_in_minutes,\n",
120 | "                    partition_by=partition_by,\n",
121 | " timestamp_col=timestamp_col,\n",
122 | " )\n",
123 | " )\n",
124 | " case \"avg\":\n",
125 | " return F.avg(col).over(\n",
126 | " generate_window(\n",
127 | " window_in_minutes=window_in_minutes,\n",
128 | "                    partition_by=partition_by,\n",
129 | " timestamp_col=timestamp_col,\n",
130 | " )\n",
131 | " )\n",
132 | " case _:\n",
133 | " raise ValueError(f\"Operation {operation} is not defined\")"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "### Generate Rolling Count Features"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "Due to the nicely defined functions above, generating rolling averages and counts is a piece of cake!"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "df = df.withColumns({\n",
157 | " \"source_ip_count_last_min\": generate_rolling_aggregate(col=\"source_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n",
158 | " \"source_ip_count_last_30_mins\": generate_rolling_aggregate(col=\"source_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n",
159 | " \"source_port_count_last_min\": generate_rolling_aggregate(col=\"source_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n",
160 | " \"source_port_count_last_30_mins\": generate_rolling_aggregate(col=\"source_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n",
161 | " \"dest_ip_count_last_min\": generate_rolling_aggregate(col=\"dest_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n",
162 | " \"dest_ip_count_last_30_mins\": generate_rolling_aggregate(col=\"dest_ip\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n",
163 | " \"dest_port_count_last_min\": generate_rolling_aggregate(col=\"dest_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=1),\n",
164 | " \"dest_port_count_last_30_mins\": generate_rolling_aggregate(col=\"dest_port\", operation=\"count\", timestamp_col=\"dt\", window_in_minutes=30),\n",
165 | " \"source_ip_avg_pkts_last_min\": generate_rolling_aggregate(col=\"orig_pkts\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=1),\n",
166 | " \"source_ip_avg_pkts_last_30_mins\": generate_rolling_aggregate(col=\"orig_pkts\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=30),\n",
167 | " \"source_ip_avg_bytes_last_min\": generate_rolling_aggregate(col=\"orig_ip_bytes\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=1),\n",
168 | " \"source_ip_avg_bytes_last_30_mins\": generate_rolling_aggregate(col=\"orig_ip_bytes\", partition_by=\"source_ip\", operation=\"avg\", timestamp_col=\"dt\", window_in_minutes=30),\n",
169 | "})"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "df.show(5)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "Now, execute and save the resulting table into a new parquet file."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "df.write.mode(\"overwrite\").parquet(\"feature_engineered.pq\")"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "df_fe = spark.read.parquet(\"feature_engineered.pq\")"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "Let's compare the speed of calling the old `df` vs the new `df_fe`..."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "df_fe.show(10)"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "Such a drastic difference exists because calling `df.show()` re-executes all of the expensive operations we defined, whereas `df_fe` simply reads the already-written parquet file. It's therefore better to continue the analysis from the new dataframe."
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "## Preprocessing"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "df_fe.columns[:5]"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "numerical_features = [\n",
252 | " \"duration\",\n",
253 | " \"orig_bytes\",\n",
254 | " \"resp_bytes\",\n",
255 | " \"orig_pkts\",\n",
256 | " \"orig_ip_bytes\",\n",
257 | " \"resp_pkts\",\n",
258 | " \"resp_ip_bytes\",\n",
259 | " \"source_ip_count_last_min\",\n",
260 | " \"source_ip_count_last_30_mins\",\n",
261 | " \"source_port_count_last_min\",\n",
262 | " \"source_port_count_last_30_mins\",\n",
263 | " # \"dest_ip_count_last_min\",\n",
264 | " # \"dest_ip_count_last_30_mins\",\n",
265 | " # \"dest_port_count_last_min\",\n",
266 | " # \"dest_port_count_last_30_mins\",\n",
267 | " \"source_ip_avg_pkts_last_min\",\n",
268 | " \"source_ip_avg_pkts_last_30_mins\",\n",
269 | " \"source_ip_avg_bytes_last_min\",\n",
270 | " \"source_ip_avg_bytes_last_30_mins\",\n",
271 | "]\n",
272 | "categorical_features = [\"proto\", \"service\", \"conn_state\", \"history\"]\n",
273 | "categorical_features_indexed = [c + \"_index\" for c in categorical_features]\n",
274 | "\n",
275 | "input_features = numerical_features + categorical_features_indexed"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "### Remove rare categories"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "df_fe.select([F.count_distinct(c) for c in categorical_features]).show()"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "categorical_valid_values = {}\n",
301 | "\n",
302 | "for c in categorical_features:\n",
303 | " # Find frequent values\n",
304 | " categorical_valid_values[c] = (\n",
305 | " df_fe.groupby(c)\n",
306 | " .count()\n",
307 | " .filter(F.col(\"count\") > 100)\n",
308 | " .select(c)\n",
309 | " .toPandas()\n",
310 | " .values.ravel()\n",
311 | " )\n",
312 | "\n",
313 | " df_fe = df_fe.withColumn(\n",
314 | " c,\n",
315 | " F.when(F.col(c).isin(list(categorical_valid_values[c])), F.col(c)).otherwise(\n",
316 | " F.lit(\"Other\").alias(c)\n",
317 | " ),\n",
318 | " )"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "df_fe.select([F.count_distinct(c) for c in categorical_features]).show()"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "## Train/Test Split\n",
335 | "The train/test split needs to be done by source IP address, otherwise we risk leaking data between the two sets. The best way to do this is to split the IP addresses at random and then filter the dataframe by IP address."
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "df_fe.groupby(\"source_ip\").agg(F.sum(F.col(\"is_bad\")).alias(\"bad_sum\")).orderBy(\"bad_sum\", ascending=False).show(5)"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "# Training non-malicious IPs (80%)\n",
354 | "train_ips = (\n",
355 | " df_fe.where(\n",
356 | " ~F.col(\"source_ip\").isin([\"192.168.100.103\", \"192.168.2.5\", \"192.168.2.1\"])\n",
357 | " )\n",
358 | " .select(F.col(\"source_ip\"), F.lit(1).alias(\"is_train\"))\n",
359 | " .dropDuplicates()\n",
360 | " .sample(0.8)\n",
361 | ")\n",
362 | "\n",
363 | "\n",
364 | "df_fe = df_fe.join(train_ips, \"source_ip\", \"left\")\n",
365 | "\n",
366 | "# Add 1 malicious IP to training and testing data\n",
367 | "df_train = df_fe.where((F.col(\"is_train\") == 1) | (F.col(\"source_ip\") == \"192.168.100.103\"))\n",
368 | "df_test = df_fe.where(((F.col(\"is_train\").isNull()) & (~F.col(\"source_ip\").isin([\"192.168.100.103\", \"192.168.2.1\"]))) | (F.col(\"source_ip\") == \"192.168.2.5\"))"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "## Pipeline"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "ind = StringIndexer(inputCols=categorical_features, outputCols=categorical_features_indexed, handleInvalid='skip')\n",
385 | "va = VectorAssembler(inputCols=input_features, outputCol=\"features\", handleInvalid='skip' )\n",
386 | "rf = RandomForestClassifier(featuresCol=\"features\", labelCol=\"is_bad\", numTrees=100)\n",
387 | "\n",
388 | "pipeline = Pipeline(stages=[ind, va, rf])"
389 | ]
390 | },
391 | {
392 | "cell_type": "markdown",
393 | "metadata": {},
394 | "source": [
395 | "## Fit and Predict"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "pipeline = pipeline.fit(df_train)\n",
405 | "test_preds = pipeline.transform(df_test)"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {},
411 | "source": [
412 | "## Evaluate"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
422 | "\n",
423 | "roc = BinaryClassificationEvaluator(labelCol=\"is_bad\", metricName=\"areaUnderROC\")\n",
424 | "print(\"ROC AUC\", roc.evaluate(test_preds))\n",
425 | "\n",
426 | "pr = BinaryClassificationEvaluator(labelCol=\"is_bad\", metricName=\"areaUnderPR\")\n",
427 | "print(\"PR AUC\", pr.evaluate(test_preds))"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "metadata": {},
434 | "outputs": [],
435 | "source": [
436 | "import pandas as pd\n",
437 | "\n",
438 | "pd.DataFrame(\n",
439 | " {\n",
440 | " \"importance\": list(pipeline.stages[-1].featureImportances),\n",
441 | " \"feature\": pipeline.stages[-2].getInputCols(),\n",
442 | " }\n",
443 | ").sort_values(\"importance\", ascending=False)"
444 | ]
445 | },
446 | {
447 | "cell_type": "markdown",
448 | "metadata": {},
449 | "source": [
450 | "## Export"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {},
457 | "outputs": [],
458 | "source": [
459 | "pipeline.stages[-1].save(\"rf_basic\")"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "pipeline.save(\"pipeline_basic\")"
469 | ]
470 | }
471 | ],
472 | "metadata": {
473 | "kernelspec": {
474 | "display_name": "dev",
475 | "language": "python",
476 | "name": "python3"
477 | },
478 | "language_info": {
479 | "codemirror_mode": {
480 | "name": "ipython",
481 | "version": 3
482 | },
483 | "file_extension": ".py",
484 | "mimetype": "text/x-python",
485 | "name": "python",
486 | "nbconvert_exporter": "python",
487 | "pygments_lexer": "ipython3",
488 | "version": "3.10.13"
489 | }
490 | },
491 | "nbformat": 4,
492 | "nbformat_minor": 2
493 | }
494 |
--------------------------------------------------------------------------------
/pyspark/spark_hp_tuning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession\n",
10 | "from pyspark.sql import Window\n",
11 | "import pyspark.sql.functions as F\n",
12 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
13 | "from pyspark.ml import Pipeline\n",
14 | "from pyspark.ml.classification import RandomForestClassifier\n",
15 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
16 | "\n",
17 | "from hyperopt import fmin, tpe, hp, STATUS_OK, Trials"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Start Session"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "spark = (\n",
34 | " SparkSession.builder.appName(\"iot\")\n",
35 | " .getOrCreate()\n",
36 | ")\n",
37 | "spark.sparkContext.setLogLevel(\"ERROR\")"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Read data"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "df = spark.read.parquet(\"feature_engineered.pq\")"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "df.show(5)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "numerical_features = [\n",
72 | " \"duration\",\n",
73 | " \"orig_bytes\",\n",
74 | " \"resp_bytes\",\n",
75 | " \"orig_pkts\",\n",
76 | " \"orig_ip_bytes\",\n",
77 | " \"resp_pkts\",\n",
78 | " \"resp_ip_bytes\",\n",
79 | " \"source_ip_count_last_min\",\n",
80 | " \"source_ip_count_last_30_mins\",\n",
81 | " \"source_port_count_last_min\",\n",
82 | " \"source_port_count_last_30_mins\",\n",
83 | " \"source_ip_avg_pkts_last_min\",\n",
84 | " \"source_ip_avg_pkts_last_30_mins\",\n",
85 | " \"source_ip_avg_bytes_last_min\",\n",
86 | " \"source_ip_avg_bytes_last_30_mins\",\n",
87 | "]\n",
88 | "categorical_features = [\"proto\", \"service\", \"conn_state\", \"history\"]\n",
89 | "categorical_features_indexed = [c + \"_index\" for c in categorical_features]\n",
90 | "\n",
91 | "input_features = numerical_features + categorical_features_indexed"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "categorical_valid_values = {}\n",
101 | "\n",
102 | "for c in categorical_features:\n",
103 | " # Find frequent values\n",
104 | " categorical_valid_values[c] = (\n",
105 | " df.groupby(c)\n",
106 | " .count()\n",
107 | " .filter(F.col(\"count\") > 100)\n",
108 | " .select(c)\n",
109 | " .toPandas()\n",
110 | " .values.ravel()\n",
111 | " )\n",
112 | "\n",
113 | "    df = df.withColumn(\n",
114 | " c,\n",
115 | " F.when(F.col(c).isin(list(categorical_valid_values[c])), F.col(c)).otherwise(\n",
116 | " F.lit(\"Other\").alias(c)\n",
117 | " ),\n",
118 | " )"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "df_train, df_test = df.randomSplit(weights=[0.8, 0.2], seed=42)\n",
128 | "df_train, df_val = df_train.randomSplit(weights=[0.8, 0.2], seed=42)"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "## HP Tuning"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "from tuning import tune_rf"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "search_space = {\n",
154 | " \"numTrees\": hp.uniformint(\"numTrees\", 10, 500),\n",
155 | " \"maxDepth\": hp.uniformint(\"maxDepth\", 2, 10),\n",
156 | "}\n",
157 | "\n",
158 | "roc = BinaryClassificationEvaluator(labelCol=\"is_bad\", metricName=\"areaUnderROC\")\n",
159 | "\n",
160 | "ind = StringIndexer(\n",
161 | " inputCols=categorical_features,\n",
162 | " outputCols=categorical_features_indexed,\n",
163 | " handleInvalid=\"skip\",\n",
164 | ")\n",
165 | "va = VectorAssembler(\n",
166 | " inputCols=input_features, outputCol=\"features\", handleInvalid=\"skip\"\n",
167 | ")\n",
168 | "\n",
169 | "best_params = tune_rf(df_train, df_val, ind, va, roc, search_space)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "best_rf = RandomForestClassifier(\n",
179 | " featuresCol=\"features\",\n",
180 | " labelCol=\"is_bad\",\n",
181 | " numTrees=best_params[\"numTrees\"],\n",
182 | " maxDepth=best_params[\"maxDepth\"],\n",
183 | ")\n",
184 | "\n",
185 | "best_pipeline = Pipeline(stages=[ind, va, best_rf])\n",
186 | "\n",
187 | "best_pipeline = best_pipeline.fit(df_train)\n",
188 | "test_preds = best_pipeline.transform(df_test)\n",
189 | "\n",
190 | "score = roc.evaluate(test_preds)\n",
191 | "score"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "best_pipeline.save(\"best_pipeline\")"
201 | ]
202 | }
203 | ],
204 | "metadata": {
205 | "kernelspec": {
206 | "display_name": "dev",
207 | "language": "python",
208 | "name": "python3"
209 | },
210 | "language_info": {
211 | "codemirror_mode": {
212 | "name": "ipython",
213 | "version": 3
214 | },
215 | "file_extension": ".py",
216 | "mimetype": "text/x-python",
217 | "name": "python",
218 | "nbconvert_exporter": "python",
219 | "pygments_lexer": "ipython3",
220 | "version": "3.10.13"
221 | }
222 | },
223 | "nbformat": 4,
224 | "nbformat_minor": 2
225 | }
226 |
--------------------------------------------------------------------------------
/pyspark/spark_intro.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Imports"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from pyspark.sql import SparkSession\n",
17 | "from pyspark.sql.functions import (\n",
18 | " from_unixtime,\n",
19 | " to_timestamp,\n",
20 | " min,\n",
21 | " max,\n",
22 | " sum,\n",
23 | " avg,\n",
24 | " col,\n",
25 | " countDistinct,\n",
26 | " broadcast,\n",
27 | " date_trunc,\n",
28 | " count,\n",
29 | ")\n",
30 | "from pyspark.sql import Window\n",
31 | "import pyspark.sql.functions as F\n",
32 | "import plotly.express as px"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "# Read Files"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "filepaths = [\"./iot_malware/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv\", \"./iot_malware/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv\"]\n",
49 | "\n",
50 | "\n",
51 | "spark = (\n",
52 | " SparkSession.builder.appName(\"iot\")\n",
53 | " .getOrCreate()\n",
54 | ")\n",
55 | "spark.sparkContext.setLogLevel(\"ERROR\")\n",
56 | "spark.sparkContext.version"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "df = spark.read.option(\"delimiter\", \"|\").csv(filepaths, inferSchema = True, header = True)\n",
66 | "df.show(5)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "df.printSchema()"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Pre-processing"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "df = df.withColumn(\"dt\", from_unixtime(\"ts\")).withColumn(\"dt\", to_timestamp(\"dt\"))"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "df = df.withColumnsRenamed(\n",
101 | " {\n",
102 | " \"id.orig_h\": \"source_ip\",\n",
103 | " \"id.orig_p\": \"source_port\",\n",
104 | " \"id.resp_h\": \"dest_ip\",\n",
105 | " \"id.resp_p\": \"dest_port\",\n",
106 | " }\n",
107 | ")"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## Dataset Quality Checks"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "### Min, Max datetime"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "df.agg(\n",
131 | " min(\"dt\").alias(\"min_date\"), \n",
132 | " max(\"dt\").alias(\"max_date\")\n",
133 | ").show()"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "### Shape"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "df.count(), len(df.columns)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "### Static Columns"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "to_analyse = [\n",
166 | " \"source_ip\",\n",
167 | " \"source_port\",\n",
168 | " \"dest_ip\",\n",
169 | " \"dest_port\",\n",
170 | " \"proto\",\n",
171 | " \"service\",\n",
172 | " \"duration\",\n",
173 | " \"orig_bytes\",\n",
174 | " \"resp_bytes\",\n",
175 | " \"conn_state\",\n",
176 | " \"local_orig\",\n",
177 | " \"local_resp\",\n",
178 | " \"missed_bytes\",\n",
179 | " \"history\",\n",
180 | " \"orig_pkts\",\n",
181 | " \"orig_ip_bytes\",\n",
182 | " \"resp_pkts\",\n",
183 | " \"resp_ip_bytes\",\n",
184 | " \"tunnel_parents\",\n",
185 | " \"label\",\n",
186 | " \"detailed-label\",\n",
187 | "]\n",
188 | "\n",
189 | "unique_counts = df.agg(*(countDistinct(col(c)).alias(c) for c in to_analyse))\n",
190 | "unique_counts.show()"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "unique_counts = unique_counts.first()\n",
200 | "static_cols = [c for c in unique_counts.asDict() if unique_counts[c] == 1]\n",
201 | "print(\"Dataset has\", len(static_cols), \"static columns: \", static_cols)\n",
202 | "df = df.drop(*static_cols)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "### Count Distinct Values"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "source_ips = df.select(col(\"source_ip\")).distinct()\n",
219 | "dest_ips = df.select(col(\"dest_ip\")).distinct()\n",
220 | "common_ips = source_ips.join(broadcast(dest_ips), source_ips.source_ip == dest_ips.dest_ip, how='inner')\n",
221 | "\n",
222 | "\n",
223 | "print(\"Source IPs count:\", source_ips.count())\n",
224 | "print(\"Dest IPs count:\", dest_ips.count())\n",
225 | "print(\"IPs as both:\", common_ips.count())"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "source_ports = df.select(col(\"source_port\")).distinct()\n",
235 | "dest_ports = df.select(col(\"dest_port\")).distinct()\n",
236 | "common_ports = source_ports.join(broadcast(dest_ports), source_ports.source_port == dest_ports.dest_port, how='inner')\n",
237 | "\n",
238 | "\n",
239 | "print(\"Source Ports count:\", source_ports.count())\n",
240 | "print(\"Dest Ports count:\", dest_ports.count())\n",
241 | "print(\"Ports as both:\", common_ports.count())"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "### Count Nulls"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "df = df.replace(\"-\", None)"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "remaining_cols = [f for f in to_analyse if f not in static_cols]\n",
267 | "df.select(\n",
268 | " [count(F.when(F.isnan(c) | col(c).isNull(), c)).alias(c) for c in remaining_cols]\n",
269 | ").show()"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "## Time-Series Plots"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "df = df.withColumns(\n",
286 | " {\n",
287 | " \"day\": date_trunc(\"day\", \"dt\"),\n",
288 | " \"hour\": date_trunc(\"hour\", \"dt\"),\n",
289 | " \"minute\": date_trunc(\"minute\", \"dt\"),\n",
290 | " \"second\": date_trunc(\"second\", \"dt\"),\n",
291 | " }\n",
292 | ")"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "for agg in ['day', 'hour', 'minute']:\n",
302 | " plotting_table = df.groupBy([agg, \"label\"]).agg(count(\"uid\").alias(\"counts\")).orderBy(agg).toPandas()\n",
303 | " fig = px.line(plotting_table, x=agg, y=\"counts\", color=\"label\", title=f'Event Counts per {agg}')\n",
304 | " fig.show()"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "## Univariate Data Analysis"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "def counts(df, var):\n",
321 | " var_counts = df.groupBy(var).count().orderBy(\"count\", ascending=False)\n",
322 | " var_counts = var_counts.withColumn(\n",
323 | " \"percent\", F.round(col(\"count\") / sum(col(\"count\")).over(Window.partitionBy()), 4)\n",
324 | " )\n",
325 | " var_counts.show()\n",
326 | " fig = px.bar(var_counts.toPandas(), x=var, y=\"count\")\n",
327 | " fig.show()\n",
328 | "\n",
329 | "\n",
330 | "categorical_columns = [\"proto\", \"service\", \"conn_state\", \"history\", \"label\"]\n",
331 | "\n",
332 | "for c in categorical_columns:\n",
333 | " counts(df, c)"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "## Prepare for Modelling"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "numerical_cols = [\n",
350 | " \"duration\",\n",
351 | " \"orig_bytes\",\n",
352 | " \"resp_bytes\",\n",
353 | " \"orig_pkts\",\n",
354 | " \"orig_ip_bytes\",\n",
355 | " \"resp_pkts\",\n",
356 | " \"resp_ip_bytes\",\n",
357 | "]\n",
358 | "categorical_cols = [\"proto\", \"service\", \"conn_state\"]\n",
359 | "label = \"label\"\n",
360 | "\n",
361 | "all_cols = numerical_cols + categorical_cols"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "recast_cols = {}\n",
371 | "fill_vals = {}\n",
372 | "for c in numerical_cols:\n",
373 | " recast_cols[c] = col(c).cast(\"double\")\n",
374 | " fill_vals[c] = -999999\n",
375 | "\n",
376 | "for c in categorical_cols:\n",
377 | " fill_vals[c] = 'missing'\n",
378 | " \n",
379 | "df = df.withColumns(recast_cols)\n",
380 | "df = df.fillna(fill_vals)\n"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "## Full Pipeline"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "static_cols = [\"local_orig\", \"local_resp\", \"missed_bytes\", \"tunnel_parents\"]\n",
397 | "\n",
398 | "recast_cols = {\n",
399 | " \"duration\": col(\"duration\").cast(\"double\"),\n",
400 | " \"orig_bytes\": col(\"orig_bytes\").cast(\"double\"),\n",
401 | " \"resp_bytes\": col(\"resp_bytes\").cast(\"double\"),\n",
402 | " \"orig_ip_bytes\": col(\"orig_ip_bytes\").cast(\"double\"),\n",
403 | " \"orig_pkts\": col(\"orig_pkts\").cast(\"double\"),\n",
404 | " \"resp_pkts\": col(\"resp_pkts\").cast(\"double\"),\n",
405 | " \"resp_ip_bytes\": col(\"resp_ip_bytes\").cast(\"double\"),\n",
406 | "}\n",
407 | "\n",
408 | "fill_vals = {\n",
409 | " \"duration\": -999999,\n",
410 | " \"orig_bytes\": -999999,\n",
411 | " \"resp_bytes\": -999999,\n",
412 | " \"orig_pkts\": -999999,\n",
413 | " \"orig_ip_bytes\": -999999,\n",
414 | " \"resp_pkts\": -999999,\n",
415 | " \"resp_ip_bytes\": -999999,\n",
416 | " \"history\": \"missing\",\n",
417 | " \"proto\": \"missing\",\n",
418 | " \"service\": \"missing\",\n",
419 | " \"conn_state\": \"missing\",\n",
420 | "}\n",
421 | "\n",
422 | "preprocessed_data = (\n",
423 | " spark.read.option(\"delimiter\", \"|\")\n",
424 | " .csv(filepaths, inferSchema=True, header=True)\n",
425 | " .withColumn(\"dt\", to_timestamp(from_unixtime(\"ts\")))\n",
426 | " .withColumns(\n",
427 | " {\n",
428 | " \"day\": date_trunc(\"day\", \"dt\"),\n",
429 | " \"hour\": date_trunc(\"hour\", \"dt\"),\n",
430 | " \"minute\": date_trunc(\"minute\", \"dt\"),\n",
431 | " \"second\": date_trunc(\"second\", \"dt\"),\n",
432 | " }\n",
433 | " )\n",
434 | " .withColumnsRenamed(\n",
435 | " {\n",
436 | " \"id.orig_h\": \"source_ip\",\n",
437 | " \"id.orig_p\": \"source_port\",\n",
438 | " \"id.resp_h\": \"dest_ip\",\n",
439 | " \"id.resp_p\": \"dest_port\",\n",
440 | " }\n",
441 | " )\n",
442 | " .drop(*static_cols)\n",
443 | " .replace(\"-\", None)\n",
444 | " .withColumns(recast_cols)\n",
445 | " .fillna(fill_vals)\n",
446 | ")\n",
447 | "\n",
448 | "preprocessed_data.show()"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "## Write Out"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {},
462 | "outputs": [],
463 | "source": [
464 | "preprocessed_data.write.parquet(\"processed.pq\")"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "read_in = spark.read.parquet(\"processed.pq\")\n",
474 | "read_in.show()"
475 | ]
476 | }
477 | ],
478 | "metadata": {
479 | "kernelspec": {
480 | "display_name": "dev",
481 | "language": "python",
482 | "name": "python3"
483 | },
484 | "language_info": {
485 | "codemirror_mode": {
486 | "name": "ipython",
487 | "version": 3
488 | },
489 | "file_extension": ".py",
490 | "mimetype": "text/x-python",
491 | "name": "python",
492 | "nbconvert_exporter": "python",
493 | "pygments_lexer": "ipython3",
494 | "version": "3.10.13"
495 | }
496 | },
497 | "nbformat": 4,
498 | "nbformat_minor": 2
499 | }
500 |
--------------------------------------------------------------------------------
/pyspark/tuning.py:
--------------------------------------------------------------------------------
1 | from hyperopt import STATUS_OK, Trials, fmin, tpe
2 | from hyperopt.pyll.base import Apply
3 | from pyspark.ml import Pipeline
4 | from pyspark.ml.classification import RandomForestClassifier
5 | from pyspark.ml.evaluation import Evaluator
6 | from pyspark.ml.feature import StringIndexer, VectorAssembler
7 | from pyspark.sql import DataFrame
8 |
9 |
10 | def tune_rf(
11 | train: DataFrame,
12 | val: DataFrame,
13 | string_indexer: StringIndexer,
14 | vector_assembler: VectorAssembler,
15 | evaluator: Evaluator,
16 | param_grid: dict[str, Apply],
17 | tuning_rounds: int = 10,
18 | ):
19 | def objective(params):
20 | rf = RandomForestClassifier(
21 | featuresCol="features",
22 | labelCol="is_bad",
23 | numTrees=params["numTrees"],
24 | maxDepth=params["maxDepth"],
25 | )
26 |
27 | pipeline = Pipeline(stages=[string_indexer, vector_assembler, rf])
28 |
29 | model = pipeline.fit(train)
30 | val_df = model.transform(val)
31 |
32 | score = evaluator.evaluate(val_df)
33 | return {"loss": -score, "status": STATUS_OK}
34 |
35 | rf_trials = Trials()
36 |
37 | argmin = fmin(
38 | fn=objective,
39 | space=param_grid,
40 | algo=tpe.suggest,
41 | max_evals=tuning_rounds,
42 | trials=rf_trials,
43 | )
44 |
45 | return argmin
46 |
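47 | # --- Hypothetical usage sketch (not part of the original module) ---
48 | # Assumes an existing train/val split with an `is_bad` label column and already
49 | # configured StringIndexer / VectorAssembler stages; `scope.int` keeps the sampled
50 | # numTrees/maxDepth values integral before they reach Spark ML.
51 | #
52 | # from hyperopt import hp
53 | # from hyperopt.pyll import scope
54 | # from pyspark.ml.evaluation import BinaryClassificationEvaluator
55 | #
56 | # param_grid = {
57 | #     "numTrees": scope.int(hp.quniform("numTrees", 20, 200, 10)),
58 | #     "maxDepth": scope.int(hp.quniform("maxDepth", 3, 12, 1)),
59 | # }
60 | # evaluator = BinaryClassificationEvaluator(labelCol="is_bad", metricName="areaUnderPR")
61 | # best_params = tune_rf(train, val, string_indexer, vector_assembler, evaluator, param_grid)
62 | 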
--------------------------------------------------------------------------------
/tfdf/notebooks/data_preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "2023-03-17 17:18:41.007743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
13 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "import plotly.express as px"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stderr",
30 | "output_type": "stream",
31 | "text": [
32 | "Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "data = pd.read_csv(\"SBAnational.csv\")"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "def get_frequent(x, thr=0.005):\n",
47 | " count_norm = x.value_counts(normalize=True)\n",
48 | " frequent = count_norm[count_norm >= thr]\n",
49 | " return frequent.index\n",
50 | "\n",
51 | "\n",
52 | "def plot_numeric_boxplots(data, target, feature):\n",
53 | " fig = px.box(\n",
54 | " data,\n",
55 | " x=target,\n",
56 | " y=feature,\n",
57 | " )\n",
58 | " fig.show()\n",
59 | "\n",
60 | "\n",
61 | "def plot_category_props(data, x, target):\n",
62 | " prop = data.groupby(x)[target].mean()\n",
63 | " fig = px.bar(x=prop.index, y=prop.values, labels={\"x\": x, \"y\": target})\n",
64 | " fig.show()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 4,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "data['is_default'] = ~data['ChgOffDate'].isna()"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## Feature Cleaning"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 5,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "frequent_city = get_frequent(data['City'])\n",
90 | "data['City'] = data['City'].apply(lambda x: x if x in frequent_city else 'Other')\n",
91 | "\n",
92 | "frequent_banks = get_frequent(data['Bank'])\n",
93 | "data['Bank'] = data['Bank'].apply(lambda x: x if x in frequent_banks else 'Other')\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 6,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "frequent_fr_code = get_frequent(data[\"FranchiseCode\"].astype(str))\n",
103 | "data[\"FranchiseCode\"] = data[\"FranchiseCode\"].apply(\n",
104 | " lambda x: str(x) if str(x) in frequent_fr_code else \"Other\"\n",
105 | ")\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 7,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "data['RevLineCr'] = data['RevLineCr'].apply(lambda x: x if x in (\"Y\", 'N') else 'Other')\n",
115 | "data['LowDoc'] = data['LowDoc'].apply(lambda x: x if x in (\"Y\", 'N') else 'Other')"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 8,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "data['GrAppv'] = data['GrAppv'].apply(lambda x: float(x.replace('$', '').replace('.', '').replace(',', '')))\n",
125 | "data['SBA_Appv'] = data['SBA_Appv'].apply(lambda x: float(x.replace('$', '').replace('.', '').replace(',', '')))"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 9,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "1.0 644869\n",
137 | "2.0 253125\n",
138 | "0.0 1034\n",
139 | "Name: NewExist, dtype: int64"
140 | ]
141 | },
142 | "execution_count": 9,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "data['NewExist'].value_counts()"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 10,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "data['is_new'] = data['NewExist'].apply(lambda x: x == 2)"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "## Feature Engineering"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 11,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "data['same_state'] = data['State'] == data['BankState']"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 12,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "import pgeocode\n",
183 | "\n",
184 | "zip_codes = data['Zip'].astype(str).unique()\n",
185 | "nomi = pgeocode.Nominatim('us')\n",
186 | "zip_aug = nomi.query_postal_code(zip_codes)\n",
187 | "\n",
188 | "zip_long_map = dict(zip(zip_aug['postal_code'].values, zip_aug['longitude'].values))\n",
189 | "zip_lat_map = dict(zip(zip_aug['postal_code'].values, zip_aug['latitude'].values))\n",
190 | "\n",
191 | "data['longitude'] = data['Zip'].astype(str).map(zip_long_map)\n",
192 | "data['latitude'] = data['Zip'].astype(str).map(zip_lat_map)"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "## Featur Selection"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 13,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "NUMERIC_FEATURES = [\n",
209 | " \"Term\",\n",
210 | " \"NoEmp\",\n",
211 | " \"CreateJob\",\n",
212 | " \"RetainedJob\",\n",
213 | " \"longitude\",\n",
214 | " \"latitude\",\n",
215 | " \"GrAppv\",\n",
216 | " \"SBA_Appv\",\n",
217 | "]\n",
218 | "\n",
219 | "CATEGORICAL_FEATURES = [\n",
220 | " \"is_new\",\n",
221 | " \"FranchiseCode\",\n",
222 | " \"UrbanRural\",\n",
223 | " \"City\",\n",
224 | " \"State\",\n",
225 | " \"Bank\",\n",
226 | " \"BankState\",\n",
227 | " \"RevLineCr\",\n",
228 | " \"naics_first_two\",\n",
229 | " \"same_state\",\n",
230 | "]\n",
231 | "\n",
232 | "TARGET = \"is_default\"\n"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 14,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "clean_data = data[['ApprovalFY'] + NUMERIC_FEATURES + CATEGORICAL_FEATURES + [TARGET]]"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 15,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "clean_data = clean_data[clean_data['ApprovalFY'] != '1976A']\n",
251 | "clean_data['ApprovalFY'] = clean_data['ApprovalFY'].astype(int)\n",
252 | "clean_data.to_parquet(\"loan_data_clean.parquet\")"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "## Data Split"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 16,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "test_thr = np.quantile(clean_data['ApprovalFY'], 0.90)\n",
269 | "train_data = clean_data[clean_data['ApprovalFY'] <= test_thr]\n",
270 | "test_data = clean_data[clean_data['ApprovalFY'] > test_thr]"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 17,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "val_thr = np.quantile(train_data['ApprovalFY'], 0.90)\n",
280 | "val_data = train_data[train_data['ApprovalFY'] > val_thr]\n",
281 | "train_data = train_data[train_data['ApprovalFY'] <= val_thr]"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 104,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/plain": [
292 | "((802301, 20), (39540, 20), (57305, 20))"
293 | ]
294 | },
295 | "execution_count": 104,
296 | "metadata": {},
297 | "output_type": "execute_result"
298 | }
299 | ],
300 | "source": [
301 | "train_data.shape, val_data.shape, test_data.shape"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 19,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "train_data.to_parquet('train_data.parquet', index=False)\n",
311 | "val_data.to_parquet('val_data.parquet', index=False)\n",
312 | "test_data.to_parquet('test_data.parquet', index=False)"
313 | ]
314 | },
315 | {
316 | "attachments": {},
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "[data split](#)"
321 | ]
322 | }
323 | ],
324 | "metadata": {
325 | "interpreter": {
326 | "hash": "a2df742b932880654a3f6652148a9c802dc0dfad475f6beda4797814052023f2"
327 | },
328 | "kernelspec": {
329 | "display_name": "Python 3.9.13",
330 | "language": "python",
331 | "name": "python3"
332 | },
333 | "language_info": {
334 | "codemirror_mode": {
335 | "name": "ipython",
336 | "version": 3
337 | },
338 | "file_extension": ".py",
339 | "mimetype": "text/x-python",
340 | "name": "python",
341 | "nbconvert_exporter": "python",
342 | "pygments_lexer": "ipython3",
343 | "version": "3.10.9"
344 | },
345 | "orig_nbformat": 4
346 | },
347 | "nbformat": 4,
348 | "nbformat_minor": 2
349 | }
350 |
--------------------------------------------------------------------------------
/tfdf/notebooks/plot.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |