├── .github └── workflows │ ├── features-and-predictions.yml │ ├── fraud-batch-inference-pipeline.yml │ ├── fraud-feature-pipelines.yml │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── README.md ├── actual_iris.png ├── confusion_matrix.png ├── credit_cards.parquet ├── df_recent.png ├── images │ ├── card_horizontal.jpg │ └── serverless-ml-architecture.svg ├── latest_iris.png ├── profiles.parquet └── transactions.parquet ├── requirements.txt └── src ├── 00-intro ├── Feature-Store-Intro.ipynb ├── Pandas-Intro.ipynb ├── green-apples-vs-oranges.ipynb ├── red-and-green-apples-vs-oranges.ipynb └── streamlit-example.py ├── 01-module ├── assets │ ├── Setosa.png │ ├── Versicolor.png │ ├── Virginica.png │ ├── confusion_matrix.png │ └── iris.png ├── iris-batch-inference-pipeline.ipynb ├── iris-feature-pipeline.ipynb ├── iris-train-pipeline.ipynb ├── iris_end_to_end_ml_pipeline.ipynb ├── iris_model │ ├── confusion_matrix.png │ └── iris_model.pkl ├── orchest │ ├── clean repository.ipynb │ ├── iris-batch-inference-pipeline-orchest.ipynb │ ├── iris-train-pipeline-orchest.ipynb │ └── push-work.ipynb └── scripts │ └── run-feature-and-prediction-pipelines.sh ├── 02-module ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline.ipynb ├── scripts │ └── run-fraud-feature-pipelines.sh ├── sml │ ├── cc_features.py │ └── synthetic_data.py ├── test_sml │ └── test_sml.py └── titanic │ ├── titanic_feature_pipelines.ipynb │ └── titanic_training_pipeline.ipynb ├── 03-module ├── 3_model_training.ipynb ├── 4_batch_predictions.ipynb ├── iris_with_sklearn_transformer.ipynb └── scripts │ └── run-fraud-batch-inference.sh ├── 04-module ├── app.py ├── cc-fraud-streamlit-ui.py ├── requirements-gradio.txt ├── run-fraud-streamlit.sh └── sml │ ├── cc_features.py │ └── synthetic_data.py ├── 05-module ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline_with_ge.ipynb ├── iris-feature-pipeline-with-ge.ipynb ├── pytest-workflow.yml ├── scripts │ └── run-fraud-feature-pipelines.sh ├── sml │ ├── cc_features.py │ └── synthetic_data.py └── test_sml │ └── test_sml.py └── 06-module ├── LICENSE ├── README.md ├── notebooks ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline.ipynb ├── 2_cc_usage_window_features_pipeline.ipynb ├── 3_feature_view_creation.ipynb ├── 4_model_training.ipynb ├── 5_model_deployment.ipynb ├── 6_online_predictions.ipynb ├── predict_example.py └── xgboost.pkl ├── requirements.txt ├── setup.py └── sml ├── __init__.py ├── features ├── cc_features.py └── synthetic_data.py └── pipelines ├── streamlit_app.py └── streamlit_batch_app.py /.github/workflows/features-and-predictions.yml: -------------------------------------------------------------------------------- 1 | name: iris-feature-and-prediction-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '00 00 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: '3.10.9' 19 | 20 | - name: Install Jupyter 21 | run: | 22 | python -m pip install jupyter 23 | 24 | - name: List Jupyter Kernels 25 | run: jupyter kernelspec list 26 | 27 | - name: Install scikit-learn 28 | run: python -m pip install scikit-learn==1.2.1 29 | 30 | 31 | 32 | - name: install python packages 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install -r requirements.txt 36 | 37 | - name: execute python workflows from bash script 38 | 
env: 39 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 40 | run: ./src/01-module/scripts/run-feature-and-prediction-pipelines.sh 41 | 42 | - name: publish github pages 43 | uses: stefanzweifel/git-auto-commit-action@v4 44 | with: 45 | commit_message: "Automated graph published" 46 | 47 | # Optional. Local and remote branch name where commit is going to be pushed 48 | # to. Defaults to the current branch. 49 | # You might need to set `create_branch: true` if the branch does not exist. 50 | branch: main 51 | 52 | # Optional. Options used by `git-commit`. 53 | # See https://git-scm.com/docs/git-commit#_options 54 | commit_options: '--no-verify --signoff' 55 | 56 | # Optional glob pattern of files which should be added to the commit 57 | # Defaults to all (.) 58 | file_pattern: assets/latest_iris.png assets/actual_iris.png assets/confusion_matrix.png assets/df_recent.png 59 | 60 | # Optional. Local file path to the repository. 61 | # Defaults to the root of the repository. 62 | repository: . 63 | 64 | # Optional commit user and author settings 65 | commit_user_name: My GitHub Actions Bot # defaults to "github-actions[bot]" 66 | commit_user_email: my-github-actions-bot@example.org # defaults to "github-actions[bot]@users.noreply.github.com" 67 | commit_author: Author # defaults to author of the commit that triggered the run 68 | 69 | # Optional. Tag name being created in the local repository and 70 | # pushed to remote repository and defined branch. 71 | #tagging_message: 'v1.0.0' 72 | 73 | # Optional. Option used by `git-status` to determine if the repository is 74 | # dirty. See https://git-scm.com/docs/git-status#_options 75 | #status_options: '--untracked-files=no' 76 | 77 | # Optional. Options used by `git-add`. 78 | # See https://git-scm.com/docs/git-add#_options 79 | #add_options: '-u' 80 | 81 | # Optional. Options used by `git-push`. 82 | # See https://git-scm.com/docs/git-push#_options 83 | #push_options: '--force' 84 | 85 | # Optional. Disable dirty check and always try to create a commit and push 86 | skip_dirty_check: true 87 | 88 | # Optional. Skip internal call to `git fetch` 89 | skip_fetch: false 90 | 91 | # Optional. Skip internal call to `git checkout` 92 | skip_checkout: false 93 | 94 | # Optional. Prevents the shell from expanding filenames. 95 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 96 | disable_globbing: true 97 | 98 | # Optional. Create given branch name in local and remote repository. 
99 | create_branch: false 100 | 101 | -------------------------------------------------------------------------------- /.github/workflows/fraud-batch-inference-pipeline.yml: -------------------------------------------------------------------------------- 1 | name: fraud-batch-inference-pipeline 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/03-module/scripts/run-fraud-batch-inference.sh 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/fraud-feature-pipelines.yml: -------------------------------------------------------------------------------- 1 | name: fraud-feature-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/02-module/scripts/run-fraud-feature-pipelines.sh 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: iris-feature-and-prediction-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '00 00 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: '3.10.9' 19 | 20 | - name: Install Jupyter 21 | run: | 22 | python -m pip install jupyter 23 | 24 | - name: install python packages 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | 29 | - name: execute python workflows from bash script 30 | env: 31 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 32 | run: ./src/01-module/scripts/run-feature-and-prediction-pipelines.sh 33 | 34 | - name: publish github pages 35 | uses: stefanzweifel/git-auto-commit-action@v4 36 | with: 37 | commit_message: "Automated graph published" 38 | 39 | # Optional. Local and remote branch name where commit is going to be pushed 40 | # to. Defaults to the current branch. 41 | # You might need to set `create_branch: true` if the branch does not exist. 42 | branch: main 43 | 44 | # Optional. Options used by `git-commit`. 45 | # See https://git-scm.com/docs/git-commit#_options 46 | commit_options: '--no-verify --signoff' 47 | 48 | # Optional glob pattern of files which should be added to the commit 49 | # Defaults to all (.) 
50 | file_pattern: assets/latest_iris.png assets/actual_iris.png assets/confusion_matrix.png assets/df_recent.png 51 | 52 | # Optional. Local file path to the repository. 53 | # Defaults to the root of the repository. 54 | repository: . 55 | 56 | # Optional commit user and author settings 57 | commit_user_name: My GitHub Actions Bot # defaults to "github-actions[bot]" 58 | commit_user_email: my-github-actions-bot@example.org # defaults to "github-actions[bot]@users.noreply.github.com" 59 | commit_author: Author # defaults to author of the commit that triggered the run 60 | 61 | # Optional. Tag name being created in the local repository and 62 | # pushed to remote repository and defined branch. 63 | #tagging_message: 'v1.0.0' 64 | 65 | # Optional. Option used by `git-status` to determine if the repository is 66 | # dirty. See https://git-scm.com/docs/git-status#_options 67 | #status_options: '--untracked-files=no' 68 | 69 | # Optional. Options used by `git-add`. 70 | # See https://git-scm.com/docs/git-add#_options 71 | #add_options: '-u' 72 | 73 | # Optional. Options used by `git-push`. 74 | # See https://git-scm.com/docs/git-push#_options 75 | #push_options: '--force' 76 | 77 | # Optional. Disable dirty check and always try to create a commit and push 78 | skip_dirty_check: true 79 | 80 | # Optional. Skip internal call to `git fetch` 81 | skip_fetch: false 82 | 83 | # Optional. Skip internal call to `git checkout` 84 | skip_checkout: false 85 | 86 | # Optional. Prevents the shell from expanding filenames. 87 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 88 | disable_globbing: true 89 | 90 | # Optional. Create given branch name in local and remote repository. 91 | create_branch: false 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | *.nbconvert.ipynb 7 | *~ 8 | 9 | .hw_api_key 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. 
the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![readme header](/assets//images/card_horizontal.jpg) 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | # **[Beyond Notebooks - Serverless Machine Learning](https://www.serverless-ml.org)** 12 | ***Build Batch and Real-Time Prediction Services with Python*** 13 | 14 | ![serverless architecture](/assets/images/serverless-ml-architecture.svg "Serverless Architecture") 15 | 16 | # **Overview** 17 | You should not need to be an expert in Kubernetes or cloud computing to build an end-to-end service that makes intelligent decisions with the help of a ML model. Serverless Machine Learning (ML) makes it easy to build a system that uses ML models to make predictions. 18 | 19 | With Serverless ML, you do not need to install, upgrade, or operate any systems. You only need to be able to write Python programs that can be scheduled to run as pipelines. The features and models your pipelines produce are managed by a serverless feature store / model registry. We will also show you how to build a UI for your prediction service by writing Python and some HTML. 20 | 21 | Read this article for an overview on serverless machine learning. 22 | 23 | **Prerequisites:** Python - Pandas - Github 24 | 25 | # **Modules** 26 | - ## **Module 00** - Introduction and optional content. 
27 | - Why Serverless ML: [Video](https://www.youtube.com/watch?v=zM2_m898P5g) | [Slides](https://drive.google.com/file/d/15gwryDoHq88tgxu8CoCbTqr5L9YN9O5p/view?usp=sharing) 28 | - Introduction to the course: [Video](https://www.youtube.com/watch?v=FM1YkIl1wXI&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=3) | [slides](https://drive.google.com/file/d/1a5uZHhVSUyxxjrESFea9vONovKROra4L/view?usp=sharing) 29 | - Development Environment & Platforms [Video](https://www.youtube.com/watch?v=9kNjky0MQtc&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=3) | [slides](https://drive.google.com/file/d/1LTTHkwV8RirYaz1MeZtoYgTc9TRSrBwr/view?usp=sharing) 30 | 31 | - ***Introduction to Machine Learning (ML 101)*** [Video](https://www.youtube.com/watch?v=RmAGTZ7dy58&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=4) | [slides](https://drive.google.com/file/d/1HXsrSRPcBMW53lgnBnYb95m5eS9oLqRk/view?usp=sharing) 32 | 33 | - ## **Module 01** - Pandas and ML Pipelines in Python. Write your first serverless App. 34 | - Full Lecture: [Video](https://www.youtube.com/watch?v=j-XnCflCc0I) | [Slides](https://drive.google.com/file/d/1L8DHGC5xo0NlNe8xfh4xf4NZV1CEGBA6/view?usp=sharing) 35 | 36 | - [Lab](https://www.youtube.com/watch?v=zAD3miW0Og0) | [Slides](https://drive.google.com/file/d/1hve9nVrImRhNE8lE26zPcr3X1DDDk7uD/view?usp=sharing) | [Homework form](https://forms.gle/2p5odBdpAqvavH1T7) 37 | 38 | - ## **Module 02** - Data modeling and the Feature Store. The Credit-card fraud prediction service. 39 | - Full Lecture: [Video](https://youtu.be/tpxZh8lbcBk) | [Slides](https://drive.google.com/file/d/1HgAKsHnOms1XCtl_KIEuELudTLtDkhxk/view?usp=sharing) 40 | 41 | - [Lab](https://www.youtube.com/watch?v=niPayagVxFg) | [Slides](https://drive.google.com/file/d/1_1oDN5nfpWSUpKNlls45HLllQ75yAWd-/view?usp=sharing) | [Homework form](https://forms.gle/5g9XtaeBEigKEirGA) 42 | - ## **Module 03** - Training Pipelines, Inference Pipelines, and the Model Registry. 43 | - Full lecture: [Video](https://youtu.be/BD1UOJs1Bvo) | [Slides](https://drive.google.com/file/d/1XhfnH7DzwDqQKS6WxDVqWFFas0fi_jnJ/view?usp=sharing) 44 | 45 | - [Lab](https://youtu.be/QfzrKgLqEXc) | [Slides](https://drive.google.com/file/d/1jITx5HGh2uM5vAeknvCaeN6ZPOc2i8AS/view?usp=sharing) 46 | - ## **Module 04** - Serverless User Interfaces for Machine Learning Systems. 47 | - Full lecture: [Video](https://youtu.be/GgwIspMUovM) | [Slides](https://drive.google.com/file/d/10JzJCDwi6IPnJNZ0iApzbwACkAn3C9Y9/view?usp=sharing) 48 | 49 | - [Lab](https://youtu.be/sMhCXwm_Wmw) | [Slides](https://drive.google.com/file/d/1bASaZN68__Ut0RnSuTvhF8LKn240UPtE/view?usp=sharing) 50 | 51 | - ## **Module 05** - Principles and Practices of MLOps 52 | - Part 01: [Video](https://youtu.be/-vbLMtfoBeo) | [Slides](https://drive.google.com/file/d/1orKJJ2e_1pNgF8X6CFBUKw7qEDQVoVqt/view?usp=share_link) 53 | - Part 02: [Video](https://youtu.be/j4wZmywPs1E) | [Slides](https://drive.google.com/file/d/13r1OvuuV6Snq1r5PmAvwHU0dTiExQ4iE/view?usp=share_link) 54 | - Lab: [Video](https://youtu.be/BaAbiFsx25E) | [Slides](https://drive.google.com/file/d/1WOahxd4s9_NVr8JUUVJvvUFU6ea9konS/view?usp=share_link) 55 | 56 | - ## **Module 06** -Operational machine learning systems: Real-time Machine Learning. 
57 | - Full lecture: [Video](https://youtu.be/GEgiIh9a048) | [Slides](https://drive.google.com/file/d/1VXU2jxEUMIvIY_Xe7XSrNuy0yxXt8glP/view?usp=share_link) 58 | - Lab: [Video](https://youtu.be/DsyNk3A6ouA) | [Slides](https://drive.google.com/file/d/1nZJKFMvAFoAu4s5smuc-rIb9EBQVME0Z/view?usp=share_link) 59 | 60 | 61 | --- 62 | 63 | ## **Learning Outcomes:** 64 | - Learn to develop and operate AI-enabled (prediction) services on serverless infrastructure 65 | - Develop and run serverless feature pipelines 66 | - Deploy features and models to serverless infrastructure 67 | - Train models and run batch/inference pipelines 68 | - Develop a serverless UI for your prediction service 69 | - Learn MLOps fundamentals: versioning, testing, data validation, and operations 70 | - Develop and run a real-time serverless machine learning system 71 | 72 | ## **Course Contents:** 73 | - Pandas and ML Pipelines in Python. Write your first serverless App. 74 | - The Feature Store for Machine Learning. Feature engineering for a credit-card fraud serverless App. 75 | - Training Pipelines and Inference Pipelines 76 | - Bring a Prediction Service to Life with a User Interface (Gradio, Github Pages, Streamlit) 77 | - Automated Testing and Versioning of features and models 78 | - Real-time serverless machine learning systems. Project presentation. 79 | 80 | ## **Who is the target audience?** 81 | You have taken a course in machine learning (ML) and you can program in Python. You want to take the next step beyond training models on static datasets in notebooks. You want to be able to build a prediction service around your model. Maybe you work at an enterprise and want to demonstrate your models’ value to stakeholders in the stakeholders’ own language. Maybe you want to include ML in an existing application or system. 82 | 83 | ## **Why is this course different?** 84 | You don’t need any operations experience beyond using GitHub and writing Python code. You will learn the essentials of MLOps: versioning artifacts, testing artifacts, validating artifacts, and monitoring and upgrading running systems. You will work with raw and live data - you will need to engineer features in pipelines. You will learn how to select, extract, compute, and transform features. 85 | 86 | ## **Will this course cost me money?** 87 | No. You will become a serverless machine learning engineer without having to pay to run your serverless pipelines or to manage your features/models/user-interface. We will use GitHub Actions and Hopsworks, which both have generous time-unlimited free tiers. 88 | 89 | **Register now at [Serverless ML Course](https://www.serverless-ml.org/register)** 90 | 91 | ## **Timeline** 92 | _Self-paced_ 93 | 94 | ## **Requirements** 95 | - A **Python** environment including a notebook (Jupyter or Colaboratory) 96 | - https://github.com account 97 | - https://hopsworks.ai account 98 | 99 | # **Key Technologies** 100 | 101 | ## **Development environment** 102 | You can write, test, debug, and train your models in a Python IDE of your choice. We will focus on notebooks and Python programs. You can use Jupyter notebooks or Colaboratory. 103 | 104 | ## **Github** 105 | You will use GitHub to manage your code, GitHub Actions to run your workflows, and GitHub Pages to host the user interface of non-interactive applications. GitHub Actions offers a free tier of 500 MB of storage and 2,000 minutes per month to run your pipelines.
106 | https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions 107 | 108 | ## **Hopsworks** 109 | [Hopsworks.ai](https://app.hopsworks.ai) has a free tier of 10 GB of storage. 110 |
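How the two fit together in this course: a scheduled GitHub Actions workflow (see the workflow files at the top of this repository) injects the `HOPSWORKS_API_KEY` repository secret as an environment variable, and the notebooks it executes log in to Hopsworks and read or write feature groups. The sketch below is illustrative only, assuming the API key is set in the environment; the feature group name is hypothetical, and the real calls appear in the module notebooks.

```python
import hopsworks
import pandas as pd

# hopsworks.login() picks up the HOPSWORKS_API_KEY environment variable
# that the GitHub Actions workflows inject from the repository secrets.
project = hopsworks.login()
fs = project.get_feature_store()

# Write a toy DataFrame to a feature group, as the feature pipelines do.
# "example_features" is a hypothetical name used only for illustration.
df = pd.DataFrame({"id": [1, 2], "amount": [12.34, 66.29]})
fg = fs.get_or_create_feature_group(
    name="example_features",
    version=1,
    primary_key=["id"],
    description="Toy feature group for illustration",
)
fg.insert(df)
```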

111 | 112 | --- 113 | 114 | ## **Useful Resources** 115 | | name | Description | link | 116 | |------|-------------|------| 117 | |**Awesome MLOps**| A collection of links and resources for MLOps| https://github.com/visenger/awesome-mlops| 118 | |**Machine Learning Ops**| a collection of resources on how to facilitate Machine Learning Ops with GitHub.| https://mlops.githubapp.com/| 119 | |**MLOps Toys**| A curated list of MLOps projects.|https://mlops.toys/| 120 | |**MLOps Zoomcamp**| teaches practical aspects of productionizing ML services.|https://github.com/DataTalksClub/mlops-zoomcamp| 121 | |**PYSLACKERS**|A large open community for Python programming enthusiasts.|https://pyslackers.com/web| 122 | |**Feature Store Org**|An open community for everything feature stores.|https://www.featurestore.org| 123 | 124 | 125 | ## **Other MLOps Courses** 126 | | name | Description | link | 127 | |------|-------------|------| 128 | |**MlOps Zoomcamp**| DevOps style course with Python and Docker as prerequisites.| https://github.com/DataTalksClub/mlops-zoomcamp | 129 | |**Full Stack Deep Learning**| This course shares best practices for the full stack; topics range from problem selection to dataset management to monitoring.| https://fullstackdeeplearning.com/| 130 | |**MLOps course**| A series of lessons teaching how to apply ML to build production-grade products (by Goku Mohandas).|https://github.com/GokuMohandas/mlops-course | 131 | 132 | --- 133 | 134 | # **Definitions** 135 | 136 | - [Context windows for LLMs](http://www.hopsworks.ai/dictionary/context-window-for-llms) 137 | - [Compound AI Systems](https://www.hopsworks.ai/dictionary/compound-ai-systems) 138 | - [Feature Store](https://www.hopsworks.ai/dictionary/feature-store) 139 | - [Feature Monitoring](https://www.hopsworks.ai/dictionary/feature-monitoring) 140 | - [Feature Data](https://www.hopsworks.ai/dictionary/feature-data) 141 | - [Flash Attention](https://www.hopsworks.ai/dictionary/flash-attention) 142 | - [Function Calling with LLMs](https://www.hopsworks.ai/dictionary/function-calling-with-llms) 143 | - [Gradient Accumulation](https://www.hopsworks.ai/dictionary/gradient-accumulation) 144 | - [In Context Learning (ICL)](http://www.hopsworks.ai/dictionary/in-context-learning-icl) 145 | - [KServe](https://www.hopsworks.ai/dictionary/kserve) 146 | - [ML Logs](https://www.hopsworks.ai/dictionary/machine-learning-logs) 147 | - [ML Infrastructure](https://www.hopsworks.ai/dictionary/machine-learning-infrastructure) 148 | - [ML Observability](https://www.hopsworks.ai/dictionary/machine-learning-observability) 149 | - [ML Pipeline](https://www.hopsworks.ai/dictionary/ml-pipeline) 150 | - [ML Systems](https://www.hopsworks.ai/dictionary/ml-systems) 151 | - [Model Deployment](https://www.hopsworks.ai/dictionary/model-deployment) 152 | - [Model Monitoring](https://www.hopsworks.ai/dictionary/model-monitoring) 153 | - [Model Registry](https://www.hopsworks.ai/dictionary/model-registry) 154 | - [Model Serving](https://www.hopsworks.ai/dictionary/model-serving) 155 | - [PagedAttention](https://www.hopsworks.ai/dictionary/pagedattention) 156 | - [Prompt Store](https://www.hopsworks.ai/dictionary/prompt-store) 157 | - [Retrieval Augmented Generation (RAG) LLM](https://www.hopsworks.ai/dictionary/retrieval-augmented-generation-llm) 158 | - [RoPE Scaling](https://www.hopsworks.ai/dictionary/rope-scaling) 159 | - [Sample Packing](https://www.hopsworks.ai/dictionary/sample-packing) 160 | - [Similarity 
Search](http://www.hopsworks.ai/dictionary/similarity-search) 161 | 162 | # **Support and Partners** 163 |
[FSorg](https://www.featurestore.org) (partner logo)

[Hopsworks](https://www.hopsworks.ai) (partner logo)
175 | -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/README.md -------------------------------------------------------------------------------- /assets/actual_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/actual_iris.png -------------------------------------------------------------------------------- /assets/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/confusion_matrix.png -------------------------------------------------------------------------------- /assets/credit_cards.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/credit_cards.parquet -------------------------------------------------------------------------------- /assets/df_recent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/df_recent.png -------------------------------------------------------------------------------- /assets/images/card_horizontal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/images/card_horizontal.jpg -------------------------------------------------------------------------------- /assets/latest_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/latest_iris.png -------------------------------------------------------------------------------- /assets/profiles.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/profiles.parquet -------------------------------------------------------------------------------- /assets/transactions.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/transactions.parquet -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | faker 2 | parsedatetime 3 | hopsworks 4 | nbconvert 5 | scikit-learn 6 | plotly 7 | Pillow 8 | seaborn 9 | dataframe-image 10 | streamlit_folium 11 | plotly 12 | -------------------------------------------------------------------------------- /src/00-intro/Pandas-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"f66fadac", 6 | "metadata": {}, 7 | "source": [ 8 | "## Pandas in 2 mins\n", 9 | "You can't learn Pandas in 2 mins, but here are some of the basics needed for this course.\n", 10 | "\n", 11 | "First, you can define a dict containing credit card payments, labeled as fraud or not-fraud, and create a Pandas DataFrame from it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "27b01f37", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "\n", 23 | "data = { \n", 24 | " 'credit_card_number': ['1111 2222 3333 4444', '1111 2222 3333 4444','1111 2222 3333 4444',\n", 25 | " '1111 2222 3333 4444'],\n", 26 | " 'trans_datetime': ['2022-01-01 08:44', '2022-01-01 19:44', '2022-01-01 20:44', '2022-01-01 20:55'],\n", 27 | " 'amount': [142.34, 12.34, 66.29, 112.33],\n", 28 | " 'location': ['Sao Paolo', 'Rio De Janeiro', 'Stockholm', 'Stockholm'],\n", 29 | " 'fraud': [False, False, True, True] \n", 30 | "}\n", 31 | "\n", 32 | "df = pd.DataFrame.from_dict(data)\n", 33 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 34 | "df" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "d0146eac", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "df" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "dd7889c9", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.info()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "ecc3bb3b", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 65 | "df.info()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "280b5ebb", 71 | "metadata": {}, 72 | "source": [ 73 | "### Lambda functions\n", 74 | "\n", 75 | "We will now apply a lambda function to the column `amount` and save the result in a new column `is_big` in our DataFrame `df`." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "73ba75de", 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "df['is_big'] = df['amount'].apply(lambda amount: amount > 100)\n", 88 | "df" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "f845b92e", 94 | "metadata": {}, 95 | "source": [ 96 | "### Apply and UDFs\n", 97 | "\n", 98 | "We will now apply a user-defined function (UDF), `is_small`, to each row in the data DataFrame `df`. \n", 99 | "The result is a series that we store in a new column in `df` called 'is_small'." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "36cf67ef", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def is_small(row):\n", 110 | " return row['amount'] < 100\n", 111 | " \n", 112 | "df['is_small'] = df.apply(is_small, axis=1)\n", 113 | "df" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "c678d9ba", 119 | "metadata": {}, 120 | "source": [ 121 | "## Rolling Windows\n", 122 | "\n", 123 | "We will compute a rolling window over the day." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "8bc7a844", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df_rolling = df.set_index('trans_datetime')\n", 134 | "df_rolling" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "4b3b6d2d", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "df_rolling['rolling_max_1d'] = df_rolling.rolling('1D').amount.max()\n", 145 | "df_rolling" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "12d55895", 151 | "metadata": {}, 152 | "source": [ 153 | "Let's create a new DataFrame, `d2`, with new data." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "f38554ad", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "import numpy as np\n", 164 | "import timeit \n", 165 | "\n", 166 | "df2 = pd.DataFrame({\n", 167 | " 'a':np.random.randint(1,100, size=10000),\n", 168 | " 'b':np.random.randint(100,1000, size=10000),\n", 169 | " 'c':np.random.random(10000)\n", 170 | "})\n", 171 | "df2.shape\n", 172 | "(100000, 3)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "36e93895", 178 | "metadata": {}, 179 | "source": [ 180 | "### Vectorized operations are faster than \"apply\" with UDFs\n", 181 | "\n", 182 | "We will see that apply is approximately 50 times slower than the equivalent vectorized operation on 100k rows.\n", 183 | "\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "b35aa5a2", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%timeit\n", 194 | "df2['a'].apply(lambda x: x**2)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "622dc43c", 200 | "metadata": {}, 201 | "source": [ 202 | "This vectorized operation is much faster" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "de746618", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "%%timeit\n", 213 | "df2['a'] ** 2" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "4aededa8", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "df2.describe()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "c40d50fe", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "df.trans_datetime.unique()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "361d75ee", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df.credit_card_number.nunique()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "8f7de134", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df.isnull().sum()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "b66d799c", 259 | "metadata": {}, 260 | "source": [ 261 | "## Transformations\n", 262 | "\n", 263 | "Plot a histogram with a long tail.\n", 264 | "Use numpy to seed the random number generator and generate a univariate data sample.\n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "32ebde28", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "import seaborn as sns\n", 275 | "\n", 276 | "from numpy.random import seed\n", 277 | "from numpy.random import randn\n", 278 | "from numpy.random import rand\n", 279 | "from 
numpy import append\n", 280 | "seed(1)\n", 281 | "array = 5 * randn(100) + 10\n", 282 | "tail = 10 + (rand(50) * 100)\n", 283 | "array = append(array, tail)\n", 284 | "sns.histplot(array)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "262bf19c", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "columns = ['amount']\n", 295 | "df_exp = pd.DataFrame(data = array, columns = columns)\n", 296 | " \n", 297 | "df_exp.describe()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "bb560fa4", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "df_exp" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "31a8bac9", 313 | "metadata": {}, 314 | "source": [ 315 | "## Standard Scalar in Vectorized Pandas\n", 316 | "\n", 317 | "This is an efficient way to transform our input Pandas column into a range of [0.0, 1.]" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "ae928d6c", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Min-Max Normalization in Pandas\n", 328 | "df_norm = (df_exp-df_exp.min())/(df_exp.max()-df_exp.min())\n", 329 | "df_norm.head()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "bca3a9f9", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "sns.histplot(df_norm)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "id": "ff81e054", 345 | "metadata": {}, 346 | "source": [ 347 | "## Power Transformer in Scikit-Learn\n", 348 | "\n", 349 | "Scikit-Learn has many different transformation libraries.\n", 350 | "For heavy-tailed distributions, it is often recommended to perform a [power transformation](\n", 351 | "https://towardsdatascience.com/how-to-differentiate-between-scaling-normalization-and-log-transformations-69873d365a94)\n", 352 | "\n", 353 | "We can see in the histogram, this produces a more Gaussian (normal) distribution than the MinMax Scalar." 
354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "85f5e6d6", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "from sklearn.preprocessing import PowerTransformer\n", 364 | "\n", 365 | "pt = PowerTransformer()\n", 366 | "\n", 367 | "df_power = pd.DataFrame(\n", 368 | " pt.fit_transform(df_exp[[\"amount\"]]), columns=[\"amount\"]\n", 369 | ")\n", 370 | "\n", 371 | "sns.histplot(df_power)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "id": "1ced0dce", 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | } 382 | ], 383 | "metadata": { 384 | "kernelspec": { 385 | "display_name": "Python 3 (ipykernel)", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.9.7" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 5 404 | } 405 | -------------------------------------------------------------------------------- /src/00-intro/green-apples-vs-oranges.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6b138a28", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[0 1]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn \n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn import tree \n", 21 | "\n", 22 | "# 4 examples of features with [red-color, green-color]\n", 23 | "features = [[0,120], [0, 110], [250, 150], [255, 163]]\n", 24 | "# green apples == 0; oranges == 1\n", 25 | "labels = [0, 0, 1, 1]\n", 26 | "\n", 27 | "clf = tree.DecisionTreeClassifier()\n", 28 | "clf = clf.fit(features, labels)\n", 29 | "\n", 30 | "test_fruits = [[0,128], [249, 155]]\n", 31 | "test_labels = [0, 1] \n", 32 | "pred_labels = clf.predict(test_fruits)\n", 33 | "print(pred_labels)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "b40db72f", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3 (ipykernel)", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.9.7" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 5 66 | } 67 | -------------------------------------------------------------------------------- /src/00-intro/red-and-green-apples-vs-oranges.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "4948e813", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[0 1 1 2]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn \n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "\n", 21 | "# [green_apple(0,120), green_apple(0,110), 
orange(250,150), orange(255, 163), red_apple(255,0), red_apple(240,0)]\n", 22 | "features = [[0,120], [75, 40], [60, 60], [255, 163], [255, 0], [240, 0]]\n", 23 | "\n", 24 | "# [green_apple, green_apple, orange, orange, red_apple, red_apple]\n", 25 | "labels = [0, 0, 1, 1, 2, 2]\n", 26 | "\n", 27 | "clf = LogisticRegression()\n", 28 | "clf = clf.fit(features, labels)\n", 29 | "\n", 30 | "# (66,66) is labelled as a green apple\n", 31 | "test_features = [[0,110], [66, 66], [249, 155], [245, 0]]\n", 32 | "test_labels = [0, 1, 0, 2] \n", 33 | "pred_labels = clf.predict(test_features)\n", 34 | "\n", 35 | "print(pred_labels)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "f3ad083c", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.9.7" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 5 68 | } 69 | -------------------------------------------------------------------------------- /src/00-intro/streamlit-example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | import numpy as np 4 | 5 | st.title("Streamlit for ServerlessML") 6 | st.header("Easy UI in Python with Streamlit") 7 | 8 | chart_data = pd.DataFrame(np.random.randn(30, 3), 9 | columns=["Data Engineers", "Data Scientists", "ML Engineers"]) 10 | 11 | st.bar_chart(chart_data) 12 | -------------------------------------------------------------------------------- /src/01-module/assets/Setosa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/Setosa.png -------------------------------------------------------------------------------- /src/01-module/assets/Versicolor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/Versicolor.png -------------------------------------------------------------------------------- /src/01-module/assets/Virginica.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/Virginica.png -------------------------------------------------------------------------------- /src/01-module/assets/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/confusion_matrix.png -------------------------------------------------------------------------------- /src/01-module/assets/iris.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/iris.png -------------------------------------------------------------------------------- /src/01-module/iris_model/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/iris_model/confusion_matrix.png -------------------------------------------------------------------------------- /src/01-module/iris_model/iris_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/iris_model/iris_model.pkl -------------------------------------------------------------------------------- /src/01-module/orchest/clean repository.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f0db3065", 6 | "metadata": {}, 7 | "source": [ 8 | "### This notebooks will setup our Github Credentials, and make sure that remote and local repository are synced." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "fef79a2b-3616-40f0-8b08-a4ce2381474c", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# get the environement variable for the token\n", 19 | "import os\n", 20 | "secret = os.environ['GIT_TOKEN']\n", 21 | "account = os.environ['ACCOUNT']\n", 22 | "repo_url = os.environ['REPO']" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "f847dbb1-ce8f-4210-a3b8-1f23bbc604fb", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from datetime import datetime\n", 33 | "from git import Repo\n", 34 | "import git" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "76887f5c-dffe-413c-8a3c-7af077a46747", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Setup \n", 45 | "full_local_path = \"/project-dir/\"\n", 46 | "repo = git.Repo('/project-dir/')\n", 47 | "\n", 48 | "remote = f\"https://{secret}@github.com/{account}/{repo_url}.git\"\n", 49 | "repo = Repo(full_local_path)\n", 50 | "\n", 51 | "origin = repo.remote(name=\"origin\") \n", 52 | "if origin.url != remote:\n", 53 | " origin.set_url(remote, origin.url)\n", 54 | "\n", 55 | "# uncomment if you need to pull\n", 56 | "# origin.pull()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 14, 62 | "id": "e0bc7741-4b28-4c33-a261-20b97dec0267", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "\"Your branch is up to date with 'origin/gh-pages'.\"" 69 | ] 70 | }, 71 | "execution_count": 14, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "repo.git.checkout('gh-pages', force=True)\n", 78 | "\n", 79 | "# Going back to the main branch\n", 80 | "repo.git.checkout('main', force=True)\n", 81 | "\n", 82 | "# List remotes\n", 83 | "print('Remotes:')\n", 84 | "for remote in repo.remotes:\n", 85 | " print(f'- {remote.name} {remote.url}')" 86 | ] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3.10.6 64-bit", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 
| "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.10.6" 106 | }, 107 | "vscode": { 108 | "interpreter": { 109 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 110 | } 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 5 115 | } 116 | -------------------------------------------------------------------------------- /src/01-module/orchest/iris-batch-inference-pipeline-orchest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower - Batch Prediction\n", 10 | "\n", 11 | "\n", 12 | "In this notebook we will, \n", 13 | "\n", 14 | "1. Load the batch inference data that arrived in the last 24 hours\n", 15 | "2. Predict the first Iris Flower found in the batch\n", 16 | "3. Write the ouput png of the Iris flower predicted, to be displayed in Github Pages." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "id": "xRtpj-psbpG8" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import hopsworks\n", 29 | "import joblib\n", 30 | "\n", 31 | "project = hopsworks.login()\n", 32 | "fs = project.get_feature_store()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "mr = project.get_model_registry()\n", 42 | "# model = mr.get_model(\"iris\", version=1) # selecting a specific model\n", 43 | "model = mr.get_best_model(\"iris\",'accuracy', 'max') # selecting the best model for accuracy\n", 44 | "model_dir = model.download()\n", 45 | "model = joblib.load(model_dir + \"/iris_model.pkl\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "We are downloading the 'raw' iris data. We explicitly do not want transformed data, reading for training. \n", 53 | "\n", 54 | "So, let's download the iris dataset, and preview some rows. \n", 55 | "\n", 56 | "Note, that it is 'tabular data'. There are 5 columns: 4 of them are \"features\", and the \"variety\" column is the **target** (what we are trying to predict using the 4 feature values in the target's row)." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "colab": { 64 | "base_uri": "https://localhost:8080/", 65 | "height": 206 66 | }, 67 | "id": "nRmFM7vcbpHA", 68 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "feature_view = fs.get_feature_view(name=\"iris\", version=1)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Now we will do some **Batch Inference**. \n", 80 | "\n", 81 | "We will read all the input features that have arrived in the last 24 hours, and score them." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "id": "uHuAD3ttP8Ep" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import datetime\n", 93 | "from PIL import Image\n", 94 | "\n", 95 | "batch_data = feature_view.get_batch_data()\n", 96 | "\n", 97 | "y_pred = model.predict(batch_data)\n", 98 | "\n", 99 | "y_pred" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "batch_data" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Batch prediction output is the last entry in the batch - it is output as a file 'latest_iris.png'" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "flower = y_pred[y_pred.size-1]\n", 125 | "flower_img = \"../assets/\" + flower + \".png\"\n", 126 | "img = Image.open(flower_img) \n", 127 | "\n", 128 | "img.save(\"../../../assets/latest_iris.png\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "iris_fg = fs.get_feature_group(name=\"iris\", version=1)\n", 138 | "df = iris_fg.read()\n", 139 | "df" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "label = df.iloc[-1][\"variety\"]\n", 149 | "label" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "label_flower = \"../assets/\" + label + \".png\"\n", 159 | "\n", 160 | "img = Image.open(label_flower) \n", 161 | "\n", 162 | "img.save(\"../../../assets/actual_iris.png\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "import pandas as pd\n", 172 | "\n", 173 | "monitor_fg = fs.get_or_create_feature_group(name=\"iris_predictions\",\n", 174 | " version=1,\n", 175 | " primary_key=[\"datetime\"],\n", 176 | " description=\"Iris flower Prediction/Outcome Monitoring\"\n", 177 | " )" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "from datetime import datetime\n", 187 | "now = datetime.now().strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 188 | "\n", 189 | "data = {\n", 190 | " 'prediction': [flower],\n", 191 | " 'label': [label],\n", 192 | " 'datetime': [now],\n", 193 | "}\n", 194 | "monitor_df = pd.DataFrame(data)\n", 195 | "monitor_fg.insert(monitor_df)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "history_df = monitor_fg.read()\n", 205 | "history_df" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "import dataframe_image as dfi\n", 215 | "\n", 216 | "df_recent = history_df.tail(5)\n", 217 | " \n", 218 | "# If you exclude this image, you may have the same iris_latest.png and iris_actual.png files\n", 219 | "# If no files have changed, the GH-action 'git commit/push' stage fails, failing your GH action (last step)\n", 220 | "# This image, however, is always new, ensuring git commit/push will succeed.\n", 221 | "dfi.export(df_recent, 
'../../../assets/df_recent.png', table_conversion = 'matplotlib')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "from sklearn.metrics import confusion_matrix\n", 231 | "\n", 232 | "predictions = history_df[['prediction']]\n", 233 | "labels = history_df[['label']]\n", 234 | "\n", 235 | "results = confusion_matrix(labels, predictions)\n", 236 | "print(results)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "from matplotlib import pyplot\n", 246 | "import seaborn as sns\n", 247 | "\n", 248 | "# Only create the confusion matrix when our iris_predictions feature group has examples of all 3 iris flowers\n", 249 | "if results.shape == (3,3):\n", 250 | "\n", 251 | " df_cm = pd.DataFrame(results, ['True Setosa', 'True Versicolor', 'True Virginica'],\n", 252 | " ['Pred Setosa', 'Pred Versicolor', 'Pred Virginica'])\n", 253 | "\n", 254 | " cm = sns.heatmap(df_cm, annot=True)\n", 255 | "\n", 256 | " fig = cm.get_figure()\n", 257 | " fig.savefig(\"../../../assets/confusion_matrix.png\") \n", 258 | " df_cm\n", 259 | "else:\n", 260 | " print(\"Run the batch inference pipeline more times until you get 3 different iris flowers\") " 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "colab": { 266 | "collapsed_sections": [], 267 | "provenance": [] 268 | }, 269 | "kernelspec": { 270 | "display_name": "Python 3.10.6 64-bit", 271 | "language": "python", 272 | "name": "python3" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 3 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython3", 284 | "version": "3.10.6" 285 | }, 286 | "vscode": { 287 | "interpreter": { 288 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 289 | } 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 1 294 | } 295 | -------------------------------------------------------------------------------- /src/01-module/orchest/iris-train-pipeline-orchest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower Train and Publish Model\n", 10 | "\n", 11 | "\n", 12 | "In this notebook we will, \n", 13 | "\n", 14 | "1. Load the Iris Flower dataset into random split (train/test) DataFrames using a Feature View\n", 15 | "2. Train a KNN Model using SkLearn\n", 16 | "3. Evaluate model performance on the test set\n", 17 | "4. 
Register the model with Hopsworks Model Registry" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!pip install -U hopsworks --quiet" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "xRtpj-psbpG8" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.neighbors import KNeighborsClassifier\n", 38 | "from sklearn.metrics import accuracy_score\n", 39 | "import pandas as pd\n", 40 | "import seaborn as sns\n", 41 | "import hopsworks" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Let's first get a feature_view for the iris flower dataset, or create one if it does not already exist.\n", 49 | "If you are running this notebook for the first time, it will create the feature view, which contains all of the columns from the **iris feature group**.\n", 50 | "\n", 51 | "There are 5 columns: 4 of them are \"features\", and the **variety** column is the **label** (what we are trying to predict using the 4 feature values in the label's row). The label is often called the **target**." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "colab": { 59 | "base_uri": "https://localhost:8080/", 60 | "height": 206 61 | }, 62 | "id": "nRmFM7vcbpHA", 63 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "project = hopsworks.login()\n", 68 | "fs = project.get_feature_store()\n", 69 | "\n", 70 | "try: \n", 71 | " feature_view = fs.get_feature_view(name=\"iris\", version=1)\n", 72 | "except:\n", 73 | " iris_fg = fs.get_feature_group(name=\"iris\", version=1)\n", 74 | " query = iris_fg.select_all()\n", 75 | " feature_view = fs.create_feature_view(name=\"iris\",\n", 76 | " version=1,\n", 77 | " description=\"Read from Iris flower dataset\",\n", 78 | " labels=[\"variety\"],\n", 79 | " query=query)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We will read our features and labels split into a **train_set** and a **test_set**. You split your data into a train_set and a test_set, because you want to train your model on only the train_set, and then evaluate its performance on data that was not seen during training, the test_set. This technique helps evaluate the ability of your model to accurately predict on data it has not seen before.\n", 87 | "\n", 88 | "We can ask the feature_view to return a **train_test_split** and it returns:\n", 89 | "\n", 90 | "* **X_** is a vector of features, so **X_train** is a vector of features from the **train_set**. \n", 91 | "* **y_** is a scale of labels, so **y_train** is a scalar of labels from the **train_set**. \n", 92 | "\n", 93 | "Note: a vector is an array of values and a scalar is a single value.\n", 94 | "\n", 95 | "Note: that mathematical convention is that a vector is denoted by an uppercase letter (hence \"X\") and a scalar is denoted by a lowercase letter (hence \"y\").\n", 96 | "\n", 97 | "**X_test** is the features and **y_test** is the labels from our holdout **test_set**. The **test_set** is used to evaluate model performance after the model has been trained." 
98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "id": "JR8HeEs6bpHB" 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "X_train, X_test, y_train, y_test = feature_view.train_test_split(0.2, )" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "y_train" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Now, we can fit a model to our features and labels from our training set (**X_train** and **y_train**). \n", 125 | "\n", 126 | "Fitting a model to a dataset is more commonly called \"training a model\"." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "PNZcUPHJPIu9", 137 | "outputId": "389acb4d-74ff-46f1-dee8-a7c27ee79a09" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "model = KNeighborsClassifier(n_neighbors=2)\n", 142 | "model.fit(X_train, y_train.values.ravel())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Now, we have trained our model. We can evaluate our model on the **test_set** to estimate its performance." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "id": "uHuAD3ttP8Ep" 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "y_pred = model.predict(X_test)\n", 161 | "y_pred" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "We can report on how accurate these predictions (**y_pred**) are compared to the labels (the actual results - **y_test**). " 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "colab": { 176 | "base_uri": "https://localhost:8080/" 177 | }, 178 | "id": "b8EC4_SvbpHE", 179 | "outputId": "5d73b375-76f0-4518-8e88-4db23e8f2486" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "from sklearn.metrics import classification_report\n", 184 | "\n", 185 | "metrics = classification_report(y_test, y_pred, output_dict=True)\n", 186 | "print(metrics)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from sklearn.metrics import confusion_matrix\n", 196 | "\n", 197 | "results = confusion_matrix(y_test, y_pred)\n", 198 | "print(results)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Notice in the confusion matrix results that we have 1 or 2 incorrect predictions.\n", 206 | "We have only 30 flowers in our test set - **y_test**.\n", 207 | "Our model predicted 1 or 2 flowers were of type \"Virginica\", but the flowers were, in fact, \"Versicolor\"." 
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "from matplotlib import pyplot\n", 217 | "\n", 218 | "df_cm = pd.DataFrame(results, ['True Setosa', 'True Versicolor', 'True Virginica'],\n", 219 | " ['Pred Setosa', 'Pred Versicolor', 'Pred Virginica'])\n", 220 | "\n", 221 | "cm = sns.heatmap(df_cm, annot=True)\n", 222 | "\n", 223 | "fig = cm.get_figure()\n", 224 | "fig.savefig(\"../assets/confusion_matrix.png\") \n", 225 | "fig.show()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Register the Model with Hopsworks Model Registry\n", 233 | "\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "from hsml.schema import Schema\n", 243 | "from hsml.model_schema import ModelSchema\n", 244 | "import os\n", 245 | "import joblib\n", 246 | "import hopsworks\n", 247 | "import shutil\n", 248 | "\n", 249 | "project = hopsworks.login()\n", 250 | "mr = project.get_model_registry()\n", 251 | "\n", 252 | "# The 'iris_model' directory will be saved to the model registry\n", 253 | "model_dir=\"iris_model\"\n", 254 | "if os.path.isdir(model_dir) == False:\n", 255 | " os.mkdir(model_dir)\n", 256 | "joblib.dump(model, model_dir + \"/iris_model.pkl\")\n", 257 | "shutil.copyfile(\"../assets/confusion_matrix.png\", model_dir + \"/confusion_matrix.png\")\n", 258 | "\n", 259 | "input_example = X_train.sample()\n", 260 | "input_schema = Schema(X_train)\n", 261 | "output_schema = Schema(y_train)\n", 262 | "model_schema = ModelSchema(input_schema, output_schema)\n", 263 | "\n", 264 | "iris_model = mr.python.create_model(\n", 265 | "# version=1, #removing version to incrementally create a new version at each training\n", 266 | " name=\"iris\", \n", 267 | " metrics={\"accuracy\" : metrics['accuracy']},\n", 268 | " model_schema=model_schema,\n", 269 | " input_example=input_example, \n", 270 | " description=\"Iris Flower Predictor\")\n", 271 | "\n", 272 | "iris_model.save(model_dir)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "colab": { 285 | "collapsed_sections": [], 286 | "provenance": [] 287 | }, 288 | "kernelspec": { 289 | "display_name": "Python 3 (ipykernel)", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.9.7" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 1 308 | } 309 | -------------------------------------------------------------------------------- /src/01-module/orchest/push-work.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "dde7af30", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# get the environement variable for the token\n", 11 | "import os\n", 12 | "secret = os.environ['GIT_TOKEN']\n", 13 | "account = os.environ['ACCOUNT']\n", 14 | "repo_url = os.environ['REPO']" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | 
"id": "e0bc7741-4b28-4c33-a261-20b97dec0267", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from datetime import datetime\n", 25 | "from git import Repo\n", 26 | "import git\n", 27 | "import shutil\n", 28 | "\n", 29 | "# Setup \n", 30 | "full_local_path = \"/project-dir/\"\n", 31 | "repo = git.Repo('/project-dir/')\n", 32 | "\n", 33 | "remote = f\"https://{secret}@github.com/{account}/{repo_url}.git\"\n", 34 | "repo = Repo(full_local_path)\n", 35 | "\n", 36 | "origin = repo.remote(name=\"origin\") \n", 37 | "if origin.url != remote:\n", 38 | " origin.set_url(remote, origin.url)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "f2911067", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# move the files to the /data folder in orchest\n", 49 | "assets_folder = r\"../../../assets/\"\n", 50 | "env_folder = r\"/data/\"\n", 51 | "files_to_move = ['latest_iris.png', 'actual_iris.png', 'confusion_matrix.png','df_recent.png']\n", 52 | "\n", 53 | "for file in files_to_move:\n", 54 | " # construct full file path\n", 55 | " source = assets_folder + file\n", 56 | " destination = env_folder + file\n", 57 | " # move file\n", 58 | " shutil.move(source, destination)\n", 59 | "\n", 60 | "# move to the branch for pages\n", 61 | "repo.git.checkout('gh-pages', force=True)\n", 62 | "\n", 63 | "#move back to an asset folder in the gh-pages branch \n", 64 | "for file in files_to_move:\n", 65 | " # construct full file path\n", 66 | " source = env_folder + file\n", 67 | " destination = assets_folder + file\n", 68 | " # move file\n", 69 | " shutil.move(source, destination)\n", 70 | "\n", 71 | "# Add our file, and set our commit\n", 72 | "repo.git.add('assets/latest_iris.png', 'assets/actual_iris.png', 'assets/confusion_matrix.png', 'assets/df_recent.png')\n", 73 | "current = datetime.now()\n", 74 | "repo.index.commit(f'New prediction! 
time and date: {current}')\n", 75 | "\n", 76 | "# Push to the pages repository\n", 77 | "origin.push()\n", 78 | "\n", 79 | "# Going back to the main branch\n", 80 | "repo.git.checkout('main', force=True)" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3.10.6 64-bit", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.10.6" 101 | }, 102 | "vscode": { 103 | "interpreter": { 104 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 105 | } 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 5 110 | } 111 | -------------------------------------------------------------------------------- /src/01-module/scripts/run-feature-and-prediction-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/01-module 6 | 7 | jupyter nbconvert --to notebook --execute iris-feature-pipeline.ipynb 8 | jupyter nbconvert --to notebook --execute iris-batch-inference-pipeline.ipynb 9 | 10 | -------------------------------------------------------------------------------- /src/02-module/scripts/run-fraud-feature-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/02-module 6 | 7 | jupyter nbconvert --to notebook --execute 2_cc_feature_pipeline.ipynb 8 | 9 | -------------------------------------------------------------------------------- /src/02-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | if isinstance(long, pd.Series): 32 | long = long.map(lambda x: (x)) 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | lat = lat.map(lambda x: (x)) 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(long, pd.Series): 42 | prev_long = prev_long.map(lambda x: (x)) 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(lat, pd.Series): 47 | prev_lat = prev_lat.map(lambda x: (x)) 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute time difference between each consecutive transaction.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 83 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 84 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 95 | 96 | # Convert time_delta from seconds to days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country", "age_at_transaction"\ 100 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 101 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 102 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 103 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 104 | return trans_df 105 | 106 | 107 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 108 | 109 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 110 | 111 | # Moving average of transaction volume. 112 | df_mavg = pd.DataFrame(cc_group.mean()) 113 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 114 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 115 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 116 | df_mavg = df_mavg.sort_index() 117 | 118 | # Moving standard deviation of transaction volume. 119 | df_std = pd.DataFrame(cc_group.mean()) 120 | df_std.columns = ["trans_volume_mstd", "datetime"] 121 | df_std = df_std.reset_index(level=["cc_num"]) 122 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 123 | df_std = df_std.fillna(0) 124 | df_std = df_std.sort_index() 125 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 126 | 127 | # Moving average of transaction frequency. 128 | df_count = pd.DataFrame(cc_group.mean()) 129 | df_count.columns = ["trans_freq", "datetime"] 130 | df_count = df_count.reset_index(level=["cc_num"]) 131 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 132 | df_count = df_count.sort_index() 133 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 134 | 135 | # Moving average of location difference between consecutive transactions. 
136 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 137 | df_loc_delta_mavg = pd.DataFrame(cc_group) 138 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 139 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 140 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 142 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 143 | 144 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 145 | 146 | return window_aggs_df 147 | -------------------------------------------------------------------------------- /src/02-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object + datetime.timedelta(days=-728) 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
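The docstrings in sml/cc_features.py ask for unit tests that use DataFrames with sample data, while the existing test module below only exercises synthetic_data.generate_atm_withdrawal. A minimal sketch of such a test for cc_features.expiry_days — the function exists in the module, but the test name and sample values here are hypothetical:

import pandas as pd
from sml import cc_features


def test_expiry_days_computes_days_until_expiry():
    # Hypothetical sample data: one transaction on 2022-01-01 with a card
    # that expires 03/22 (parsed as 2022-03-01 by format="%m/%y").
    trans_df = pd.DataFrame({
        "cc_num": [1111],
        "datetime": pd.to_datetime(["2022-01-01"]),
    })
    credit_cards_df = pd.DataFrame({
        "cc_num": [1111],
        "expires": ["03/22"],
    })

    result = cc_features.expiry_days(trans_df, credit_cards_df)

    # 2022-01-01 to 2022-03-01 is 59 days.
    assert "days_until_card_expires" in result.columns
    assert result.loc[0, "days_until_card_expires"] == 59.0

Such a test would sit alongside test_sml.py and run under the same pytest invocation, assuming the sml package is importable in the same way.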
/src/02-module/test_sml/test_sml.py: -------------------------------------------------------------------------------- 1 | from sml import synthetic_data 2 | from unittest import TestCase 3 | import pytest 4 | from contextlib import nullcontext as does_not_raise 5 | 6 | @pytest.mark.parametrize( 7 | "credit_card_number, cash_amounts, length, delta, radius, country_code, excp", 8 | [("1111 2222 3333 4444",[112.10, 11.23], 1, 1, 10.0, 'US', does_not_raise()) 9 | ,("1111 2222 3333 44",[-12.00], -1, 1, 1.0, 'IE', pytest.raises(Exception))] 10 | ) 11 | def test_generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 12 | delta: int, radius: float, country_code, excp): 13 | with excp: 14 | synthetic_data.generate_atm_withdrawal(credit_card_number, cash_amounts, length, delta, radius, country_code) 15 | 16 | -------------------------------------------------------------------------------- /src/03-module/scripts/run-fraud-batch-inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/03-module 6 | 7 | jupyter nbconvert --to notebook --execute 4_batch_predictions.ipynb 8 | -------------------------------------------------------------------------------- /src/04-module/app.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import numpy as np 3 | from PIL import Image 4 | import requests 5 | 6 | import hopsworks 7 | import joblib 8 | 9 | project = hopsworks.login() 10 | fs = project.get_feature_store() 11 | 12 | 13 | mr = project.get_model_registry() 14 | model = mr.get_model("iris", version=1) 15 | model_dir = model.download() 16 | model = joblib.load(model_dir + "/iris_model.pkl") 17 | 18 | 19 | def iris(sepal_length, sepal_width, petal_length, petal_width): 20 | input_list = [] 21 | input_list.append(sepal_length) 22 | input_list.append(sepal_width) 23 | input_list.append(petal_length) 24 | input_list.append(petal_width) 25 | # 'res' is a list of predictions returned as the label. 26 | res = model.predict(np.asarray(input_list).reshape(1, -1)) 27 | # We add '[0]' to the result of the transformed 'res', because 'res' is a list, and we only want 28 | # the first element. 
29 | flower_url = "https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/main/src/01-module/assets/" + res[0] + ".png" 30 | img = Image.open(requests.get(flower_url, stream=True).raw) 31 | return img 32 | 33 | demo = gr.Interface( 34 | fn=iris, 35 | title="Iris Flower Predictive Analytics", 36 | description="Experiment with sepal/petal lengths/widths to predict which flower it is.", 37 | allow_flagging="never", 38 | inputs=[ 39 | gr.inputs.Number(default=1.0, label="sepal length (cm)"), 40 | gr.inputs.Number(default=1.0, label="sepal width (cm)"), 41 | gr.inputs.Number(default=1.0, label="petal length (cm)"), 42 | gr.inputs.Number(default=1.0, label="petal width (cm)"), 43 | ], 44 | outputs=gr.Image(type="pil")) 45 | 46 | demo.launch() 47 | -------------------------------------------------------------------------------- /src/04-module/cc-fraud-streamlit-ui.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import joblib 3 | from math import radians 4 | from sml import cc_features 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import plotly.express as px 9 | from matplotlib import pyplot 10 | import warnings 11 | 12 | import hopsworks 13 | from sml import synthetic_data 14 | 15 | import streamlit as st 16 | 17 | import folium 18 | from streamlit_folium import st_folium 19 | import json 20 | 21 | start_date = (datetime.datetime.now() - datetime.timedelta(hours=200)) 22 | end_date = (datetime.datetime.now()) 23 | 24 | synthetic_data.set_random_seed(12345) 25 | credit_cards = [cc["cc_num"] for cc in synthetic_data.generate_list_credit_card_numbers()] 26 | lat = 0 27 | long = 0 28 | 29 | warnings.filterwarnings("ignore") 30 | 31 | project = hopsworks.login() 32 | fs = project.get_feature_store() 33 | 34 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 35 | def retrieve_dataset(fv, start_date, end_date): 36 | st.write(36 * "-") 37 | print_fancy_header('\n💾 Dataset Retrieving...') 38 | batch_data = fv.get_batch_data(start_time = start_date, end_time = end_date) 39 | batch_data.drop(["tid", "cc_num", "datetime"], axis = 1, inplace=True) 40 | return batch_data 41 | 42 | 43 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 44 | def get_feature_view(): 45 | fv = fs.get_feature_view("cc_trans_fraud", 1) 46 | return fv 47 | 48 | 49 | @st.cache(allow_output_mutation=True,suppress_st_warning=True) 50 | def get_model(project = project): 51 | mr = project.get_model_registry() 52 | model = mr.get_model("cc_fraud", version = 1) 53 | model_dir = model.download() 54 | return joblib.load(model_dir + "/cc_fraud_model.pkl") 55 | 56 | def explore_data(batch_data): 57 | st.write(36 * "-") 58 | print_fancy_header('\n👁 Data Exploration...') 59 | labels = ["Suspected of Fraud", "Not Suspected of Fraud"] 60 | unique, counts = np.unique(batch_data.fraud.values, return_counts=True) 61 | values = counts.tolist() 62 | 63 | def plot_pie(values, labels): 64 | fig = px.pie(values=values, names=labels, title='Distribution of predicted fraud transactions') 65 | return fig 66 | 67 | fig1 = plot_pie(values, labels) 68 | st.plotly_chart(fig1) 69 | 70 | 71 | def print_fancy_header(text, font_size=24): 72 | res = f'{text}' 73 | st.markdown(res, unsafe_allow_html=True) 74 | 75 | def transform_preds(predictions): 76 | return ['Fraud' if pred == 1 else 'Not Fraud' for pred in predictions] 77 | 78 | progress_bar = st.sidebar.header('⚙️ Working Progress') 79 | progress_bar = st.sidebar.progress(0) 80 | st.title('🆘 Fraud 
transactions detection 🆘') 81 | 82 | st.write(36 * "-") 83 | print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...') 84 | 85 | st.write(36 * "-") 86 | print_fancy_header('\n🤖 Connecting to Model Registry on Hopsworks...') 87 | model = get_model(project) 88 | st.write(model) 89 | st.write("✅ Connected!") 90 | 91 | progress_bar.progress(40) 92 | 93 | st.write(36 * "-") 94 | print_fancy_header('\n✨ Fetch batch data and predict') 95 | fv = get_feature_view() 96 | 97 | 98 | if st.button('📊 Make a prediction'): 99 | batch_data = retrieve_dataset(fv, start_date, end_date) 100 | st.write("✅ Retrieved!") 101 | progress_bar.progress(55) 102 | predictions = model.predict(batch_data) 103 | predictions = transform_preds(predictions) 104 | batch_data_to_explore = batch_data.copy() 105 | batch_data_to_explore['fraud'] = predictions 106 | explore_data(batch_data_to_explore) 107 | 108 | st.button("Re-run") 109 | -------------------------------------------------------------------------------- /src/04-module/requirements-gradio.txt: -------------------------------------------------------------------------------- 1 | hopsworks 2 | joblib 3 | scikit-learn 4 | -------------------------------------------------------------------------------- /src/04-module/run-fraud-streamlit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ "$HOPSWORKS_API_KEY" == "" ] ; then 5 | echo "Enter your HOPSWORKS_API_KEY:" 6 | read KEY 7 | export HOPSWORKS_API_KEY="$KEY" 8 | fi 9 | 10 | if [ "$HOPSWORKS_PROJECT" == "" ] ; then 11 | echo "Enter the name of your project on Hopsworks:" 12 | read proj 13 | export HOPSWORKS_PROJECT=$proj 14 | export 15 | 16 | python -m streamlit run cc-fraud-streamlit-ui.py 17 | -------------------------------------------------------------------------------- /src/04-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | if isinstance(long, pd.Series): 32 | long = long.map(lambda x: (x)) 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | lat = lat.map(lambda x: (x)) 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(long, pd.Series): 42 | prev_long = prev_long.map(lambda x: (x)) 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(lat, pd.Series): 47 | prev_lat = prev_lat.map(lambda x: (x)) 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute time difference between each consecutive transaction.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 83 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 84 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 95 | 96 | # Convert time_delta from seconds to days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country", "age_at_transaction"\ 100 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 101 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 102 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 103 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 104 | return trans_df 105 | 106 | 107 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 108 | 109 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 110 | 111 | # Moving average of transaction volume. 112 | df_mavg = pd.DataFrame(cc_group.mean()) 113 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 114 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 115 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 116 | df_mavg = df_mavg.sort_index() 117 | 118 | # Moving standard deviation of transaction volume. 119 | df_std = pd.DataFrame(cc_group.mean()) 120 | df_std.columns = ["trans_volume_mstd", "datetime"] 121 | df_std = df_std.reset_index(level=["cc_num"]) 122 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 123 | df_std = df_std.fillna(0) 124 | df_std = df_std.sort_index() 125 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 126 | 127 | # Moving average of transaction frequency. 128 | df_count = pd.DataFrame(cc_group.mean()) 129 | df_count.columns = ["trans_freq", "datetime"] 130 | df_count = df_count.reset_index(level=["cc_num"]) 131 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 132 | df_count = df_count.sort_index() 133 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 134 | 135 | # Moving average of location difference between consecutive transactions. 
136 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 137 | df_loc_delta_mavg = pd.DataFrame(cc_group) 138 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 139 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 140 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 142 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 143 | 144 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 145 | 146 | return window_aggs_df 147 | -------------------------------------------------------------------------------- /src/04-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object + datetime.timedelta(days=-728) 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
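Taken together, the generators above are meant to be called in a particular order: `create_profiles_as_df` annotates each entry in the `credit_cards` list with an `age` field, and `create_transactions_as_df` later relies on that field when it picks fraud-susceptible cards. A minimal usage sketch, not part of the repository, showing how these functions appear intended to compose (variable names here are illustrative):

# Hypothetical usage sketch; the function names come from sml/synthetic_data.py above.
from sml import synthetic_data

# 1. Synthetic Visa card numbers plus expiry dates.
credit_cards = synthetic_data.generate_list_credit_card_numbers()
credit_cards_df = synthetic_data.create_credit_cards_as_df(credit_cards)

# 2. Card-holder profiles; as a side effect, each dict in `credit_cards` gains an 'age' key.
profiles_df = synthetic_data.create_profiles_as_df(credit_cards)

# 3. Purchases and ATM withdrawals, with a small fraction of injected fraudulent chains.
trans_df = synthetic_data.create_transactions_as_df(credit_cards)

print(credit_cards_df.shape, profiles_df.shape, trans_df.shape)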
/src/05-module/pytest-workflow.yml: -------------------------------------------------------------------------------- 1 | name: pytest-workflow 2 | 3 | on: 4 | push 5 | #workflow_dispatch: 6 | 7 | jobs: 8 | test_schedule: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: checkout repo content 12 | uses: actions/checkout@v2 13 | 14 | - name: setup python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.8.9' 18 | 19 | - name: install python packages 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -r requirements.txt 23 | pip install pytest 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: cd src/05-module && python -m pytest 29 | 30 | -------------------------------------------------------------------------------- /src/05-module/scripts/run-fraud-feature-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/05-module 6 | 7 | jupyter nbconvert --to notebook --execute 2_cc_feature_pipeline_with_ge.ipynb 8 | 9 | -------------------------------------------------------------------------------- /src/05-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | # def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | # """Used only in feature pipelines (not online inference). 10 | # Unit test with DataFrames and sample data. 11 | # """ 12 | # age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | # trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | # return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | if isinstance(long, pd.Series): 32 | long = long.map(lambda x: (x)) 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | lat = lat.map(lambda x: (x)) 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(long, pd.Series): 42 | prev_long = prev_long.map(lambda x: (x)) 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(lat, pd.Series): 47 | prev_lat = prev_lat.map(lambda x: (x)) 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute time difference between each consecutive transaction.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 83 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 84 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 95 | 96 | # Convert time_delta from seconds to days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | # , "age_at_transaction" 100 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country" \ 101 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 102 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 103 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 104 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 105 | return trans_df 106 | 107 | 108 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 109 | 110 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 111 | 112 | # Moving average of transaction volume. 113 | df_mavg = pd.DataFrame(cc_group.mean()) 114 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 115 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 116 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 117 | df_mavg = df_mavg.sort_index() 118 | 119 | # Moving standard deviation of transaction volume. 120 | df_std = pd.DataFrame(cc_group.mean()) 121 | df_std.columns = ["trans_volume_mstd", "datetime"] 122 | df_std = df_std.reset_index(level=["cc_num"]) 123 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 124 | df_std = df_std.fillna(0) 125 | df_std = df_std.sort_index() 126 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 127 | 128 | # Moving average of transaction frequency. 129 | df_count = pd.DataFrame(cc_group.mean()) 130 | df_count.columns = ["trans_freq", "datetime"] 131 | df_count = df_count.reset_index(level=["cc_num"]) 132 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 133 | df_count = df_count.sort_index() 134 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 135 | 136 | # Moving average of location difference between consecutive transactions. 
137 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 138 | df_loc_delta_mavg = pd.DataFrame(cc_group) 139 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 140 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 142 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 143 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 144 | 145 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 146 | 147 | return window_aggs_df 148 | -------------------------------------------------------------------------------- /src/05-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object + datetime.timedelta(days=-728) 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
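The docstrings in `cc_features.py` above suggest unit-testing the pure feature functions with small, hand-built DataFrames, while `test_sml.py` below only exercises `generate_atm_withdrawal`. The following is a hedged sketch of what such an additional test could look like; the function names come from `sml/cc_features.py`, the import style mirrors the existing test, and placing it in `test_sml/` is an assumption:

# Sketch only: an illustrative extra test, not present in the repository.
from math import pi, isclose

import pandas as pd

from sml import cc_features


def test_haversine_distance_scalars():
    # Identical coordinates give a central angle of zero.
    assert cc_features.haversine_distance(10.0, 50.0, 10.0, 50.0) == 0.0
    # Two points on the equator, 180 degrees apart, give a central angle of pi radians.
    assert isclose(cc_features.haversine_distance(0.0, 0.0, 180.0, 0.0), pi)


def test_expiry_days():
    trans_df = pd.DataFrame({"cc_num": [1], "datetime": [pd.Timestamp("2022-01-01")]})
    cards_df = pd.DataFrame({"cc_num": [1], "expires": ["03/22"]})
    out = cc_features.expiry_days(trans_df, cards_df)
    # "03/22" is parsed as 2022-03-01, i.e. 59 days after 2022-01-01.
    assert out["days_until_card_expires"].iloc[0] == 59.0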
/src/05-module/test_sml/test_sml.py: -------------------------------------------------------------------------------- 1 | from sml import synthetic_data 2 | from unittest import TestCase 3 | import pytest 4 | from contextlib import nullcontext as does_not_raise 5 | 6 | @pytest.mark.parametrize( 7 | "credit_card_number, cash_amounts, length, delta, radius, country_code, excp", 8 | [("1111 2222 3333 4444",[112.10, 11.23], 1, 1, 10.0, 'US', does_not_raise()) 9 | ,("1111 2222 3333 44",[-12.00], -1, 1, 1.0, 'IE', pytest.raises(Exception))] 10 | ) 11 | def test_generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 12 | delta: int, radius: float, country_code, excp): 13 | with excp: 14 | synthetic_data.generate_atm_withdrawal(credit_card_number, cash_amounts, length, delta, radius, country_code) 15 | 16 | -------------------------------------------------------------------------------- /src/06-module/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Jim Dowling 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/06-module/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Directory Structure 4 | 5 | 6 | ├── LICENSE 7 | ├── README.md <- README explains this Python module to both developers and users. 8 | │ 9 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), 10 | │ └── my_module <- A symbolic link to the 'my_module' directory 11 | │ On Linux/Mac: cd notebooks ; ln -s ../my_module . 12 | │ 13 | ├── requirements.txt <- The requirements file for creating the Python environment. Install in a venv/conda environment. 14 | │ `conda activate my_env` 15 | │ my_env> `pip install -r requirements.txt` 16 | │ 17 | ├── setup.py <- Make this project pip installable with `pip install -e` 18 | ├── my_module <- Source code for this project. 19 | │ ├── __init__.py <- Makes a Python module 20 | │ │ 21 | │ ├── pipelines <- Feature pipelines, training pipelines, batch inference pipelines. 
22 | │ │ │── feature_pipeline.py 23 | │ │ │── training_pipeline.py 24 | │ │ └── batch_inference_pipeline.py 25 | │ │ 26 | │ ├── features <- Python modules to turn raw data into features for use in both training and inference 27 | │ │ └── my_features.py 28 | │ │ 29 | │ ├── transformations<- Python modules with model-specific transformation functions 30 | │ │ └── my_transformations.py 31 | │ │ 32 | │ ├── tests <- Pytest unit tests for feature logic 33 | │ │ └── test_features.py 34 | │ │ 35 | │ ├── pipeline_tests <- Pytest to run end-to-end tests for pipelines 36 | │ │ └── test_feature_pipelines.py 37 | │ │ 38 | │ └── visualization <- Scripts to create exploratory and results oriented visualizations 39 | │ └── eda_visualize.py 40 | │ 41 | └── scripts <- Bash scripts for the project 42 | -------------------------------------------------------------------------------- /src/06-module/notebooks/predict_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import hsfs 4 | import joblib 5 | 6 | class Predict(object): 7 | 8 | def __init__(self): 9 | """ Initializes the serving state, reads a trained model""" 10 | # load the trained model 11 | self.model = joblib.load(os.environ["ARTIFACT_FILES_PATH"] + "/xgboost.pkl") 12 | print("Initialization Complete") 13 | 14 | def predict(self, inputs): 15 | """ Serves a prediction request usign a trained model""" 16 | return self.model.predict(np.asarray(inputs).reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable 17 | 18 | -------------------------------------------------------------------------------- /src/06-module/notebooks/xgboost.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/06-module/notebooks/xgboost.pkl -------------------------------------------------------------------------------- /src/06-module/requirements.txt: -------------------------------------------------------------------------------- 1 | hopsworks 2 | -------------------------------------------------------------------------------- /src/06-module/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='sml', 5 | version='0.1', 6 | packages=['sml',], 7 | license='Apache v2', 8 | long_description='Serverless Machine Learning', 9 | ) 10 | -------------------------------------------------------------------------------- /src/06-module/sml/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | # 13 | 14 | 15 | __version__ = "0.1" 16 | -------------------------------------------------------------------------------- /src/06-module/sml/features/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | profiles_df = age_df[["name", "sex", "mail", "birthdate", "City", "Country", "cc_num", "datetime", "month"]] 15 | return trans_df, profiles_df 16 | 17 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 18 | """Used only in feature pipelines (not online inference). 19 | Unit test with DataFrames and sample data. 20 | """ 21 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 22 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 23 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 24 | return trans_df 25 | 26 | 27 | # + 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | # if long > 180 or prev_long > 180: 32 | # raise Exception('longitude cannot be greater than 180') 33 | 34 | # if lat > 90 or prev_lat > 90: 35 | # raise Exception('latitude cannot be greater than 90') 36 | 37 | # if long < -180 or prev_long < -180: 38 | # raise Exception('longitude cannot be less than -180') 39 | 40 | # if lat < -90 or prev_lat < -90: 41 | # raise Exception('latitude cannot be less than -90') 42 | 43 | if isinstance(long, pd.Series): 44 | long = long.map(lambda x: radians(x)) 45 | else: 46 | long = radians(long) 47 | 48 | if isinstance(lat, pd.Series): 49 | lat = lat.map(lambda x: radians(x)) 50 | else: 51 | lat = radians(lat) 52 | 53 | if isinstance(long, pd.Series): 54 | prev_long = prev_long.map(lambda x: radians(x)) 55 | else: 56 | prev_long = radians(prev_long) 57 | 58 | if isinstance(lat, pd.Series): 59 | prev_lat = prev_lat.map(lambda x: radians(x)) 60 | else: 61 | prev_lat = radians(prev_lat) 62 | 63 | long_diff = prev_long - long 64 | lat_diff = prev_lat - lat 65 | 66 | a = np.sin(lat_diff/2.0)**2 67 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 68 | c = 2*np.arcsin(np.sqrt(a + b)) 69 | 70 | return c 71 | 72 | 73 | # - 74 | 75 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 76 | """Compute time difference between each consecutive transaction.""" 77 | return prev_datetime - current_datetime 78 | 79 | def time_delta_to_days(time_delta: datetime)-> float: 80 | """.""" 81 | return time_delta.total_seconds() / 86400 82 | 83 | def date_to_timestamp(date_obj: datetime)-> int: 84 | return int(date_obj.timestamp() * 1000) 85 | 86 | def timestamp_to_date(timestamp: int)-> datetime: 87 | return datetime.fromtimestamp(timestamp // 1000) 88 | 89 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 90 | 91 | # Convert coordinates into radians: 92 | trans_df[["longitude", "latitude"]] = 
trans_df[["longitude", "latitude"]].applymap(radians) 93 | 94 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 95 | 96 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 97 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 98 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 99 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 100 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 101 | .reset_index(level=0, drop=True)\ 102 | .fillna(0) 103 | 104 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 105 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 106 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 107 | .reset_index(level=0, drop=True) 108 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 109 | 110 | # Convert time_delta from seconds to days 111 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 112 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 113 | trans_df = trans_df[["tid","datetime", "month", "cc_num","category", "amount", "city", "country", "age_at_transaction"\ 114 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 115 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 116 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 117 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 118 | return trans_df 119 | 120 | 121 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 122 | 123 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 124 | 125 | # Moving average of transaction volume. 126 | df_mavg = pd.DataFrame(cc_group.mean()) 127 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 128 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 129 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 130 | df_mavg = df_mavg.sort_index() 131 | 132 | # Moving standard deviation of transaction volume. 133 | df_std = pd.DataFrame(cc_group.mean()) 134 | df_std.columns = ["trans_volume_mstd", "datetime"] 135 | df_std = df_std.reset_index(level=["cc_num"]) 136 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 137 | df_std = df_std.fillna(0) 138 | df_std = df_std.sort_index() 139 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 140 | 141 | # Moving average of transaction frequency. 142 | df_count = pd.DataFrame(cc_group.mean()) 143 | df_count.columns = ["trans_freq", "datetime"] 144 | df_count = df_count.reset_index(level=["cc_num"]) 145 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 146 | df_count = df_count.sort_index() 147 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 148 | 149 | # Moving average of location difference between consecutive transactions. 
150 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 151 | df_loc_delta_mavg = pd.DataFrame(cc_group) 152 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 153 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 154 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 155 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 156 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 157 | 158 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime", "month"]].sort_index(),left_index=True, right_index=True) 159 | 160 | return window_aggs_df 161 | -------------------------------------------------------------------------------- /src/06-module/sml/features/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | # pip install faker 5 | 6 | from collections import defaultdict 7 | from faker import Faker 8 | import pandas as pd 9 | import numpy as np 10 | import datetime 11 | import hashlib 12 | import random 13 | import math 14 | import os 15 | import bisect 16 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 17 | 18 | # Seed for Reproducibility 19 | faker = Faker() 20 | faker.seed_locale('en_US', 0) 21 | 22 | 23 | def set_random_seed(seed: int): 24 | random.seed(seed) 25 | np.random.seed(seed) 26 | faker.seed_instance(seed) 27 | 28 | set_random_seed(12345) 29 | 30 | 31 | TOTAL_UNIQUE_USERS = 1000 32 | TOTAL_UNIQUE_TRANSACTIONS = 54000 33 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 34 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 35 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 36 | NORMAL_ATM_RADIUS = 0.01 37 | 38 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 39 | END_DATE = datetime.datetime.now().strftime(DATE_FORMAT) 40 | START_DATE = (datetime.datetime.now() - datetime.timedelta(days=30*6)).strftime(DATE_FORMAT) 41 | 42 | 43 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 44 | 0.05: (0.01, 1.01), 45 | 0.075: (1, 11.01), 46 | 0.525: (10, 100.01), 47 | 0.25: (100, 1000.01), 48 | 0.099: (1000, 10000.01), 49 | 0.001: (10000, 30000.01) 50 | } 51 | 52 | CATEGORY_PERC_PRICE = { 53 | "Grocery": (0.5, 0.01, 100), 54 | "Restaurant/Cafeteria": (0.2, 1, 100), 55 | "Health/Beauty": (0.1, 10, 500.01), 56 | "Domestic Transport": (0.1, 10, 100.01), 57 | "Clothing": (0.05, 10, 2000.01), 58 | "Electronics": (0.02, 100, 10000.01), 59 | "Sports/Outdoors": (0.015, 10, 100.01), 60 | "Holliday/Travel": (0.014, 10, 100.01), 61 | "Jewelery": (0.001, 10, 100.01) 62 | } 63 | 64 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 65 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 66 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 67 | 68 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 69 | 0.055: (17, 24), 70 | 0.0015: (24, 34), 71 | 0.0015: (34, 44), 72 | 0.02: (44, 54), 73 | 0.022: (54, 64), 74 | 0.1: (64, 74), 75 | 0.40: (74, 84), 76 | 0.40: (84, 100), 77 | } 78 | 79 | 80 | def date_to_year_month(date_obj: datetime)-> datetime.date: 81 | return date_obj.strftime('%Y-%m') 82 | 83 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 84 | """.""" 85 | cc_ids = set() 86 | for _ in range(n): 87 | cc_id = faker.credit_card_number(card_type='visa') 88 | cc_ids.add(cc_id) 89 | return pd.Series(list(cc_ids)) 90 | 91 | # write a pytest - assert len(credit_card_numbers) == 
TOTAL_UNIQUE_USERS 92 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 93 | 94 | def generate_list_credit_card_numbers() -> list: 95 | """.""" 96 | credit_cards = [] 97 | credit_card_numbers = generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 98 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 99 | delta_time_object + datetime.timedelta(days=-728) 100 | for cc_num in credit_card_numbers: 101 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 102 | return credit_cards 103 | 104 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 105 | """.""" 106 | profiles = [] 107 | for credit_card in credit_cards: 108 | address = faker.local_latlng(country_code = 'US') 109 | age = 0 110 | profile = None 111 | while age < 18 or age > 100: 112 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 113 | dday = profile['birthdate'] 114 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 115 | age = int(delta.days / 365) 116 | profile['City'] = address[2] 117 | profile['Country'] = address[3] 118 | profile['cc_num'] = credit_card['cc_num'] 119 | credit_card['age'] = age 120 | profiles.append(profile) 121 | 122 | # Cast the columns to the correct Pandas DType 123 | profiles_df = pd.DataFrame.from_records(profiles) 124 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 125 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 126 | 127 | return profiles_df 128 | 129 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 130 | def generate_timestamps(n: int) -> list: 131 | """Return a list of timestamps of length 'n'.""" 132 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 133 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 134 | timestamps = list() 135 | for _ in range(n): 136 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 137 | timestamps.append(timestamp) 138 | timestamps = sorted(timestamps) 139 | return timestamps 140 | 141 | def get_random_transaction_amount(start: float, end: float) -> float: 142 | """.""" 143 | amt = round(np.random.uniform(start, end), 2) 144 | return amt 145 | 146 | def generate_amounts() -> list: 147 | """.""" 148 | amounts = [] 149 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 150 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 151 | start, end = span 152 | for _ in range(n): 153 | amounts.append(get_random_transaction_amount(start, end+1)) 154 | return amounts 155 | 156 | def generate_categories(amounts) -> list: 157 | """.""" 158 | categories = [] 159 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 160 | percentage, min_price, max_price = category_perc_price 161 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 162 | for _ in range(n): 163 | min_price_i = bisect.bisect_left(amounts, min_price) 164 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 165 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 166 | 167 | random.shuffle(categories) 168 | return categories 169 | 170 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 171 | """.""" 172 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 173 | hexdigest = 
hashlib.md5(hashable.encode('utf-8')).hexdigest() 174 | return hexdigest 175 | 176 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 177 | """.""" 178 | transactions = [] 179 | for timestamp, category in zip(timestamps, categories): 180 | credit_card_number = random.choice(credit_card_numbers) 181 | point_of_tr = faker.local_latlng(country_code = 'US') 182 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 183 | transactions.append({ 184 | 'tid': transaction_id, 185 | 'datetime': timestamp, 186 | 'cc_num': credit_card_number, 187 | 'category': category['category'], 188 | 'amount': category['amount'], 189 | 'latitude': point_of_tr[0], 190 | 'longitude': point_of_tr[1], 191 | 'city': point_of_tr[2], 192 | 'country': point_of_tr[3], 193 | 'fraud_label': 0 194 | } 195 | ) 196 | return transactions 197 | 198 | def generate_cash_amounts() -> list: 199 | """.""" 200 | cash_amounts = [] 201 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 202 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 203 | start, end = span 204 | for _ in range(n): 205 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 206 | return cash_amounts 207 | 208 | def generate_chains(): 209 | """.""" 210 | visited = set() 211 | chains = defaultdict(list) 212 | 213 | def size(chains: dict) -> int: 214 | counts = {key: len(values)+1 for (key, values) in chains.items()} 215 | return sum(counts.values()) 216 | 217 | 218 | def generate_attack_chain(i: int): 219 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 220 | for j in range(1, chain_length): 221 | if i+j not in visited: 222 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 223 | break 224 | chains[i].append(i+j) 225 | visited.add(i+j) 226 | 227 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 228 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 229 | if i not in visited: 230 | generate_attack_chain(i) 231 | visited.add(i) 232 | return chains 233 | 234 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 235 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 236 | """.""" 237 | atms = [] 238 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 239 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 240 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 241 | point_of_tr = faker.local_latlng(country_code = country_code) 242 | latitude = point_of_tr[0] 243 | longitude = point_of_tr[1] 244 | city = point_of_tr[2] 245 | for _ in range(length): 246 | current = timestamp - datetime.timedelta(hours=delta) 247 | if radius is not None: 248 | latitude = faker.coordinate(latitude, radius) 249 | longitude = faker.coordinate(longitude, radius) 250 | amount = random.sample(cash_amounts, 1)[0] 251 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 252 | atms.append({'tid': transaction_id, 253 | 'datetime': current.strftime(DATE_FORMAT), 254 | 'cc_num': credit_card_number, 255 | 'category': 'Cash Withdrawal', 256 | 'amount': amount, 257 | 'latitude': latitude, 258 | 'longitude': longitude, 259 | 'city': city, 260 | 'country': 'US', 261 | 'fraud_label': 0 262 | }) 263 | timestamp = current 264 | return atms 265 | 266 | def generate_susceptible_cards(credit_cards: list) -> list: 267 | """.""" 268 | susceptible_cards = [] 269 | visited_cards = [] 270 | for percentage, span in 
SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 271 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 272 | start, end = span 273 | for _ in range(n): 274 | for card in credit_cards: 275 | if card['age'] > start and card['age'] < end: 276 | if card['cc_num'] not in visited_cards: 277 | current = card 278 | visited_cards.append(card['cc_num']) 279 | break 280 | else: 281 | current = None 282 | if current is not None: 283 | susceptible_cards.append(current) 284 | return susceptible_cards 285 | 286 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 287 | """.""" 288 | normal_atm_withdrawals = [] 289 | atm_transactions = len(cash_amounts) 290 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 291 | atm_count = 0 292 | while atm_count < atm_transactions: 293 | for card in cash_withdrawal_cards: 294 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 295 | # interval in hours between normal cash withdrawals 296 | delta = random.randint(6, 168) 297 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 298 | normal_atm_withdrawals.append(atm_tr) 299 | atm_count += ATM_WITHRAWAL_SEQ 300 | return normal_atm_withdrawals 301 | 302 | 303 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 304 | """.""" 305 | timestamps = [] 306 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 307 | for _ in range(chain_length): 308 | # interval in seconds between fraudulent attacks 309 | delta = random.randint(30, 120) 310 | current = timestamp + datetime.timedelta(seconds=delta) 311 | timestamps.append(current.strftime(DATE_FORMAT)) 312 | timestamp = current 313 | return timestamps 314 | 315 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 316 | """.""" 317 | amounts = [] 318 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 319 | n = math.ceil(chain_length * percentage) 320 | start, end = span 321 | for _ in range(n): 322 | amounts.append(get_random_transaction_amount(start, end+1)) 323 | return amounts[:chain_length] 324 | 325 | 326 | def update_transactions(transactions: list, chains: list) -> list: 327 | """.""" 328 | for key, chain in chains.items(): 329 | transaction = transactions[key] 330 | timestamp = transaction['datetime'] 331 | cc_num = transaction['cc_num'] 332 | amount = transaction['amount'] 333 | transaction['fraud_label'] = 1 334 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 335 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 336 | random.shuffle(inject_amounts) 337 | for i, idx in enumerate(chain): 338 | original_transaction = transactions[idx] 339 | inject_timestamp = inject_timestamps[i] 340 | original_transaction['datetime'] = inject_timestamp 341 | original_transaction['fraud_label'] = 1 342 | original_transaction['cc_num'] = cc_num 343 | original_transaction['amount'] = inject_amounts[i] 344 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 345 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 346 | transactions[idx] = original_transaction 347 | 348 | def 
generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 349 | """.""" 350 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 351 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 352 | 353 | def update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 354 | cash_amounts: list): 355 | """.""" 356 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 357 | # interval in seconds between fraudulent attacks 358 | delta = random.randint(1, 5) 359 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 360 | pre_fraudulent_atm_tr = atm_withdrawal[0] 361 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 362 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 363 | fraudulent_atm_location = faker.location_on_land() 364 | while fraudulent_atm_location[3] == 'US': 365 | fraudulent_atm_location = faker.location_on_land() 366 | 367 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 368 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 369 | 370 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 371 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 372 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 373 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 374 | fraudulent_atm_tr['fraud_label'] = 1 375 | atm_withdrawal.append(fraudulent_atm_tr) 376 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 377 | 378 | 379 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 380 | """.""" 381 | for atm_withdrawal in normal_atm_withdrawals: 382 | for withdrawal in atm_withdrawal: 383 | transactions.append(withdrawal) 384 | return pd.DataFrame.from_records(transactions) 385 | 386 | 387 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 388 | """.""" 389 | df = pd.DataFrame.from_records(credit_cards) 390 | # Cast the columns to the correct Pandas DType 391 | df['cc_num']= pd.to_numeric(df['cc_num']) 392 | return df 393 | 394 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | profiles_df = generate_df_with_profiles(credit_cards) 397 | return profiles_df 398 | 399 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 400 | """.""" 401 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 402 | amounts = generate_amounts() 403 | categories = generate_categories(amounts) 404 | cc_df = create_credit_cards_as_df(credit_cards) 405 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 406 | cash_amounts = generate_cash_amounts() 407 | chains = generate_chains() 408 | susceptible_cards = generate_susceptible_cards(credit_cards) 409 | 410 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 411 | update_transactions(transactions, chains) 412 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 413 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 414 | 415 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 416 | transactions_df["datetime"] = transactions_df.datetime.map(lambda x: datetime.datetime.strptime(x, DATE_FORMAT)) 417 | transactions_df["month"] = transactions_df.datetime.map(lambda x: date_to_year_month(x)) 418 | 419 | # Cast the columns to the 
correct Pandas DType 420 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 421 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 422 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 423 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 424 | 425 | fraud_labels = transactions_df[["tid", "cc_num", "datetime", "month", "fraud_label"]] 426 | transactions_df = transactions_df.drop(columns=["fraud_label"]) 427 | return transactions_df, fraud_labels 428 | -------------------------------------------------------------------------------- /src/06-module/sml/pipelines/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from math import radians 3 | from sml import cc_features 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import plotly.express as px 8 | from matplotlib import pyplot 9 | import warnings 10 | 11 | import hopsworks 12 | from sml import synthetic_data 13 | 14 | import streamlit as st 15 | 16 | import folium 17 | from streamlit_folium import st_folium 18 | import json 19 | 20 | time_now = int(datetime.datetime.now().timestamp() * 1000) 21 | synthetic_data.set_random_seed(12345) 22 | credit_cards = [cc["cc_num"] for cc in synthetic_data.generate_list_credit_card_numbers()] 23 | lat = 0 24 | long = 0 25 | 26 | warnings.filterwarnings("ignore") 27 | 28 | 29 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 30 | def retrive_dataset(): 31 | st.write(36 * "-") 32 | print_fancy_header('\n💾 Dataset Retrieving...') 33 | feature_view = fs.get_feature_view("transactions_fraud_online_fv", 1) 34 | batch_data = feature_view.get_batch_data() 35 | return batch_data 36 | 37 | 38 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 39 | def get_feature_views(): 40 | fv = fs.get_feature_view("transactions_fraud_online_fv", 1) 41 | latest_record_fv = fs.get_feature_view("latest_recorded_transactions_fraud_online_fv", 1) 42 | return fv, latest_record_fv 43 | 44 | 45 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 46 | def get_deployment(project): 47 | mr = project.get_model_registry() 48 | ms = project.get_model_serving() 49 | deployment = ms.get_deployment("fraudonlinemodeldeployment") 50 | return deployment 51 | 52 | 53 | def explore_data(): 54 | st.write(36 * "-") 55 | print_fancy_header('\n👁 Data Exploration...') 56 | labels = ["Normal", "Fraudulent"] 57 | unique, counts = np.unique(test_mar_y.fraud_label.values, return_counts=True) 58 | values = counts.tolist() 59 | 60 | def plot_pie(values, labels): 61 | fig = px.pie(values=values, names=labels, title='Distribution of fraud transactions') 62 | return fig 63 | 64 | fig1 = plot_pie(values, labels) 65 | st.plotly_chart(fig1) 66 | 67 | 68 | def process_input_vector(cc_num, current_datetime, amount, long, lat): 69 | long = radians(long) 70 | lat = radians(lat) 71 | 72 | current_coordinates = pd.DataFrame({ 73 | "datetime": [int(current_datetime)], 74 | "cc_num": [cc_num], 75 | "latitude": [lat], 76 | "longitude": [long] 77 | 78 | }) 79 | 80 | # get fv for the latest recorded transactions 81 | latest_record_vector = latest_record_fv.get_feature_vector({"cc_num": cc_num}) 82 | # compute deltas between previous and current 83 | loc_delta_t_minus_1 = cc_features.haversine_distance(long=long, lat=lat, prev_long=latest_record_vector[3], 84 | prev_lat=latest_record_vector[2]) 85 | time_delta_t_minus_1 = 
cc_features.time_delta(cc_features.timestamp_to_date(latest_record_vector[0]), 86 | cc_features.timestamp_to_date(current_datetime)) 87 | time_delta_t_minus_1 = cc_features.time_delta_to_days(time_delta_t_minus_1) 88 | # get all features 89 | feature_vector = fv.get_feature_vector({"cc_num": cc_num}, 90 | passed_features={"amount": amount, 91 | "loc_delta_t_minus_1": loc_delta_t_minus_1, 92 | "time_delta_t_minus_1": time_delta_t_minus_1}) 93 | 94 | # drop extra features 95 | indexes_to_remove = [0, 1] 96 | return {"inputs": [i for j, i in enumerate(feature_vector) if j not in indexes_to_remove]}, current_coordinates 97 | 98 | 99 | def print_fancy_header(text, font_size=24): 100 | res = f'{text}' 101 | st.markdown(res, unsafe_allow_html=True) 102 | 103 | 104 | progress_bar = st.sidebar.header('⚙️ Working Progress') 105 | progress_bar = st.sidebar.progress(0) 106 | st.title('🆘 Fraud transactions detection 🆘') 107 | 108 | st.write(36 * "-") 109 | print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...') 110 | 111 | project = hopsworks.login() 112 | fs = project.get_feature_store() 113 | progress_bar.progress(15) 114 | 115 | st.write(36 * "-") 116 | print_fancy_header('\n🤖 Connecting to Model Registry on Hopsworks...') 117 | deployment = get_deployment(project) 118 | deployment.start() 119 | st.write("✅ Connected!") 120 | 121 | progress_bar.progress(40) 122 | 123 | st.write(36 * "-") 124 | print_fancy_header('\n✨ Feature view retrieving...') 125 | fv, latest_record_fv = get_feature_views() 126 | st.write("✅ Retrieved!") 127 | 128 | progress_bar.progress(55) 129 | 130 | st.write(36 * "-") 131 | print_fancy_header('\n🧠 On the map below, select the location of the ATM machine') 132 | with st.form(key="Selecting cc_num"): 133 | cc_num = st.selectbox( 134 | 'Select a credit card number.', 135 | (credit_cards) 136 | ) 137 | 138 | amount = st.slider( 139 | '💶 Select withdrawal amount', 140 | 5, 1000) 141 | 142 | # my_map = folium.Map(location=[41, -73.5], zoom_start=8) 143 | my_map = folium.Map(location=[52, 24], zoom_start=3) 144 | 145 | my_map.add_child(folium.LatLngPopup()) 146 | folium.TileLayer('Stamen Terrain').add_to(my_map) 147 | folium.TileLayer('Stamen Toner').add_to(my_map) 148 | folium.TileLayer('Stamen Water Color').add_to(my_map) 149 | folium.TileLayer('cartodbpositron').add_to(my_map) 150 | folium.TileLayer('cartodbdark_matter').add_to(my_map) 151 | folium.LayerControl().add_to(my_map) 152 | 153 | res_map = st_folium(my_map, height=300, width=600) 154 | 155 | try: 156 | lat, long = res_map["last_clicked"]["lat"], res_map["last_clicked"]["lng"] 157 | 158 | print_fancy_header("🏧 Withdrawal coordinates:") 159 | st.write(f"Latitude: {lat}") 160 | st.write(f"Longitude: {long}") 161 | except Exception as err: 162 | print(err) 163 | pass 164 | 165 | submit_button = st.form_submit_button(label='Withdraw') 166 | 167 | progress_bar.progress(70) 168 | 169 | st.write(36 * "-") 170 | 171 | # run the code below if the deployment doesn't work 172 | # print_fancy_header("Initialise serving...") 173 | # fv.init_serving(1) 174 | # time_now = int(datetime.datetime.now().timestamp()*1000) 175 | 176 | data, current_coordinates = process_input_vector(cc_num=int(cc_num), 177 | current_datetime=int(time_now), 178 | amount=amount, 179 | lat=lat, long=long) 180 | 181 | if st.button('📊 Make a prediction'): 182 | res = deployment.predict(data) 183 | progress_bar.progress(80) 184 | negative = "**👌 Not a suspicious**" 185 | positive = "**🆘 Fraudulent**" 186 | res = negative if res["predictions"][0] == 0 else positive 187 |
print_fancy_header(res + " transaction!") 188 | progress_bar.progress(100) 189 | deployment.stop() 190 | st.write(36 * "-") 191 | st.write("Stopping the deployment...") 192 | st.write("") 193 | st.write('\n🎉 📈 🤝 App Finished Successfully 🤝 📈 🎉') 194 | 195 | # update fg 196 | latest_recorded_transactions_fraud_online_fg = fs.get_or_create_feature_group( 197 | name="latest_recorded_transactions_fraud_online", 198 | version=1 199 | ) 200 | latest_recorded_transactions_fraud_online_fg.insert(current_coordinates) 201 | 202 | st.button("Re-run") 203 | -------------------------------------------------------------------------------- /src/06-module/sml/pipelines/streamlit_batch_app.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import joblib 3 | from math import radians 4 | from sml import cc_features 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import plotly.express as px 9 | from matplotlib import pyplot 10 | import warnings 11 | 12 | import hopsworks 13 | from sml import synthetic_data 14 | 15 | import streamlit as st 16 | 17 | import folium 18 | from streamlit_folium import st_folium 19 | import json 20 | 21 | time_now = int(datetime.datetime.now().timestamp() * 1000) 22 | start_date = (datetime.datetime.now() - datetime.timedelta(hours=24)).date() 23 | end_date = (datetime.datetime.now()).date() 24 | synthetic_data.set_random_seed(12345) 25 | credit_cards = [cc["cc_num"] for cc in synthetic_data.generate_list_credit_card_numbers()] 26 | lat = 0 27 | long = 0 28 | 29 | warnings.filterwarnings("ignore") 30 | 31 | project = hopsworks.login() 32 | fs = project.get_feature_store() 33 | 34 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 35 | def retrive_dataset(start_date, end_date): 36 | st.write(36 * "-") 37 | print_fancy_header('\n💾 Dataset Retrieving...') 38 | feature_view = fs.get_feature_view("transactions_fraud_online_fv", 1) 39 | batch_data = feature_view.get_batch_data(start_time = start_date, end_time = end_date) 40 | batch_data.drop(["cc_num", "datetime"], axis = 1, inplace=True) 41 | return batch_data 42 | 43 | 44 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 45 | def get_feature_views(): 46 | fv = fs.get_feature_view("transactions_fraud_online_fv", 1) 47 | latest_record_fv = fs.get_feature_view("latest_recorded_transactions_fraud_online_fv", 1) 48 | return fv, latest_record_fv 49 | 50 | 51 | @st.cache(allow_output_mutation=True,suppress_st_warning=True) 52 | def get_model(project = project): 53 | mr = project.get_model_registry() 54 | model = mr.get_model("transactions_fraud_online_xgboost", version = 1) 55 | model_dir = model.download() 56 | return joblib.load(model_dir + "/xgboost.pkl") 57 | 58 | def explore_data(batch_data): 59 | st.write(36 * "-") 60 | print_fancy_header('\n👁 Data Exploration...') 61 | labels = ["Normal", "Fraudulent"] 62 | unique, counts = np.unique(batch_data.fraud.values, return_counts=True) 63 | values = counts.tolist() 64 | 65 | def plot_pie(values, labels): 66 | fig = px.pie(values=values, names=labels, title='Distribution of predicted fraud transactions') 67 | return fig 68 | 69 | fig1 = plot_pie(values, labels) 70 | st.plotly_chart(fig1) 71 | 72 | 73 | def print_fancy_header(text, font_size=24): 74 | res = f'{text}' 75 | st.markdown(res, unsafe_allow_html=True) 76 | 77 | def transform_preds(predictions): 78 | return ['Fraud' if pred == 1 else 'Not Fraud' for pred in predictions] 79 | 80 | progress_bar = st.sidebar.header('⚙️ Working Progress') 81 | 
progress_bar = st.sidebar.progress(0) 82 | st.title('🆘 Fraud transactions detection 🆘') 83 | 84 | st.write(36 * "-") 85 | print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...') 86 | 87 | st.write(36 * "-") 88 | print_fancy_header('\n🤖 Connecting to Model Registry on Hopsworks...') 89 | model = get_model(project) 90 | st.write(model) 91 | st.write("✅ Connected!") 92 | 93 | progress_bar.progress(40) 94 | 95 | st.write(36 * "-") 96 | print_fancy_header('\n✨ Fetch batch data and predict') 97 | fv, latest_record_fv = get_feature_views() 98 | 99 | batch_data = retrive_dataset(start_date, end_date) 100 | st.write("✅ Retrieved!") 101 | progress_bar.progress(55) 102 | 103 | if st.button('📊 Make a prediction'): 104 | predictions = model.predict(batch_data) 105 | predictions = transform_preds(predictions) 106 | batch_data_to_explore = batch_data.copy() 107 | batch_data_to_explore['fraud'] = predictions 108 | explore_data(batch_data_to_explore) 109 | 110 | st.button("Re-run") 111 | --------------------------------------------------------------------------------
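The comments in src/06-module/sml/features/synthetic_data.py ask for pytest checks on the generators ("assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS", "assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS"). Below is a minimal sketch of such tests; it assumes the generators are importable as `from sml import synthetic_data` (as the Streamlit apps do), and it is illustrative only, not the contents of the test_sml/test_sml.py files already in the repository.

# test_synthetic_data.py -- illustrative sketch, not the repo's existing test_sml/test_sml.py
from sml import synthetic_data


def test_generate_unique_credit_card_numbers():
    synthetic_data.set_random_seed(12345)
    cc_nums = synthetic_data.generate_unique_credit_card_numbers(synthetic_data.TOTAL_UNIQUE_USERS)
    # The generator builds a set, so Faker collisions could in principle return fewer numbers
    # than requested; the course comment expects exact equality.
    assert len(cc_nums) == synthetic_data.TOTAL_UNIQUE_USERS
    # Validate that the generated Visa numbers are 16 digits.
    assert all(len(str(cc)) == 16 for cc in cc_nums)


def test_generate_timestamps():
    synthetic_data.set_random_seed(12345)
    timestamps = synthetic_data.generate_timestamps(synthetic_data.TOTAL_UNIQUE_TRANSACTIONS)
    assert len(timestamps) == synthetic_data.TOTAL_UNIQUE_TRANSACTIONS
    # generate_timestamps sorts the '%Y-%m-%d %H:%M:%S' strings before returning them.
    assert timestamps == sorted(timestamps)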