├── .github └── workflows │ ├── features-and-predictions.yml │ ├── fraud-batch-inference-pipeline.yml │ ├── fraud-feature-pipelines.yml │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── README.md ├── actual_iris.png ├── confusion_matrix.png ├── credit_cards.parquet ├── df_recent.png ├── images │ ├── card_horizontal.jpg │ └── serverless-ml-architecture.svg ├── latest_iris.png ├── profiles.parquet └── transactions.parquet ├── requirements.txt └── src ├── 00-intro ├── Feature-Store-Intro.ipynb ├── Pandas-Intro.ipynb ├── green-apples-vs-oranges.ipynb ├── red-and-green-apples-vs-oranges.ipynb └── streamlit-example.py ├── 01-module ├── assets │ ├── Setosa.png │ ├── Versicolor.png │ ├── Virginica.png │ ├── confusion_matrix.png │ └── iris.png ├── iris-batch-inference-pipeline.ipynb ├── iris-feature-pipeline.ipynb ├── iris-train-pipeline.ipynb ├── iris_end_to_end_ml_pipeline.ipynb ├── iris_model │ ├── confusion_matrix.png │ └── iris_model.pkl ├── orchest │ ├── clean repository.ipynb │ ├── iris-batch-inference-pipeline-orchest.ipynb │ ├── iris-train-pipeline-orchest.ipynb │ └── push-work.ipynb └── scripts │ └── run-feature-and-prediction-pipelines.sh ├── 02-module ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline.ipynb ├── scripts │ └── run-fraud-feature-pipelines.sh ├── sml │ ├── cc_features.py │ └── synthetic_data.py ├── test_sml │ └── test_sml.py └── titanic │ ├── titanic_feature_pipelines.ipynb │ └── titanic_training_pipeline.ipynb ├── 03-module ├── 3_model_training.ipynb ├── 4_batch_predictions.ipynb ├── iris_with_sklearn_transformer.ipynb └── scripts │ └── run-fraud-batch-inference.sh ├── 04-module ├── app.py ├── cc-fraud-streamlit-ui.py ├── requirements-gradio.txt ├── run-fraud-streamlit.sh └── sml │ ├── cc_features.py │ └── synthetic_data.py ├── 05-module ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline_with_ge.ipynb ├── iris-feature-pipeline-with-ge.ipynb ├── pytest-workflow.yml ├── scripts │ └── run-fraud-feature-pipelines.sh ├── sml │ ├── cc_features.py │ └── synthetic_data.py └── test_sml │ └── test_sml.py └── 06-module ├── LICENSE ├── README.md ├── notebooks ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline.ipynb ├── 2_cc_usage_window_features_pipeline.ipynb ├── 3_feature_view_creation.ipynb ├── 4_model_training.ipynb ├── 5_model_deployment.ipynb ├── 6_online_predictions.ipynb ├── predict_example.py └── xgboost.pkl ├── requirements.txt ├── setup.py └── sml ├── __init__.py ├── features ├── cc_features.py └── synthetic_data.py └── pipelines ├── streamlit_app.py └── streamlit_batch_app.py /.github/workflows/features-and-predictions.yml: -------------------------------------------------------------------------------- 1 | name: iris-feature-and-prediction-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '00 00 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: '3.10.9' 19 | 20 | - name: Install Jupyter 21 | run: | 22 | python -m pip install jupyter 23 | 24 | - name: List Jupyter Kernels 25 | run: jupyter kernelspec list 26 | 27 | - name: Install scikit-learn 28 | run: python -m pip install scikit-learn==1.2.1 29 | 30 | 31 | 32 | - name: install python packages 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install -r requirements.txt 36 | 37 | - name: execute python workflows from bash script 38 | 
env: 39 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 40 | run: ./src/01-module/scripts/run-feature-and-prediction-pipelines.sh 41 | 42 | - name: publish github pages 43 | uses: stefanzweifel/git-auto-commit-action@v4 44 | with: 45 | commit_message: "Automated graph published" 46 | 47 | # Optional. Local and remote branch name where commit is going to be pushed 48 | # to. Defaults to the current branch. 49 | # You might need to set `create_branch: true` if the branch does not exist. 50 | branch: main 51 | 52 | # Optional. Options used by `git-commit`. 53 | # See https://git-scm.com/docs/git-commit#_options 54 | commit_options: '--no-verify --signoff' 55 | 56 | # Optional glob pattern of files which should be added to the commit 57 | # Defaults to all (.) 58 | file_pattern: assets/latest_iris.png assets/actual_iris.png assets/confusion_matrix.png assets/df_recent.png 59 | 60 | # Optional. Local file path to the repository. 61 | # Defaults to the root of the repository. 62 | repository: . 63 | 64 | # Optional commit user and author settings 65 | commit_user_name: My GitHub Actions Bot # defaults to "github-actions[bot]" 66 | commit_user_email: my-github-actions-bot@example.org # defaults to "github-actions[bot]@users.noreply.github.com" 67 | commit_author: Author # defaults to author of the commit that triggered the run 68 | 69 | # Optional. Tag name being created in the local repository and 70 | # pushed to remote repository and defined branch. 71 | #tagging_message: 'v1.0.0' 72 | 73 | # Optional. Option used by `git-status` to determine if the repository is 74 | # dirty. See https://git-scm.com/docs/git-status#_options 75 | #status_options: '--untracked-files=no' 76 | 77 | # Optional. Options used by `git-add`. 78 | # See https://git-scm.com/docs/git-add#_options 79 | #add_options: '-u' 80 | 81 | # Optional. Options used by `git-push`. 82 | # See https://git-scm.com/docs/git-push#_options 83 | #push_options: '--force' 84 | 85 | # Optional. Disable dirty check and always try to create a commit and push 86 | skip_dirty_check: true 87 | 88 | # Optional. Skip internal call to `git fetch` 89 | skip_fetch: false 90 | 91 | # Optional. Skip internal call to `git checkout` 92 | skip_checkout: false 93 | 94 | # Optional. Prevents the shell from expanding filenames. 95 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 96 | disable_globbing: true 97 | 98 | # Optional. Create given branch name in local and remote repository. 
99 | create_branch: false 100 | 101 | -------------------------------------------------------------------------------- /.github/workflows/fraud-batch-inference-pipeline.yml: -------------------------------------------------------------------------------- 1 | name: fraud-batch-inference-pipeline 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/03-module/scripts/run-fraud-batch-inference.sh 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/fraud-feature-pipelines.yml: -------------------------------------------------------------------------------- 1 | name: fraud-feature-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/02-module/scripts/run-fraud-feature-pipelines.sh 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: iris-feature-and-prediction-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '00 00 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: '3.10.9' 19 | 20 | - name: Install Jupyter 21 | run: | 22 | python -m pip install jupyter 23 | 24 | - name: install python packages 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | 29 | - name: execute python workflows from bash script 30 | env: 31 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 32 | run: ./src/01-module/scripts/run-feature-and-prediction-pipelines.sh 33 | 34 | - name: publish github pages 35 | uses: stefanzweifel/git-auto-commit-action@v4 36 | with: 37 | commit_message: "Automated graph published" 38 | 39 | # Optional. Local and remote branch name where commit is going to be pushed 40 | # to. Defaults to the current branch. 41 | # You might need to set `create_branch: true` if the branch does not exist. 42 | branch: main 43 | 44 | # Optional. Options used by `git-commit`. 45 | # See https://git-scm.com/docs/git-commit#_options 46 | commit_options: '--no-verify --signoff' 47 | 48 | # Optional glob pattern of files which should be added to the commit 49 | # Defaults to all (.) 
50 | file_pattern: assets/latest_iris.png assets/actual_iris.png assets/confusion_matrix.png assets/df_recent.png 51 | 52 | # Optional. Local file path to the repository. 53 | # Defaults to the root of the repository. 54 | repository: . 55 | 56 | # Optional commit user and author settings 57 | commit_user_name: My GitHub Actions Bot # defaults to "github-actions[bot]" 58 | commit_user_email: my-github-actions-bot@example.org # defaults to "github-actions[bot]@users.noreply.github.com" 59 | commit_author: Author # defaults to author of the commit that triggered the run 60 | 61 | # Optional. Tag name being created in the local repository and 62 | # pushed to remote repository and defined branch. 63 | #tagging_message: 'v1.0.0' 64 | 65 | # Optional. Option used by `git-status` to determine if the repository is 66 | # dirty. See https://git-scm.com/docs/git-status#_options 67 | #status_options: '--untracked-files=no' 68 | 69 | # Optional. Options used by `git-add`. 70 | # See https://git-scm.com/docs/git-add#_options 71 | #add_options: '-u' 72 | 73 | # Optional. Options used by `git-push`. 74 | # See https://git-scm.com/docs/git-push#_options 75 | #push_options: '--force' 76 | 77 | # Optional. Disable dirty check and always try to create a commit and push 78 | skip_dirty_check: true 79 | 80 | # Optional. Skip internal call to `git fetch` 81 | skip_fetch: false 82 | 83 | # Optional. Skip internal call to `git checkout` 84 | skip_checkout: false 85 | 86 | # Optional. Prevents the shell from expanding filenames. 87 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 88 | disable_globbing: true 89 | 90 | # Optional. Create given branch name in local and remote repository. 91 | create_branch: false 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | *.nbconvert.ipynb 7 | *~ 8 | 9 | .hw_api_key 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. 
the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![readme header](/assets//images/card_horizontal.jpg) 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | # **[Beyond Notebooks - Serverless Machine Learning](https://www.serverless-ml.org)** 12 | ***Build Batch and Real-Time Prediction Services with Python*** 13 | 14 | ![serverless architecture](/assets/images/serverless-ml-architecture.svg "Serverless Architecture") 15 | 16 | # **Overview** 17 | You should not need to be an expert in Kubernetes or cloud computing to build an end-to-end service that makes intelligent decisions with the help of a ML model. Serverless Machine Learning (ML) makes it easy to build a system that uses ML models to make predictions. 18 | 19 | With Serverless ML, you do not need to install, upgrade, or operate any systems. You only need to be able to write Python programs that can be scheduled to run as pipelines. The features and models your pipelines produce are managed by a serverless feature store / model registry. We will also show you how to build a UI for your prediction service by writing Python and some HTML. 20 | 21 | Read this article for an overview on serverless machine learning. 22 | 23 | **Prerequisites:** Python - Pandas - Github 24 | 25 | # **Modules** 26 | - ## **Module 00** - Introduction and optional content. 
27 | - Why Serverless ML: [Video](https://www.youtube.com/watch?v=zM2_m898P5g) | [Slides](https://drive.google.com/file/d/15gwryDoHq88tgxu8CoCbTqr5L9YN9O5p/view?usp=sharing) 28 | - Introduction to the course: [Video](https://www.youtube.com/watch?v=FM1YkIl1wXI&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=3) | [slides](https://drive.google.com/file/d/1a5uZHhVSUyxxjrESFea9vONovKROra4L/view?usp=sharing) 29 | - Development Environment & Platforms [Video](https://www.youtube.com/watch?v=9kNjky0MQtc&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=3) | [slides](https://drive.google.com/file/d/1LTTHkwV8RirYaz1MeZtoYgTc9TRSrBwr/view?usp=sharing) 30 | 31 | - ***Introduction to Machine Learning (ML 101)*** [Video](https://www.youtube.com/watch?v=RmAGTZ7dy58&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=4) | [slides](https://drive.google.com/file/d/1HXsrSRPcBMW53lgnBnYb95m5eS9oLqRk/view?usp=sharing) 32 | 33 | - ## **Module 01** - Pandas and ML Pipelines in Python. Write your first serverless App. 34 | - Full Lecture: [Video](https://www.youtube.com/watch?v=j-XnCflCc0I) | [Slides](https://drive.google.com/file/d/1L8DHGC5xo0NlNe8xfh4xf4NZV1CEGBA6/view?usp=sharing) 35 | 36 | - [Lab](https://www.youtube.com/watch?v=zAD3miW0Og0) | [Slides](https://drive.google.com/file/d/1hve9nVrImRhNE8lE26zPcr3X1DDDk7uD/view?usp=sharing) | [Homework form](https://forms.gle/2p5odBdpAqvavH1T7) 37 | 38 | - ## **Module 02** - Data modeling and the Feature Store. The Credit-card fraud prediction service. 39 | - Full Lecture: [Video](https://youtu.be/tpxZh8lbcBk) | [Slides](https://drive.google.com/file/d/1HgAKsHnOms1XCtl_KIEuELudTLtDkhxk/view?usp=sharing) 40 | 41 | - [Lab](https://www.youtube.com/watch?v=niPayagVxFg) | [Slides](https://drive.google.com/file/d/1_1oDN5nfpWSUpKNlls45HLllQ75yAWd-/view?usp=sharing) | [Homework form](https://forms.gle/5g9XtaeBEigKEirGA) 42 | - ## **Module 03** - Training Pipelines, Inference Pipelines, and the Model Registry. 43 | - Full lecture: [Video](https://youtu.be/BD1UOJs1Bvo) | [Slides](https://drive.google.com/file/d/1XhfnH7DzwDqQKS6WxDVqWFFas0fi_jnJ/view?usp=sharing) 44 | 45 | - [Lab](https://youtu.be/QfzrKgLqEXc) | [Slides](https://drive.google.com/file/d/1jITx5HGh2uM5vAeknvCaeN6ZPOc2i8AS/view?usp=sharing) 46 | - ## **Module 04** - Serverless User Interfaces for Machine Learning Systems. 47 | - Full lecture: [Video](https://youtu.be/GgwIspMUovM) | [Slides](https://drive.google.com/file/d/10JzJCDwi6IPnJNZ0iApzbwACkAn3C9Y9/view?usp=sharing) 48 | 49 | - [Lab](https://youtu.be/sMhCXwm_Wmw) | [Slides](https://drive.google.com/file/d/1bASaZN68__Ut0RnSuTvhF8LKn240UPtE/view?usp=sharing) 50 | 51 | - ## **Module 05** - Principles and Practices of MLOps 52 | - Part 01: [Video](https://youtu.be/-vbLMtfoBeo) | [Slides](https://drive.google.com/file/d/1orKJJ2e_1pNgF8X6CFBUKw7qEDQVoVqt/view?usp=share_link) 53 | - Part 02: [Video](https://youtu.be/j4wZmywPs1E) | [Slides](https://drive.google.com/file/d/13r1OvuuV6Snq1r5PmAvwHU0dTiExQ4iE/view?usp=share_link) 54 | - Lab: [Video](https://youtu.be/BaAbiFsx25E) | [Slides](https://drive.google.com/file/d/1WOahxd4s9_NVr8JUUVJvvUFU6ea9konS/view?usp=share_link) 55 | 56 | - ## **Module 06** -Operational machine learning systems: Real-time Machine Learning. 
57 | - Full lecture: [Video](https://youtu.be/GEgiIh9a048) | [Slides](https://drive.google.com/file/d/1VXU2jxEUMIvIY_Xe7XSrNuy0yxXt8glP/view?usp=share_link) 58 | - Lab: [Video](https://youtu.be/DsyNk3A6ouA) | [Slides](https://drive.google.com/file/d/1nZJKFMvAFoAu4s5smuc-rIb9EBQVME0Z/view?usp=share_link) 59 | 60 | 61 | --- 62 | 63 | ## **Learning Outcomes:** 64 | - Learn to develop and operate AI-enabled (prediction) services on serverless infrastructure 65 | - Develop and run serverless feature pipelines 66 | - Deploy features and models to serverless infrastructure 67 | - Train models and run batch/inference pipelines 68 | - Develop a serverless UI for your prediction service 69 | - Learn MLOps fundamentals: versioning, testing, data validation, and operations 70 | - Develop and run a real-time serverless machine learning system 71 | 72 | ## **Course Contents:** 73 | - Pandas and ML Pipelines in Python. Write your first serverless App. 74 | - The Feature Store for Machine Learning. Feature engineering for a credit-card fraud serverless App. 75 | - Training Pipelines and Inference Pipelines 76 | - Bring a Prediction Service to Life with a User Interface (Gradio, Github Pages, Streamlit) 77 | - Automated Testing and Versioning of features and models 78 | - Real-time serverless machine learning systems. Project presentation. 79 | 80 | ## **Who is the target audience?** 81 | You have taken a course in machine learning (ML) and you can program in Python. You want to take the next step beyond training models on static datasets in notebooks. You want to be able to build a prediction service around your model. Maybe you work at an enterprise and want to demonstrate your models’ value to stakeholders in the stakeholders’ own language. Maybe you want to include ML in an existing application or system. 82 | 83 | ## **Why is this course different?** 84 | You don’t need any operations experience beyond using GitHub and writing Python code. You will learn the essentials of MLOps: versioning artifacts, testing artifacts, validating artifacts, and monitoring and upgrading running systems. You will work with raw and live data - you will need to engineer features in pipelines. You will learn how to select, extract, compute, and transform features. 85 | 86 | ## **Will this course cost me money?** 87 | No. You will become a serverless machine learning engineer without having to pay to run your serverless pipelines or to manage your features/models/user-interface. We will use GitHub Actions and Hopsworks, which both have generous time-unlimited free tiers. 88 | 89 | **Register now at [Serverless ML Course](https://www.serverless-ml.org/register)** 90 | 91 | ## **Timeline** 92 | _Self-paced_ 93 | 94 | ## **Requirements** 95 | - A **Python** environment including a notebook (Jupyter or Colaboratory) 96 | - https://github.com account 97 | - https://hopsworks.ai account 98 | 99 | # **Key Technologies** 100 | 101 | ## **Development environment** 102 | You can write, test, debug, and train your models in a Python IDE of your choice. We will focus on notebooks and Python programs. You can use Jupyter notebooks or Colaboratory. 103 | 104 | ## **Github** 105 | You will use GitHub to manage your code, GitHub Actions to run your workflows, and GitHub Pages to host the user interface of non-interactive applications. GitHub Actions offers a free tier of 500 MB of storage and 2,000 minutes per month to run your pipelines.
106 | https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions 107 | 108 | ## **Hopsworks** 109 | [Hopsworks.ai](https://app.hopsworks.ai) has a free tier of 10 GB of storage. 110 |
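How the two fit together in this course: a scheduled GitHub Actions workflow (see the workflow files at the top of this repository) injects the `HOPSWORKS_API_KEY` repository secret as an environment variable, and the notebooks it executes log in to Hopsworks and read or write feature groups. The sketch below is illustrative only, assuming the API key is set in the environment; the feature group name is hypothetical, and the real calls appear in the module notebooks.

```python
import hopsworks
import pandas as pd

# hopsworks.login() picks up the HOPSWORKS_API_KEY environment variable
# that the GitHub Actions workflows inject from the repository secrets.
project = hopsworks.login()
fs = project.get_feature_store()

# Write a toy DataFrame to a feature group, as the feature pipelines do.
# "example_features" is a hypothetical name used only for illustration.
df = pd.DataFrame({"id": [1, 2], "amount": [12.34, 66.29]})
fg = fs.get_or_create_feature_group(
    name="example_features",
    version=1,
    primary_key=["id"],
    description="Toy feature group for illustration",
)
fg.insert(df)
```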

111 | 112 | --- 113 | 114 | ## **Useful Resources** 115 | | name | Description | link | 116 | |------|-------------|------| 117 | |**Awesome MLOps**| A collection of links and resources for MLOps| https://github.com/visenger/awesome-mlops| 118 | |**Machine Learning Ops**| a collection of resources on how to facilitate Machine Learning Ops with GitHub.| https://mlops.githubapp.com/| 119 | |**MLOps Toys**| A curated list of MLOps projects.|https://mlops.toys/| 120 | |**MLOps Zoomcamp**| teaches practical aspects of productionizing ML services.|https://github.com/DataTalksClub/mlops-zoomcamp| 121 | |**PYSLACKERS**|A large open community for Python programming enthusiasts.|https://pyslackers.com/web| 122 | |**Feature Store Org**|An open community for everything feature stores.|https://www.featurestore.org| 123 | 124 | 125 | ## **Other MLOps Courses** 126 | | name | Description | link | 127 | |------|-------------|------| 128 | |**MlOps Zoomcamp**| DevOps style course with Python and Docker as prerequisites.| https://github.com/DataTalksClub/mlops-zoomcamp | 129 | |**Full Stack Deep Learning**| This course shares best practices for the full stack; topics range from problem selection to dataset management to monitoring.| https://fullstackdeeplearning.com/| 130 | |**MLOps course**| A series of lessons teaching how to apply ML to build production-grade products (by Goku Mohandas).|https://github.com/GokuMohandas/mlops-course | 131 | 132 | --- 133 | 134 | # **Definitions** 135 | 136 | - [Context windows for LLMs](http://www.hopsworks.ai/dictionary/context-window-for-llms) 137 | - [Compound AI Systems](https://www.hopsworks.ai/dictionary/compound-ai-systems) 138 | - [Feature Store](https://www.hopsworks.ai/dictionary/feature-store) 139 | - [Feature Monitoring](https://www.hopsworks.ai/dictionary/feature-monitoring) 140 | - [Feature Data](https://www.hopsworks.ai/dictionary/feature-data) 141 | - [Flash Attention](https://www.hopsworks.ai/dictionary/flash-attention) 142 | - [Function Calling with LLMs](https://www.hopsworks.ai/dictionary/function-calling-with-llms) 143 | - [Gradient Accumulation](https://www.hopsworks.ai/dictionary/gradient-accumulation) 144 | - [In Context Learning (ICL)](http://www.hopsworks.ai/dictionary/in-context-learning-icl) 145 | - [KServe](https://www.hopsworks.ai/dictionary/kserve) 146 | - [ML Logs](https://www.hopsworks.ai/dictionary/machine-learning-logs) 147 | - [ML Infrastructure](https://www.hopsworks.ai/dictionary/machine-learning-infrastructure) 148 | - [ML Observability](https://www.hopsworks.ai/dictionary/machine-learning-observability) 149 | - [ML Pipeline](https://www.hopsworks.ai/dictionary/ml-pipeline) 150 | - [ML Systems](https://www.hopsworks.ai/dictionary/ml-systems) 151 | - [Model Deployment](https://www.hopsworks.ai/dictionary/model-deployment) 152 | - [Model Monitoring](https://www.hopsworks.ai/dictionary/model-monitoring) 153 | - [Model Registry](https://www.hopsworks.ai/dictionary/model-registry) 154 | - [Model Serving](https://www.hopsworks.ai/dictionary/model-serving) 155 | - [PagedAttention](https://www.hopsworks.ai/dictionary/pagedattention) 156 | - [Prompt Store](https://www.hopsworks.ai/dictionary/prompt-store) 157 | - [Retrieval Augmented Generation (RAG) LLM](https://www.hopsworks.ai/dictionary/retrieval-augmented-generation-llm) 158 | - [RoPE Scaling](https://www.hopsworks.ai/dictionary/rope-scaling) 159 | - [Sample Packing](https://www.hopsworks.ai/dictionary/sample-packing) 160 | - [Similarity 
Search](http://www.hopsworks.ai/dictionary/similarity-search) 161 | 162 | # **Support and Partners** 163 |
[FSorg](https://www.featurestore.org) (partner logo)

[Hopsworks](https://www.hopsworks.ai) (partner logo)
175 | -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/README.md -------------------------------------------------------------------------------- /assets/actual_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/actual_iris.png -------------------------------------------------------------------------------- /assets/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/confusion_matrix.png -------------------------------------------------------------------------------- /assets/credit_cards.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/credit_cards.parquet -------------------------------------------------------------------------------- /assets/df_recent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/df_recent.png -------------------------------------------------------------------------------- /assets/images/card_horizontal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/images/card_horizontal.jpg -------------------------------------------------------------------------------- /assets/latest_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/latest_iris.png -------------------------------------------------------------------------------- /assets/profiles.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/profiles.parquet -------------------------------------------------------------------------------- /assets/transactions.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/assets/transactions.parquet -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | faker 2 | parsedatetime 3 | hopsworks 4 | nbconvert 5 | scikit-learn 6 | plotly 7 | Pillow 8 | seaborn 9 | dataframe-image 10 | streamlit_folium 11 | plotly 12 | -------------------------------------------------------------------------------- /src/00-intro/Pandas-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"f66fadac", 6 | "metadata": {}, 7 | "source": [ 8 | "## Pandas in 2 mins\n", 9 | "You can't learn Pandas in 2 mins, but here are some of the basics needed for this course.\n", 10 | "\n", 11 | "First, you can define a dict containing credit card payments, labeled as fraud or not-fraud, and create a Pandas DataFrame from it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "27b01f37", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "\n", 23 | "data = { \n", 24 | " 'credit_card_number': ['1111 2222 3333 4444', '1111 2222 3333 4444','1111 2222 3333 4444',\n", 25 | " '1111 2222 3333 4444'],\n", 26 | " 'trans_datetime': ['2022-01-01 08:44', '2022-01-01 19:44', '2022-01-01 20:44', '2022-01-01 20:55'],\n", 27 | " 'amount': [142.34, 12.34, 66.29, 112.33],\n", 28 | " 'location': ['Sao Paolo', 'Rio De Janeiro', 'Stockholm', 'Stockholm'],\n", 29 | " 'fraud': [False, False, True, True] \n", 30 | "}\n", 31 | "\n", 32 | "df = pd.DataFrame.from_dict(data)\n", 33 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 34 | "df" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "d0146eac", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "df" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "dd7889c9", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.info()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "ecc3bb3b", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 65 | "df.info()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "280b5ebb", 71 | "metadata": {}, 72 | "source": [ 73 | "### Lambda functions\n", 74 | "\n", 75 | "We will now apply a lambda function to the column `amount` and save the result in a new column `is_big` in our DataFrame `df`." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "73ba75de", 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "df['is_big'] = df['amount'].apply(lambda amount: amount > 100)\n", 88 | "df" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "f845b92e", 94 | "metadata": {}, 95 | "source": [ 96 | "### Apply and UDFs\n", 97 | "\n", 98 | "We will now apply a user-defined function (UDF), `is_small`, to each row in the data DataFrame `df`. \n", 99 | "The result is a series that we store in a new column in `df` called 'is_small'." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "36cf67ef", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def is_small(row):\n", 110 | " return row['amount'] < 100\n", 111 | " \n", 112 | "df['is_small'] = df.apply(is_small, axis=1)\n", 113 | "df" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "c678d9ba", 119 | "metadata": {}, 120 | "source": [ 121 | "## Rolling Windows\n", 122 | "\n", 123 | "We will compute a rolling window over the day." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "8bc7a844", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df_rolling = df.set_index('trans_datetime')\n", 134 | "df_rolling" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "4b3b6d2d", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "df_rolling['rolling_max_1d'] = df_rolling.rolling('1D').amount.max()\n", 145 | "df_rolling" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "12d55895", 151 | "metadata": {}, 152 | "source": [ 153 | "Let's create a new DataFrame, `d2`, with new data." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "f38554ad", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "import numpy as np\n", 164 | "import timeit \n", 165 | "\n", 166 | "df2 = pd.DataFrame({\n", 167 | " 'a':np.random.randint(1,100, size=10000),\n", 168 | " 'b':np.random.randint(100,1000, size=10000),\n", 169 | " 'c':np.random.random(10000)\n", 170 | "})\n", 171 | "df2.shape\n", 172 | "(100000, 3)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "36e93895", 178 | "metadata": {}, 179 | "source": [ 180 | "### Vectorized operations are faster than \"apply\" with UDFs\n", 181 | "\n", 182 | "We will see that apply is approximately 50 times slower than the equivalent vectorized operation on 100k rows.\n", 183 | "\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "b35aa5a2", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%timeit\n", 194 | "df2['a'].apply(lambda x: x**2)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "622dc43c", 200 | "metadata": {}, 201 | "source": [ 202 | "This vectorized operation is much faster" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "de746618", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "%%timeit\n", 213 | "df2['a'] ** 2" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "4aededa8", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "df2.describe()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "c40d50fe", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "df.trans_datetime.unique()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "361d75ee", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df.credit_card_number.nunique()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "8f7de134", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df.isnull().sum()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "b66d799c", 259 | "metadata": {}, 260 | "source": [ 261 | "## Transformations\n", 262 | "\n", 263 | "Plot a histogram with a long tail.\n", 264 | "Use numpy to seed the random number generator and generate a univariate data sample.\n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "32ebde28", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "import seaborn as sns\n", 275 | "\n", 276 | "from numpy.random import seed\n", 277 | "from numpy.random import randn\n", 278 | "from numpy.random import rand\n", 279 | "from 
numpy import append\n", 280 | "seed(1)\n", 281 | "array = 5 * randn(100) + 10\n", 282 | "tail = 10 + (rand(50) * 100)\n", 283 | "array = append(array, tail)\n", 284 | "sns.histplot(array)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "262bf19c", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "columns = ['amount']\n", 295 | "df_exp = pd.DataFrame(data = array, columns = columns)\n", 296 | " \n", 297 | "df_exp.describe()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "bb560fa4", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "df_exp" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "31a8bac9", 313 | "metadata": {}, 314 | "source": [ 315 | "## Standard Scalar in Vectorized Pandas\n", 316 | "\n", 317 | "This is an efficient way to transform our input Pandas column into a range of [0.0, 1.]" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "ae928d6c", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Min-Max Normalization in Pandas\n", 328 | "df_norm = (df_exp-df_exp.min())/(df_exp.max()-df_exp.min())\n", 329 | "df_norm.head()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "bca3a9f9", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "sns.histplot(df_norm)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "id": "ff81e054", 345 | "metadata": {}, 346 | "source": [ 347 | "## Power Transformer in Scikit-Learn\n", 348 | "\n", 349 | "Scikit-Learn has many different transformation libraries.\n", 350 | "For heavy-tailed distributions, it is often recommended to perform a [power transformation](\n", 351 | "https://towardsdatascience.com/how-to-differentiate-between-scaling-normalization-and-log-transformations-69873d365a94)\n", 352 | "\n", 353 | "We can see in the histogram, this produces a more Gaussian (normal) distribution than the MinMax Scalar." 
354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "85f5e6d6", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "from sklearn.preprocessing import PowerTransformer\n", 364 | "\n", 365 | "pt = PowerTransformer()\n", 366 | "\n", 367 | "df_power = pd.DataFrame(\n", 368 | " pt.fit_transform(df_exp[[\"amount\"]]), columns=[\"amount\"]\n", 369 | ")\n", 370 | "\n", 371 | "sns.histplot(df_power)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "id": "1ced0dce", 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | } 382 | ], 383 | "metadata": { 384 | "kernelspec": { 385 | "display_name": "Python 3 (ipykernel)", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.9.7" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 5 404 | } 405 | -------------------------------------------------------------------------------- /src/00-intro/green-apples-vs-oranges.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6b138a28", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[0 1]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn \n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn import tree \n", 21 | "\n", 22 | "# 4 examples of features with [red-color, green-color]\n", 23 | "features = [[0,120], [0, 110], [250, 150], [255, 163]]\n", 24 | "# green apples == 0; oranges == 1\n", 25 | "labels = [0, 0, 1, 1]\n", 26 | "\n", 27 | "clf = tree.DecisionTreeClassifier()\n", 28 | "clf = clf.fit(features, labels)\n", 29 | "\n", 30 | "test_fruits = [[0,128], [249, 155]]\n", 31 | "test_labels = [0, 1] \n", 32 | "pred_labels = clf.predict(test_fruits)\n", 33 | "print(pred_labels)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "b40db72f", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3 (ipykernel)", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.9.7" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 5 66 | } 67 | -------------------------------------------------------------------------------- /src/00-intro/red-and-green-apples-vs-oranges.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "4948e813", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[0 1 1 2]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn \n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "\n", 21 | "# [green_apple(0,120), green_apple(0,110), 
orange(250,150), orange(255, 163), red_apple(255,0), red_apple(240,0)]\n", 22 | "features = [[0,120], [75, 40], [60, 60], [255, 163], [255, 0], [240, 0]]\n", 23 | "\n", 24 | "# [green_apple, green_apple, orange, orange, red_apple, red_apple]\n", 25 | "labels = [0, 0, 1, 1, 2, 2]\n", 26 | "\n", 27 | "clf = LogisticRegression()\n", 28 | "clf = clf.fit(features, labels)\n", 29 | "\n", 30 | "# (66,66) is labelled as a green apple\n", 31 | "test_features = [[0,110], [66, 66], [249, 155], [245, 0]]\n", 32 | "test_labels = [0, 1, 0, 2] \n", 33 | "pred_labels = clf.predict(test_features)\n", 34 | "\n", 35 | "print(pred_labels)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "f3ad083c", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.9.7" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 5 68 | } 69 | -------------------------------------------------------------------------------- /src/00-intro/streamlit-example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | import numpy as np 4 | 5 | st.title("Streamlit for ServerlessML") 6 | st.header("Easy UI in Python with Streamlit") 7 | 8 | chart_data = pd.DataFrame(np.random.randn(30, 3), 9 | columns=["Data Engineers", "Data Scientists", "ML Engineers"]) 10 | 11 | st.bar_chart(chart_data) 12 | -------------------------------------------------------------------------------- /src/01-module/assets/Setosa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/Setosa.png -------------------------------------------------------------------------------- /src/01-module/assets/Versicolor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/Versicolor.png -------------------------------------------------------------------------------- /src/01-module/assets/Virginica.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/Virginica.png -------------------------------------------------------------------------------- /src/01-module/assets/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/confusion_matrix.png -------------------------------------------------------------------------------- /src/01-module/assets/iris.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/assets/iris.png -------------------------------------------------------------------------------- /src/01-module/iris_model/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/iris_model/confusion_matrix.png -------------------------------------------------------------------------------- /src/01-module/iris_model/iris_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/01-module/iris_model/iris_model.pkl -------------------------------------------------------------------------------- /src/01-module/orchest/clean repository.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f0db3065", 6 | "metadata": {}, 7 | "source": [ 8 | "### This notebooks will setup our Github Credentials, and make sure that remote and local repository are synced." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "fef79a2b-3616-40f0-8b08-a4ce2381474c", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# get the environement variable for the token\n", 19 | "import os\n", 20 | "secret = os.environ['GIT_TOKEN']\n", 21 | "account = os.environ['ACCOUNT']\n", 22 | "repo_url = os.environ['REPO']" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "f847dbb1-ce8f-4210-a3b8-1f23bbc604fb", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from datetime import datetime\n", 33 | "from git import Repo\n", 34 | "import git" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "76887f5c-dffe-413c-8a3c-7af077a46747", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Setup \n", 45 | "full_local_path = \"/project-dir/\"\n", 46 | "repo = git.Repo('/project-dir/')\n", 47 | "\n", 48 | "remote = f\"https://{secret}@github.com/{account}/{repo_url}.git\"\n", 49 | "repo = Repo(full_local_path)\n", 50 | "\n", 51 | "origin = repo.remote(name=\"origin\") \n", 52 | "if origin.url != remote:\n", 53 | " origin.set_url(remote, origin.url)\n", 54 | "\n", 55 | "# uncomment if you need to pull\n", 56 | "# origin.pull()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 14, 62 | "id": "e0bc7741-4b28-4c33-a261-20b97dec0267", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "\"Your branch is up to date with 'origin/gh-pages'.\"" 69 | ] 70 | }, 71 | "execution_count": 14, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "repo.git.checkout('gh-pages', force=True)\n", 78 | "\n", 79 | "# Going back to the main branch\n", 80 | "repo.git.checkout('main', force=True)\n", 81 | "\n", 82 | "# List remotes\n", 83 | "print('Remotes:')\n", 84 | "for remote in repo.remotes:\n", 85 | " print(f'- {remote.name} {remote.url}')" 86 | ] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3.10.6 64-bit", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 
| "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.10.6" 106 | }, 107 | "vscode": { 108 | "interpreter": { 109 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 110 | } 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 5 115 | } 116 | -------------------------------------------------------------------------------- /src/01-module/orchest/iris-batch-inference-pipeline-orchest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower - Batch Prediction\n", 10 | "\n", 11 | "\n", 12 | "In this notebook we will, \n", 13 | "\n", 14 | "1. Load the batch inference data that arrived in the last 24 hours\n", 15 | "2. Predict the first Iris Flower found in the batch\n", 16 | "3. Write the ouput png of the Iris flower predicted, to be displayed in Github Pages." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "id": "xRtpj-psbpG8" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import hopsworks\n", 29 | "import joblib\n", 30 | "\n", 31 | "project = hopsworks.login()\n", 32 | "fs = project.get_feature_store()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "mr = project.get_model_registry()\n", 42 | "# model = mr.get_model(\"iris\", version=1) # selecting a specific model\n", 43 | "model = mr.get_best_model(\"iris\",'accuracy', 'max') # selecting the best model for accuracy\n", 44 | "model_dir = model.download()\n", 45 | "model = joblib.load(model_dir + \"/iris_model.pkl\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "We are downloading the 'raw' iris data. We explicitly do not want transformed data, reading for training. \n", 53 | "\n", 54 | "So, let's download the iris dataset, and preview some rows. \n", 55 | "\n", 56 | "Note, that it is 'tabular data'. There are 5 columns: 4 of them are \"features\", and the \"variety\" column is the **target** (what we are trying to predict using the 4 feature values in the target's row)." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "colab": { 64 | "base_uri": "https://localhost:8080/", 65 | "height": 206 66 | }, 67 | "id": "nRmFM7vcbpHA", 68 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "feature_view = fs.get_feature_view(name=\"iris\", version=1)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Now we will do some **Batch Inference**. \n", 80 | "\n", 81 | "We will read all the input features that have arrived in the last 24 hours, and score them." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "id": "uHuAD3ttP8Ep" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import datetime\n", 93 | "from PIL import Image\n", 94 | "\n", 95 | "batch_data = feature_view.get_batch_data()\n", 96 | "\n", 97 | "y_pred = model.predict(batch_data)\n", 98 | "\n", 99 | "y_pred" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "batch_data" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Batch prediction output is the last entry in the batch - it is output as a file 'latest_iris.png'" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "flower = y_pred[y_pred.size-1]\n", 125 | "flower_img = \"../assets/\" + flower + \".png\"\n", 126 | "img = Image.open(flower_img) \n", 127 | "\n", 128 | "img.save(\"../../../assets/latest_iris.png\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "iris_fg = fs.get_feature_group(name=\"iris\", version=1)\n", 138 | "df = iris_fg.read()\n", 139 | "df" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "label = df.iloc[-1][\"variety\"]\n", 149 | "label" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "label_flower = \"../assets/\" + label + \".png\"\n", 159 | "\n", 160 | "img = Image.open(label_flower) \n", 161 | "\n", 162 | "img.save(\"../../../assets/actual_iris.png\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "import pandas as pd\n", 172 | "\n", 173 | "monitor_fg = fs.get_or_create_feature_group(name=\"iris_predictions\",\n", 174 | " version=1,\n", 175 | " primary_key=[\"datetime\"],\n", 176 | " description=\"Iris flower Prediction/Outcome Monitoring\"\n", 177 | " )" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "from datetime import datetime\n", 187 | "now = datetime.now().strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 188 | "\n", 189 | "data = {\n", 190 | " 'prediction': [flower],\n", 191 | " 'label': [label],\n", 192 | " 'datetime': [now],\n", 193 | "}\n", 194 | "monitor_df = pd.DataFrame(data)\n", 195 | "monitor_fg.insert(monitor_df)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "history_df = monitor_fg.read()\n", 205 | "history_df" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "import dataframe_image as dfi\n", 215 | "\n", 216 | "df_recent = history_df.tail(5)\n", 217 | " \n", 218 | "# If you exclude this image, you may have the same iris_latest.png and iris_actual.png files\n", 219 | "# If no files have changed, the GH-action 'git commit/push' stage fails, failing your GH action (last step)\n", 220 | "# This image, however, is always new, ensuring git commit/push will succeed.\n", 221 | "dfi.export(df_recent, 
'../../../assets/df_recent.png', table_conversion = 'matplotlib')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "from sklearn.metrics import confusion_matrix\n", 231 | "\n", 232 | "predictions = history_df[['prediction']]\n", 233 | "labels = history_df[['label']]\n", 234 | "\n", 235 | "results = confusion_matrix(labels, predictions)\n", 236 | "print(results)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "from matplotlib import pyplot\n", 246 | "import seaborn as sns\n", 247 | "\n", 248 | "# Only create the confusion matrix when our iris_predictions feature group has examples of all 3 iris flowers\n", 249 | "if results.shape == (3,3):\n", 250 | "\n", 251 | " df_cm = pd.DataFrame(results, ['True Setosa', 'True Versicolor', 'True Virginica'],\n", 252 | " ['Pred Setosa', 'Pred Versicolor', 'Pred Virginica'])\n", 253 | "\n", 254 | " cm = sns.heatmap(df_cm, annot=True)\n", 255 | "\n", 256 | " fig = cm.get_figure()\n", 257 | " fig.savefig(\"../../../assets/confusion_matrix.png\") \n", 258 | " df_cm\n", 259 | "else:\n", 260 | " print(\"Run the batch inference pipeline more times until you get 3 different iris flowers\") " 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "colab": { 266 | "collapsed_sections": [], 267 | "provenance": [] 268 | }, 269 | "kernelspec": { 270 | "display_name": "Python 3.10.6 64-bit", 271 | "language": "python", 272 | "name": "python3" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 3 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython3", 284 | "version": "3.10.6" 285 | }, 286 | "vscode": { 287 | "interpreter": { 288 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 289 | } 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 1 294 | } 295 | -------------------------------------------------------------------------------- /src/01-module/orchest/iris-train-pipeline-orchest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower Train and Publish Model\n", 10 | "\n", 11 | "\n", 12 | "In this notebook we will, \n", 13 | "\n", 14 | "1. Load the Iris Flower dataset into random split (train/test) DataFrames using a Feature View\n", 15 | "2. Train a KNN Model using SkLearn\n", 16 | "3. Evaluate model performance on the test set\n", 17 | "4. 
Register the model with Hopsworks Model Registry" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!pip install -U hopsworks --quiet" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "xRtpj-psbpG8" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.neighbors import KNeighborsClassifier\n", 38 | "from sklearn.metrics import accuracy_score\n", 39 | "import pandas as pd\n", 40 | "import seaborn as sns\n", 41 | "import hopsworks" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Let's first get a feature_view for the iris flower dataset, or create one if it does not already exist.\n", 49 | "If you are running this notebook for the first time, it will create the feature view, which contains all of the columns from the **iris feature group**.\n", 50 | "\n", 51 | "There are 5 columns: 4 of them are \"features\", and the **variety** column is the **label** (what we are trying to predict using the 4 feature values in the label's row). The label is often called the **target**." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "colab": { 59 | "base_uri": "https://localhost:8080/", 60 | "height": 206 61 | }, 62 | "id": "nRmFM7vcbpHA", 63 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "project = hopsworks.login()\n", 68 | "fs = project.get_feature_store()\n", 69 | "\n", 70 | "try: \n", 71 | " feature_view = fs.get_feature_view(name=\"iris\", version=1)\n", 72 | "except:\n", 73 | " iris_fg = fs.get_feature_group(name=\"iris\", version=1)\n", 74 | " query = iris_fg.select_all()\n", 75 | " feature_view = fs.create_feature_view(name=\"iris\",\n", 76 | " version=1,\n", 77 | " description=\"Read from Iris flower dataset\",\n", 78 | " labels=[\"variety\"],\n", 79 | " query=query)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We will read our features and labels split into a **train_set** and a **test_set**. You split your data into a train_set and a test_set, because you want to train your model on only the train_set, and then evaluate its performance on data that was not seen during training, the test_set. This technique helps evaluate the ability of your model to accurately predict on data it has not seen before.\n", 87 | "\n", 88 | "We can ask the feature_view to return a **train_test_split** and it returns:\n", 89 | "\n", 90 | "* **X_** is a vector of features, so **X_train** is a vector of features from the **train_set**. \n", 91 | "* **y_** is a scale of labels, so **y_train** is a scalar of labels from the **train_set**. \n", 92 | "\n", 93 | "Note: a vector is an array of values and a scalar is a single value.\n", 94 | "\n", 95 | "Note: that mathematical convention is that a vector is denoted by an uppercase letter (hence \"X\") and a scalar is denoted by a lowercase letter (hence \"y\").\n", 96 | "\n", 97 | "**X_test** is the features and **y_test** is the labels from our holdout **test_set**. The **test_set** is used to evaluate model performance after the model has been trained." 
98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "id": "JR8HeEs6bpHB" 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "X_train, X_test, y_train, y_test = feature_view.train_test_split(0.2, )" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "y_train" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Now, we can fit a model to our features and labels from our training set (**X_train** and **y_train**). \n", 125 | "\n", 126 | "Fitting a model to a dataset is more commonly called \"training a model\"." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "PNZcUPHJPIu9", 137 | "outputId": "389acb4d-74ff-46f1-dee8-a7c27ee79a09" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "model = KNeighborsClassifier(n_neighbors=2)\n", 142 | "model.fit(X_train, y_train.values.ravel())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Now, we have trained our model. We can evaluate our model on the **test_set** to estimate its performance." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "id": "uHuAD3ttP8Ep" 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "y_pred = model.predict(X_test)\n", 161 | "y_pred" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "We can report on how accurate these predictions (**y_pred**) are compared to the labels (the actual results - **y_test**). " 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "colab": { 176 | "base_uri": "https://localhost:8080/" 177 | }, 178 | "id": "b8EC4_SvbpHE", 179 | "outputId": "5d73b375-76f0-4518-8e88-4db23e8f2486" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "from sklearn.metrics import classification_report\n", 184 | "\n", 185 | "metrics = classification_report(y_test, y_pred, output_dict=True)\n", 186 | "print(metrics)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from sklearn.metrics import confusion_matrix\n", 196 | "\n", 197 | "results = confusion_matrix(y_test, y_pred)\n", 198 | "print(results)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Notice in the confusion matrix results that we have 1 or 2 incorrect predictions.\n", 206 | "We have only 30 flowers in our test set - **y_test**.\n", 207 | "Our model predicted 1 or 2 flowers were of type \"Virginica\", but the flowers were, in fact, \"Versicolor\"." 
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "from matplotlib import pyplot\n", 217 | "\n", 218 | "df_cm = pd.DataFrame(results, ['True Setosa', 'True Versicolor', 'True Virginica'],\n", 219 | " ['Pred Setosa', 'Pred Versicolor', 'Pred Virginica'])\n", 220 | "\n", 221 | "cm = sns.heatmap(df_cm, annot=True)\n", 222 | "\n", 223 | "fig = cm.get_figure()\n", 224 | "fig.savefig(\"../assets/confusion_matrix.png\") \n", 225 | "fig.show()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Register the Model with Hopsworks Model Registry\n", 233 | "\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "from hsml.schema import Schema\n", 243 | "from hsml.model_schema import ModelSchema\n", 244 | "import os\n", 245 | "import joblib\n", 246 | "import hopsworks\n", 247 | "import shutil\n", 248 | "\n", 249 | "project = hopsworks.login()\n", 250 | "mr = project.get_model_registry()\n", 251 | "\n", 252 | "# The 'iris_model' directory will be saved to the model registry\n", 253 | "model_dir=\"iris_model\"\n", 254 | "if os.path.isdir(model_dir) == False:\n", 255 | " os.mkdir(model_dir)\n", 256 | "joblib.dump(model, model_dir + \"/iris_model.pkl\")\n", 257 | "shutil.copyfile(\"../assets/confusion_matrix.png\", model_dir + \"/confusion_matrix.png\")\n", 258 | "\n", 259 | "input_example = X_train.sample()\n", 260 | "input_schema = Schema(X_train)\n", 261 | "output_schema = Schema(y_train)\n", 262 | "model_schema = ModelSchema(input_schema, output_schema)\n", 263 | "\n", 264 | "iris_model = mr.python.create_model(\n", 265 | "# version=1, #removing version to incrementally create a new version at each training\n", 266 | " name=\"iris\", \n", 267 | " metrics={\"accuracy\" : metrics['accuracy']},\n", 268 | " model_schema=model_schema,\n", 269 | " input_example=input_example, \n", 270 | " description=\"Iris Flower Predictor\")\n", 271 | "\n", 272 | "iris_model.save(model_dir)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "colab": { 285 | "collapsed_sections": [], 286 | "provenance": [] 287 | }, 288 | "kernelspec": { 289 | "display_name": "Python 3 (ipykernel)", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.9.7" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 1 308 | } 309 | -------------------------------------------------------------------------------- /src/01-module/orchest/push-work.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "dde7af30", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# get the environement variable for the token\n", 11 | "import os\n", 12 | "secret = os.environ['GIT_TOKEN']\n", 13 | "account = os.environ['ACCOUNT']\n", 14 | "repo_url = os.environ['REPO']" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | 
"id": "e0bc7741-4b28-4c33-a261-20b97dec0267", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from datetime import datetime\n", 25 | "from git import Repo\n", 26 | "import git\n", 27 | "import shutil\n", 28 | "\n", 29 | "# Setup \n", 30 | "full_local_path = \"/project-dir/\"\n", 31 | "repo = git.Repo('/project-dir/')\n", 32 | "\n", 33 | "remote = f\"https://{secret}@github.com/{account}/{repo_url}.git\"\n", 34 | "repo = Repo(full_local_path)\n", 35 | "\n", 36 | "origin = repo.remote(name=\"origin\") \n", 37 | "if origin.url != remote:\n", 38 | " origin.set_url(remote, origin.url)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "f2911067", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# move the files to the /data folder in orchest\n", 49 | "assets_folder = r\"../../../assets/\"\n", 50 | "env_folder = r\"/data/\"\n", 51 | "files_to_move = ['latest_iris.png', 'actual_iris.png', 'confusion_matrix.png','df_recent.png']\n", 52 | "\n", 53 | "for file in files_to_move:\n", 54 | " # construct full file path\n", 55 | " source = assets_folder + file\n", 56 | " destination = env_folder + file\n", 57 | " # move file\n", 58 | " shutil.move(source, destination)\n", 59 | "\n", 60 | "# move to the branch for pages\n", 61 | "repo.git.checkout('gh-pages', force=True)\n", 62 | "\n", 63 | "#move back to an asset folder in the gh-pages branch \n", 64 | "for file in files_to_move:\n", 65 | " # construct full file path\n", 66 | " source = env_folder + file\n", 67 | " destination = assets_folder + file\n", 68 | " # move file\n", 69 | " shutil.move(source, destination)\n", 70 | "\n", 71 | "# Add our file, and set our commit\n", 72 | "repo.git.add('assets/latest_iris.png', 'assets/actual_iris.png', 'assets/confusion_matrix.png', 'assets/df_recent.png')\n", 73 | "current = datetime.now()\n", 74 | "repo.index.commit(f'New prediction! 
time and date: {current}')\n", 75 | "\n", 76 | "# Push to the pages repository\n", 77 | "origin.push()\n", 78 | "\n", 79 | "# Going back to the main branch\n", 80 | "repo.git.checkout('main', force=True)" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3.10.6 64-bit", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.10.6" 101 | }, 102 | "vscode": { 103 | "interpreter": { 104 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 105 | } 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 5 110 | } 111 | -------------------------------------------------------------------------------- /src/01-module/scripts/run-feature-and-prediction-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/01-module 6 | 7 | jupyter nbconvert --to notebook --execute iris-feature-pipeline.ipynb 8 | jupyter nbconvert --to notebook --execute iris-batch-inference-pipeline.ipynb 9 | 10 | -------------------------------------------------------------------------------- /src/02-module/scripts/run-fraud-feature-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/02-module 6 | 7 | jupyter nbconvert --to notebook --execute 2_cc_feature_pipeline.ipynb 8 | 9 | -------------------------------------------------------------------------------- /src/02-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | if isinstance(long, pd.Series): 32 | long = long.map(lambda x: (x)) 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | lat = lat.map(lambda x: (x)) 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(long, pd.Series): 42 | prev_long = prev_long.map(lambda x: (x)) 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(lat, pd.Series): 47 | prev_lat = prev_lat.map(lambda x: (x)) 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute time difference between each consecutive transaction.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 83 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 84 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 95 | 96 | # Convert time_delta from seconds to days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country", "age_at_transaction"\ 100 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 101 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 102 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 103 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 104 | return trans_df 105 | 106 | 107 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 108 | 109 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 110 | 111 | # Moving average of transaction volume. 112 | df_mavg = pd.DataFrame(cc_group.mean()) 113 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 114 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 115 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 116 | df_mavg = df_mavg.sort_index() 117 | 118 | # Moving standard deviation of transaction volume. 119 | df_std = pd.DataFrame(cc_group.mean()) 120 | df_std.columns = ["trans_volume_mstd", "datetime"] 121 | df_std = df_std.reset_index(level=["cc_num"]) 122 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 123 | df_std = df_std.fillna(0) 124 | df_std = df_std.sort_index() 125 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 126 | 127 | # Moving average of transaction frequency. 128 | df_count = pd.DataFrame(cc_group.mean()) 129 | df_count.columns = ["trans_freq", "datetime"] 130 | df_count = df_count.reset_index(level=["cc_num"]) 131 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 132 | df_count = df_count.sort_index() 133 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 134 | 135 | # Moving average of location difference between consecutive transactions. 
136 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 137 | df_loc_delta_mavg = pd.DataFrame(cc_group) 138 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 139 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 140 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 142 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 143 | 144 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 145 | 146 | return window_aggs_df 147 | -------------------------------------------------------------------------------- /src/02-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object + datetime.timedelta(days=-728) 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
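The docstrings in sml/cc_features.py ask for unit tests that use DataFrames with sample data, while the existing test module below only exercises synthetic_data.generate_atm_withdrawal. A minimal sketch of such a test for cc_features.expiry_days — the function exists in the module, but the test name and sample values here are hypothetical:

import pandas as pd
from sml import cc_features


def test_expiry_days_computes_days_until_expiry():
    # Hypothetical sample data: one transaction on 2022-01-01 with a card
    # that expires 03/22 (parsed as 2022-03-01 by format="%m/%y").
    trans_df = pd.DataFrame({
        "cc_num": [1111],
        "datetime": pd.to_datetime(["2022-01-01"]),
    })
    credit_cards_df = pd.DataFrame({
        "cc_num": [1111],
        "expires": ["03/22"],
    })

    result = cc_features.expiry_days(trans_df, credit_cards_df)

    # 2022-01-01 to 2022-03-01 is 59 days.
    assert "days_until_card_expires" in result.columns
    assert result.loc[0, "days_until_card_expires"] == 59.0

Such a test would sit alongside test_sml.py and run under the same pytest invocation, assuming the sml package is importable in the same way.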
/src/02-module/test_sml/test_sml.py: -------------------------------------------------------------------------------- 1 | from sml import synthetic_data 2 | from unittest import TestCase 3 | import pytest 4 | from contextlib import nullcontext as does_not_raise 5 | 6 | @pytest.mark.parametrize( 7 | "credit_card_number, cash_amounts, length, delta, radius, country_code, excp", 8 | [("1111 2222 3333 4444",[112.10, 11.23], 1, 1, 10.0, 'US', does_not_raise()) 9 | ,("1111 2222 3333 44",[-12.00], -1, 1, 1.0, 'IE', pytest.raises(Exception))] 10 | ) 11 | def test_generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 12 | delta: int, radius: float, country_code, excp): 13 | with excp: 14 | synthetic_data.generate_atm_withdrawal(credit_card_number, cash_amounts, length, delta, radius, country_code) 15 | 16 | -------------------------------------------------------------------------------- /src/03-module/scripts/run-fraud-batch-inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/03-module 6 | 7 | jupyter nbconvert --to notebook --execute 4_batch_predictions.ipynb 8 | -------------------------------------------------------------------------------- /src/04-module/app.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import numpy as np 3 | from PIL import Image 4 | import requests 5 | 6 | import hopsworks 7 | import joblib 8 | 9 | project = hopsworks.login() 10 | fs = project.get_feature_store() 11 | 12 | 13 | mr = project.get_model_registry() 14 | model = mr.get_model("iris", version=1) 15 | model_dir = model.download() 16 | model = joblib.load(model_dir + "/iris_model.pkl") 17 | 18 | 19 | def iris(sepal_length, sepal_width, petal_length, petal_width): 20 | input_list = [] 21 | input_list.append(sepal_length) 22 | input_list.append(sepal_width) 23 | input_list.append(petal_length) 24 | input_list.append(petal_width) 25 | # 'res' is a list of predictions returned as the label. 26 | res = model.predict(np.asarray(input_list).reshape(1, -1)) 27 | # We add '[0]' to the result of the transformed 'res', because 'res' is a list, and we only want 28 | # the first element. 
29 | flower_url = "https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/main/src/01-module/assets/" + res[0] + ".png" 30 | img = Image.open(requests.get(flower_url, stream=True).raw) 31 | return img 32 | 33 | demo = gr.Interface( 34 | fn=iris, 35 | title="Iris Flower Predictive Analytics", 36 | description="Experiment with sepal/petal lengths/widths to predict which flower it is.", 37 | allow_flagging="never", 38 | inputs=[ 39 | gr.inputs.Number(default=1.0, label="sepal length (cm)"), 40 | gr.inputs.Number(default=1.0, label="sepal width (cm)"), 41 | gr.inputs.Number(default=1.0, label="petal length (cm)"), 42 | gr.inputs.Number(default=1.0, label="petal width (cm)"), 43 | ], 44 | outputs=gr.Image(type="pil")) 45 | 46 | demo.launch() 47 | -------------------------------------------------------------------------------- /src/04-module/cc-fraud-streamlit-ui.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import joblib 3 | from math import radians 4 | from sml import cc_features 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import plotly.express as px 9 | from matplotlib import pyplot 10 | import warnings 11 | 12 | import hopsworks 13 | from sml import synthetic_data 14 | 15 | import streamlit as st 16 | 17 | import folium 18 | from streamlit_folium import st_folium 19 | import json 20 | 21 | start_date = (datetime.datetime.now() - datetime.timedelta(hours=200)) 22 | end_date = (datetime.datetime.now()) 23 | 24 | synthetic_data.set_random_seed(12345) 25 | credit_cards = [cc["cc_num"] for cc in synthetic_data.generate_list_credit_card_numbers()] 26 | lat = 0 27 | long = 0 28 | 29 | warnings.filterwarnings("ignore") 30 | 31 | project = hopsworks.login() 32 | fs = project.get_feature_store() 33 | 34 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 35 | def retrieve_dataset(fv, start_date, end_date): 36 | st.write(36 * "-") 37 | print_fancy_header('\n💾 Dataset Retrieving...') 38 | batch_data = fv.get_batch_data(start_time = start_date, end_time = end_date) 39 | batch_data.drop(["tid", "cc_num", "datetime"], axis = 1, inplace=True) 40 | return batch_data 41 | 42 | 43 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 44 | def get_feature_view(): 45 | fv = fs.get_feature_view("cc_trans_fraud", 1) 46 | return fv 47 | 48 | 49 | @st.cache(allow_output_mutation=True,suppress_st_warning=True) 50 | def get_model(project = project): 51 | mr = project.get_model_registry() 52 | model = mr.get_model("cc_fraud", version = 1) 53 | model_dir = model.download() 54 | return joblib.load(model_dir + "/cc_fraud_model.pkl") 55 | 56 | def explore_data(batch_data): 57 | st.write(36 * "-") 58 | print_fancy_header('\n👁 Data Exploration...') 59 | labels = ["Suspected of Fraud", "Not Suspected of Fraud"] 60 | unique, counts = np.unique(batch_data.fraud.values, return_counts=True) 61 | values = counts.tolist() 62 | 63 | def plot_pie(values, labels): 64 | fig = px.pie(values=values, names=labels, title='Distribution of predicted fraud transactions') 65 | return fig 66 | 67 | fig1 = plot_pie(values, labels) 68 | st.plotly_chart(fig1) 69 | 70 | 71 | def print_fancy_header(text, font_size=24): 72 | res = f'{text}' 73 | st.markdown(res, unsafe_allow_html=True) 74 | 75 | def transform_preds(predictions): 76 | return ['Fraud' if pred == 1 else 'Not Fraud' for pred in predictions] 77 | 78 | progress_bar = st.sidebar.header('⚙️ Working Progress') 79 | progress_bar = st.sidebar.progress(0) 80 | st.title('🆘 Fraud 
transactions detection 🆘') 81 | 82 | st.write(36 * "-") 83 | print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...') 84 | 85 | st.write(36 * "-") 86 | print_fancy_header('\n🤖 Connecting to Model Registry on Hopsworks...') 87 | model = get_model(project) 88 | st.write(model) 89 | st.write("✅ Connected!") 90 | 91 | progress_bar.progress(40) 92 | 93 | st.write(36 * "-") 94 | print_fancy_header('\n✨ Fetch batch data and predict') 95 | fv = get_feature_view() 96 | 97 | 98 | if st.button('📊 Make a prediction'): 99 | batch_data = retrieve_dataset(fv, start_date, end_date) 100 | st.write("✅ Retrieved!") 101 | progress_bar.progress(55) 102 | predictions = model.predict(batch_data) 103 | predictions = transform_preds(predictions) 104 | batch_data_to_explore = batch_data.copy() 105 | batch_data_to_explore['fraud'] = predictions 106 | explore_data(batch_data_to_explore) 107 | 108 | st.button("Re-run") 109 | -------------------------------------------------------------------------------- /src/04-module/requirements-gradio.txt: -------------------------------------------------------------------------------- 1 | hopsworks 2 | joblib 3 | scikit-learn 4 | -------------------------------------------------------------------------------- /src/04-module/run-fraud-streamlit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ "$HOPSWORKS_API_KEY" == "" ] ; then 5 | echo "Enter your HOPSWORKS_API_KEY:" 6 | read KEY 7 | export HOPSWORKS_API_KEY="$KEY" 8 | fi 9 | 10 | if [ "$HOPSWORKS_PROJECT" == "" ] ; then 11 | echo "Enter the name of your project on Hopsworks:" 12 | read proj 13 | export HOPSWORKS_PROJECT=$proj 14 | export 15 | 16 | python -m streamlit run cc-fraud-streamlit-ui.py 17 | -------------------------------------------------------------------------------- /src/04-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | if isinstance(long, pd.Series): 32 | long = long.map(lambda x: (x)) 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | lat = lat.map(lambda x: (x)) 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(long, pd.Series): 42 | prev_long = prev_long.map(lambda x: (x)) 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(lat, pd.Series): 47 | prev_lat = prev_lat.map(lambda x: (x)) 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute time difference between each consecutive transaction.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 83 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 84 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 95 | 96 | # Convert time_delta from seconds to days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country", "age_at_transaction"\ 100 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 101 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 102 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 103 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 104 | return trans_df 105 | 106 | 107 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 108 | 109 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 110 | 111 | # Moving average of transaction volume. 112 | df_mavg = pd.DataFrame(cc_group.mean()) 113 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 114 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 115 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 116 | df_mavg = df_mavg.sort_index() 117 | 118 | # Moving standard deviation of transaction volume. 119 | df_std = pd.DataFrame(cc_group.mean()) 120 | df_std.columns = ["trans_volume_mstd", "datetime"] 121 | df_std = df_std.reset_index(level=["cc_num"]) 122 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 123 | df_std = df_std.fillna(0) 124 | df_std = df_std.sort_index() 125 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 126 | 127 | # Moving average of transaction frequency. 128 | df_count = pd.DataFrame(cc_group.mean()) 129 | df_count.columns = ["trans_freq", "datetime"] 130 | df_count = df_count.reset_index(level=["cc_num"]) 131 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 132 | df_count = df_count.sort_index() 133 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 134 | 135 | # Moving average of location difference between consecutive transactions. 
136 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 137 | df_loc_delta_mavg = pd.DataFrame(cc_group) 138 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 139 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 140 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 142 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 143 | 144 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 145 | 146 | return window_aggs_df 147 | -------------------------------------------------------------------------------- /src/04-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object + datetime.timedelta(days=-728) 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
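Taken together, the generators above are meant to be called in a particular order: `create_profiles_as_df` annotates each entry in the `credit_cards` list with an `age` field, and `create_transactions_as_df` later relies on that field when it picks fraud-susceptible cards. A minimal usage sketch, not part of the repository, showing how these functions appear intended to compose (variable names here are illustrative):

# Hypothetical usage sketch; the function names come from sml/synthetic_data.py above.
from sml import synthetic_data

# 1. Synthetic Visa card numbers plus expiry dates.
credit_cards = synthetic_data.generate_list_credit_card_numbers()
credit_cards_df = synthetic_data.create_credit_cards_as_df(credit_cards)

# 2. Card-holder profiles; as a side effect, each dict in `credit_cards` gains an 'age' key.
profiles_df = synthetic_data.create_profiles_as_df(credit_cards)

# 3. Purchases and ATM withdrawals, with a small fraction of injected fraudulent chains.
trans_df = synthetic_data.create_transactions_as_df(credit_cards)

print(credit_cards_df.shape, profiles_df.shape, trans_df.shape)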
/src/05-module/pytest-workflow.yml: -------------------------------------------------------------------------------- 1 | name: pytest-workflow 2 | 3 | on: 4 | push 5 | #workflow_dispatch: 6 | 7 | jobs: 8 | test_schedule: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: checkout repo content 12 | uses: actions/checkout@v2 13 | 14 | - name: setup python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.8.9' 18 | 19 | - name: install python packages 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -r requirements.txt 23 | pip install pytest 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: cd src/05-module && python -m pytest 29 | 30 | -------------------------------------------------------------------------------- /src/05-module/scripts/run-fraud-feature-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/05-module 6 | 7 | jupyter nbconvert --to notebook --execute 2_cc_feature_pipeline_with_ge.ipynb 8 | 9 | -------------------------------------------------------------------------------- /src/05-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | # def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | # """Used only in feature pipelines (not online inference). 10 | # Unit test with DataFrames and sample data. 11 | # """ 12 | # age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | # trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | # return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | if isinstance(long, pd.Series): 32 | long = long.map(lambda x: (x)) 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | lat = lat.map(lambda x: (x)) 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(long, pd.Series): 42 | prev_long = prev_long.map(lambda x: (x)) 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(lat, pd.Series): 47 | prev_lat = prev_lat.map(lambda x: (x)) 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute time difference between each consecutive transaction.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 83 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 84 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 95 | 96 | # Convert time_delta from seconds to days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | # , "age_at_transaction" 100 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country" \ 101 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 102 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 103 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 104 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 105 | return trans_df 106 | 107 | 108 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 109 | 110 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 111 | 112 | # Moving average of transaction volume. 113 | df_mavg = pd.DataFrame(cc_group.mean()) 114 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 115 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 116 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 117 | df_mavg = df_mavg.sort_index() 118 | 119 | # Moving standard deviation of transaction volume. 120 | df_std = pd.DataFrame(cc_group.mean()) 121 | df_std.columns = ["trans_volume_mstd", "datetime"] 122 | df_std = df_std.reset_index(level=["cc_num"]) 123 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 124 | df_std = df_std.fillna(0) 125 | df_std = df_std.sort_index() 126 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 127 | 128 | # Moving average of transaction frequency. 129 | df_count = pd.DataFrame(cc_group.mean()) 130 | df_count.columns = ["trans_freq", "datetime"] 131 | df_count = df_count.reset_index(level=["cc_num"]) 132 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 133 | df_count = df_count.sort_index() 134 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 135 | 136 | # Moving average of location difference between consecutive transactions. 
137 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 138 | df_loc_delta_mavg = pd.DataFrame(cc_group) 139 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 140 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 142 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 143 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 144 | 145 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 146 | 147 | return window_aggs_df 148 | -------------------------------------------------------------------------------- /src/05-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object + datetime.timedelta(days=-728) 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
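The docstrings in `cc_features.py` above suggest unit-testing the pure feature functions with small, hand-built DataFrames, while `test_sml.py` below only exercises `generate_atm_withdrawal`. The following is a hedged sketch of what such an additional test could look like; the function names come from `sml/cc_features.py`, the import style mirrors the existing test, and placing it in `test_sml/` is an assumption:

# Sketch only: an illustrative extra test, not present in the repository.
from math import pi, isclose

import pandas as pd

from sml import cc_features


def test_haversine_distance_scalars():
    # Identical coordinates give a central angle of zero.
    assert cc_features.haversine_distance(10.0, 50.0, 10.0, 50.0) == 0.0
    # Two points on the equator, 180 degrees apart, give a central angle of pi radians.
    assert isclose(cc_features.haversine_distance(0.0, 0.0, 180.0, 0.0), pi)


def test_expiry_days():
    trans_df = pd.DataFrame({"cc_num": [1], "datetime": [pd.Timestamp("2022-01-01")]})
    cards_df = pd.DataFrame({"cc_num": [1], "expires": ["03/22"]})
    out = cc_features.expiry_days(trans_df, cards_df)
    # "03/22" is parsed as 2022-03-01, i.e. 59 days after 2022-01-01.
    assert out["days_until_card_expires"].iloc[0] == 59.0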
/src/05-module/test_sml/test_sml.py: -------------------------------------------------------------------------------- 1 | from sml import synthetic_data 2 | from unittest import TestCase 3 | import pytest 4 | from contextlib import nullcontext as does_not_raise 5 | 6 | @pytest.mark.parametrize( 7 | "credit_card_number, cash_amounts, length, delta, radius, country_code, excp", 8 | [("1111 2222 3333 4444",[112.10, 11.23], 1, 1, 10.0, 'US', does_not_raise()) 9 | ,("1111 2222 3333 44",[-12.00], -1, 1, 1.0, 'IE', pytest.raises(Exception))] 10 | ) 11 | def test_generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 12 | delta: int, radius: float, country_code, excp): 13 | with excp: 14 | synthetic_data.generate_atm_withdrawal(credit_card_number, cash_amounts, length, delta, radius, country_code) 15 | 16 | -------------------------------------------------------------------------------- /src/06-module/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Jim Dowling 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/06-module/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Directory Structure 4 | 5 | 6 | ├── LICENSE 7 | ├── README.md <- README explains this Python module to both developers and users. 8 | │ 9 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), 10 | │ └── my_module <- A symbolic link to the 'my_module' directory 11 | │ On Linux/Mac: cd notebooks ; ln -s ../my_module . 12 | │ 13 | ├── requirements.txt <- The requirements file for creating the Python environment. Install in a venv/conda environment. 14 | │ `conda activate my_env` 15 | │ my_env> `pip install -r requirements.txt` 16 | │ 17 | ├── setup.py <- Make this project pip installable with `pip install -e` 18 | ├── my_module <- Source code for this project. 19 | │ ├── __init__.py <- Makes a Python module 20 | │ │ 21 | │ ├── pipelines <- Feature pipelines, training pipelines, batch inference pipelines. 
22 | │ │ │── feature_pipeline.py 23 | │ │ │── training_pipeline.py 24 | │ │ └── batch_inference_pipeline.py 25 | │ │ 26 | │ ├── features <- Python modules to turn raw data into features for use in both training and inference 27 | │ │ └── my_features.py 28 | │ │ 29 | │ ├── transformations<- Python modules with model-specific transformation functions 30 | │ │ └── my_transformations.py 31 | │ │ 32 | │ ├── tests <- Pytest unit tests for feature logic 33 | │ │ └── test_features.py 34 | │ │ 35 | │ ├── pipeline_tests <- Pytest to run end-to-end tests for pipelines 36 | │ │ └── test_feature_pipelines.py 37 | │ │ 38 | │ └── visualization <- Scripts to create exploratory and results oriented visualizations 39 | │ └── eda_visualize.py 40 | │ 41 | └── scripts <- Bash scripts for the project 42 | -------------------------------------------------------------------------------- /src/06-module/notebooks/predict_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import hsfs 4 | import joblib 5 | 6 | class Predict(object): 7 | 8 | def __init__(self): 9 | """ Initializes the serving state, reads a trained model""" 10 | # load the trained model 11 | self.model = joblib.load(os.environ["ARTIFACT_FILES_PATH"] + "/xgboost.pkl") 12 | print("Initialization Complete") 13 | 14 | def predict(self, inputs): 15 | """ Serves a prediction request usign a trained model""" 16 | return self.model.predict(np.asarray(inputs).reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable 17 | 18 | -------------------------------------------------------------------------------- /src/06-module/notebooks/xgboost.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/featurestoreorg/serverless-ml-course/fda768df9b02b92c864cd1258f8e58085c82576a/src/06-module/notebooks/xgboost.pkl -------------------------------------------------------------------------------- /src/06-module/requirements.txt: -------------------------------------------------------------------------------- 1 | hopsworks 2 | -------------------------------------------------------------------------------- /src/06-module/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='sml', 5 | version='0.1', 6 | packages=['sml',], 7 | license='Apache v2', 8 | long_description='Serverless Machine Learning', 9 | ) 10 | -------------------------------------------------------------------------------- /src/06-module/sml/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | # 13 | 14 | 15 | __version__ = "0.1" 16 | -------------------------------------------------------------------------------- /src/06-module/sml/features/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | profiles_df = age_df[["name", "sex", "mail", "birthdate", "City", "Country", "cc_num", "datetime", "month"]] 15 | return trans_df, profiles_df 16 | 17 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 18 | """Used only in feature pipelines (not online inference). 19 | Unit test with DataFrames and sample data. 20 | """ 21 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 22 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 23 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 24 | return trans_df 25 | 26 | 27 | # + 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute Haversine distance between each consecutive coordinate in (long, lat).""" 30 | 31 | # if long > 180 or prev_long > 180: 32 | # raise Exception('longitude cannot be greater than 180') 33 | 34 | # if lat > 90 or prev_lat > 90: 35 | # raise Exception('latitude cannot be greater than 90') 36 | 37 | # if long < -180 or prev_long < -180: 38 | # raise Exception('longitude cannot be less than -180') 39 | 40 | # if lat < -90 or prev_lat < -90: 41 | # raise Exception('latitude cannot be less than -90') 42 | 43 | if isinstance(long, pd.Series): 44 | long = long.map(lambda x: radians(x)) 45 | else: 46 | long = radians(long) 47 | 48 | if isinstance(lat, pd.Series): 49 | lat = lat.map(lambda x: radians(x)) 50 | else: 51 | lat = radians(lat) 52 | 53 | if isinstance(long, pd.Series): 54 | prev_long = prev_long.map(lambda x: radians(x)) 55 | else: 56 | prev_long = radians(prev_long) 57 | 58 | if isinstance(lat, pd.Series): 59 | prev_lat = prev_lat.map(lambda x: radians(x)) 60 | else: 61 | prev_lat = radians(prev_lat) 62 | 63 | long_diff = prev_long - long 64 | lat_diff = prev_lat - lat 65 | 66 | a = np.sin(lat_diff/2.0)**2 67 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 68 | c = 2*np.arcsin(np.sqrt(a + b)) 69 | 70 | return c 71 | 72 | 73 | # - 74 | 75 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 76 | """Compute time difference between each consecutive transaction.""" 77 | return prev_datetime - current_datetime 78 | 79 | def time_delta_to_days(time_delta: datetime)-> float: 80 | """.""" 81 | return time_delta.total_seconds() / 86400 82 | 83 | def date_to_timestamp(date_obj: datetime)-> int: 84 | return int(date_obj.timestamp() * 1000) 85 | 86 | def timestamp_to_date(timestamp: int)-> datetime: 87 | return datetime.fromtimestamp(timestamp // 1000) 88 | 89 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 90 | 91 | # Convert coordinates into radians: 92 | trans_df[["longitude", "latitude"]] = 
trans_df[["longitude", "latitude"]].applymap(radians) 93 | 94 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 95 | 96 | # When we call `haversine_distance`, we want to pass as params, the long/lat of the current row, and the long/lat of the most 97 | # recent prior purchase. By grouping the DF by cc_num, apart from the first transaction (which will be NaN and we fill that with 0 at the end), 98 | # we can access the previous lat/long using Panda's `shift` operation, which gives you the previous row (long/lang). 99 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 100 | .apply(lambda x :haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 101 | .reset_index(level=0, drop=True)\ 102 | .fillna(0) 103 | 104 | # Use the same `shift` operation in Pandas to get the previous row for a given cc_number 105 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 106 | .apply(lambda x : time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 107 | .reset_index(level=0, drop=True) 108 | # .fillna(0) # handle the first datetime, which has no previous row when you call `shift` 109 | 110 | # Convert time_delta from seconds to days 111 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 112 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 113 | trans_df = trans_df[["tid","datetime", "month", "cc_num","category", "amount", "city", "country", "age_at_transaction"\ 114 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 115 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 116 | # on your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 117 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 118 | return trans_df 119 | 120 | 121 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 122 | 123 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 124 | 125 | # Moving average of transaction volume. 126 | df_mavg = pd.DataFrame(cc_group.mean()) 127 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 128 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 129 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 130 | df_mavg = df_mavg.sort_index() 131 | 132 | # Moving standard deviation of transaction volume. 133 | df_std = pd.DataFrame(cc_group.mean()) 134 | df_std.columns = ["trans_volume_mstd", "datetime"] 135 | df_std = df_std.reset_index(level=["cc_num"]) 136 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 137 | df_std = df_std.fillna(0) 138 | df_std = df_std.sort_index() 139 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 140 | 141 | # Moving average of transaction frequency. 142 | df_count = pd.DataFrame(cc_group.mean()) 143 | df_count.columns = ["trans_freq", "datetime"] 144 | df_count = df_count.reset_index(level=["cc_num"]) 145 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 146 | df_count = df_count.sort_index() 147 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 148 | 149 | # Moving average of location difference between consecutive transactions. 
150 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 151 | df_loc_delta_mavg = pd.DataFrame(cc_group) 152 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 153 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 154 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 155 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 156 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 157 | 158 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime", "month"]].sort_index(),left_index=True, right_index=True) 159 | 160 | return window_aggs_df 161 | -------------------------------------------------------------------------------- /src/06-module/sml/features/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | # pip install faker 5 | 6 | from collections import defaultdict 7 | from faker import Faker 8 | import pandas as pd 9 | import numpy as np 10 | import datetime 11 | import hashlib 12 | import random 13 | import math 14 | import os 15 | import bisect 16 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 17 | 18 | # Seed for Reproducibility 19 | faker = Faker() 20 | faker.seed_locale('en_US', 0) 21 | 22 | 23 | def set_random_seed(seed: int): 24 | random.seed(seed) 25 | np.random.seed(seed) 26 | faker.seed_instance(seed) 27 | 28 | set_random_seed(12345) 29 | 30 | 31 | TOTAL_UNIQUE_USERS = 1000 32 | TOTAL_UNIQUE_TRANSACTIONS = 54000 33 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 34 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 35 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 36 | NORMAL_ATM_RADIUS = 0.01 37 | 38 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 39 | END_DATE = datetime.datetime.now().strftime(DATE_FORMAT) 40 | START_DATE = (datetime.datetime.now() - datetime.timedelta(days=30*6)).strftime(DATE_FORMAT) 41 | 42 | 43 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 44 | 0.05: (0.01, 1.01), 45 | 0.075: (1, 11.01), 46 | 0.525: (10, 100.01), 47 | 0.25: (100, 1000.01), 48 | 0.099: (1000, 10000.01), 49 | 0.001: (10000, 30000.01) 50 | } 51 | 52 | CATEGORY_PERC_PRICE = { 53 | "Grocery": (0.5, 0.01, 100), 54 | "Restaurant/Cafeteria": (0.2, 1, 100), 55 | "Health/Beauty": (0.1, 10, 500.01), 56 | "Domestic Transport": (0.1, 10, 100.01), 57 | "Clothing": (0.05, 10, 2000.01), 58 | "Electronics": (0.02, 100, 10000.01), 59 | "Sports/Outdoors": (0.015, 10, 100.01), 60 | "Holliday/Travel": (0.014, 10, 100.01), 61 | "Jewelery": (0.001, 10, 100.01) 62 | } 63 | 64 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 65 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 66 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 67 | 68 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 69 | 0.055: (17, 24), 70 | 0.0015: (24, 34), 71 | 0.0015: (34, 44), 72 | 0.02: (44, 54), 73 | 0.022: (54, 64), 74 | 0.1: (64, 74), 75 | 0.40: (74, 84), 76 | 0.40: (84, 100), 77 | } 78 | 79 | 80 | def date_to_year_month(date_obj: datetime)-> datetime.date: 81 | return date_obj.strftime('%Y-%m') 82 | 83 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 84 | """.""" 85 | cc_ids = set() 86 | for _ in range(n): 87 | cc_id = faker.credit_card_number(card_type='visa') 88 | cc_ids.add(cc_id) 89 | return pd.Series(list(cc_ids)) 90 | 91 | # write a pytest - assert len(credit_card_numbers) == 
TOTAL_UNIQUE_USERS 92 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 93 | 94 | def generate_list_credit_card_numbers() -> list: 95 | """.""" 96 | credit_cards = [] 97 | credit_card_numbers = generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 98 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 99 | delta_time_object + datetime.timedelta(days=-728) 100 | for cc_num in credit_card_numbers: 101 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 102 | return credit_cards 103 | 104 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 105 | """.""" 106 | profiles = [] 107 | for credit_card in credit_cards: 108 | address = faker.local_latlng(country_code = 'US') 109 | age = 0 110 | profile = None 111 | while age < 18 or age > 100: 112 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 113 | dday = profile['birthdate'] 114 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 115 | age = int(delta.days / 365) 116 | profile['City'] = address[2] 117 | profile['Country'] = address[3] 118 | profile['cc_num'] = credit_card['cc_num'] 119 | credit_card['age'] = age 120 | profiles.append(profile) 121 | 122 | # Cast the columns to the correct Pandas DType 123 | profiles_df = pd.DataFrame.from_records(profiles) 124 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 125 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 126 | 127 | return profiles_df 128 | 129 | # pyasset - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 130 | def generate_timestamps(n: int) -> list: 131 | """Return a list of timestamps of length 'n'.""" 132 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 133 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 134 | timestamps = list() 135 | for _ in range(n): 136 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 137 | timestamps.append(timestamp) 138 | timestamps = sorted(timestamps) 139 | return timestamps 140 | 141 | def get_random_transaction_amount(start: float, end: float) -> float: 142 | """.""" 143 | amt = round(np.random.uniform(start, end), 2) 144 | return amt 145 | 146 | def generate_amounts() -> list: 147 | """.""" 148 | amounts = [] 149 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 150 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 151 | start, end = span 152 | for _ in range(n): 153 | amounts.append(get_random_transaction_amount(start, end+1)) 154 | return amounts 155 | 156 | def generate_categories(amounts) -> list: 157 | """.""" 158 | categories = [] 159 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 160 | percentage, min_price, max_price = category_perc_price 161 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 162 | for _ in range(n): 163 | min_price_i = bisect.bisect_left(amounts, min_price) 164 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 165 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 166 | 167 | random.shuffle(categories) 168 | return categories 169 | 170 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 171 | """.""" 172 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 173 | hexdigest = 
hashlib.md5(hashable.encode('utf-8')).hexdigest() 174 | return hexdigest 175 | 176 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 177 | """.""" 178 | transactions = [] 179 | for timestamp, category in zip(timestamps, categories): 180 | credit_card_number = random.choice(credit_card_numbers) 181 | point_of_tr = faker.local_latlng(country_code = 'US') 182 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 183 | transactions.append({ 184 | 'tid': transaction_id, 185 | 'datetime': timestamp, 186 | 'cc_num': credit_card_number, 187 | 'category': category['category'], 188 | 'amount': category['amount'], 189 | 'latitude': point_of_tr[0], 190 | 'longitude': point_of_tr[1], 191 | 'city': point_of_tr[2], 192 | 'country': point_of_tr[3], 193 | 'fraud_label': 0 194 | } 195 | ) 196 | return transactions 197 | 198 | def generate_cash_amounts() -> list: 199 | """.""" 200 | cash_amounts = [] 201 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 202 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 203 | start, end = span 204 | for _ in range(n): 205 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 206 | return cash_amounts 207 | 208 | def generate_chains(): 209 | """.""" 210 | visited = set() 211 | chains = defaultdict(list) 212 | 213 | def size(chains: dict) -> int: 214 | counts = {key: len(values)+1 for (key, values) in chains.items()} 215 | return sum(counts.values()) 216 | 217 | 218 | def generate_attack_chain(i: int): 219 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 220 | for j in range(1, chain_length): 221 | if i+j not in visited: 222 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 223 | break 224 | chains[i].append(i+j) 225 | visited.add(i+j) 226 | 227 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 228 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 229 | if i not in visited: 230 | generate_attack_chain(i) 231 | visited.add(i) 232 | return chains 233 | 234 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 235 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 236 | """.""" 237 | atms = [] 238 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 239 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 240 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 241 | point_of_tr = faker.local_latlng(country_code = country_code) 242 | latitude = point_of_tr[0] 243 | longitude = point_of_tr[1] 244 | city = point_of_tr[2] 245 | for _ in range(length): 246 | current = timestamp - datetime.timedelta(hours=delta) 247 | if radius is not None: 248 | latitude = faker.coordinate(latitude, radius) 249 | longitude = faker.coordinate(longitude, radius) 250 | amount = random.sample(cash_amounts, 1)[0] 251 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 252 | atms.append({'tid': transaction_id, 253 | 'datetime': current.strftime(DATE_FORMAT), 254 | 'cc_num': credit_card_number, 255 | 'category': 'Cash Withdrawal', 256 | 'amount': amount, 257 | 'latitude': latitude, 258 | 'longitude': longitude, 259 | 'city': city, 260 | 'country': 'US', 261 | 'fraud_label': 0 262 | }) 263 | timestamp = current 264 | return atms 265 | 266 | def generate_susceptible_cards(credit_cards: list) -> list: 267 | """.""" 268 | susceptible_cards = [] 269 | visited_cards = [] 270 | for percentage, span in 
SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 271 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 272 | start, end = span 273 | for _ in range(n): 274 | for card in credit_cards: 275 | if card['age'] > start and card['age'] < end: 276 | if card['cc_num'] not in visited_cards: 277 | current = card 278 | visited_cards.append(card['cc_num']) 279 | break 280 | else: 281 | current = None 282 | if current is not None: 283 | susceptible_cards.append(current) 284 | return susceptible_cards 285 | 286 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 287 | """.""" 288 | normal_atm_withdrawals = [] 289 | atm_transactions = len(cash_amounts) 290 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 291 | atm_count = 0 292 | while atm_count < atm_transactions: 293 | for card in cash_withdrawal_cards: 294 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 295 | # interval in hours between normal cash withdrawals 296 | delta = random.randint(6, 168) 297 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 298 | normal_atm_withdrawals.append(atm_tr) 299 | atm_count += ATM_WITHRAWAL_SEQ 300 | return normal_atm_withdrawals 301 | 302 | 303 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 304 | """.""" 305 | timestamps = [] 306 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 307 | for _ in range(chain_length): 308 | # interval in seconds between fraudulent attacks 309 | delta = random.randint(30, 120) 310 | current = timestamp + datetime.timedelta(seconds=delta) 311 | timestamps.append(current.strftime(DATE_FORMAT)) 312 | timestamp = current 313 | return timestamps 314 | 315 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 316 | """.""" 317 | amounts = [] 318 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 319 | n = math.ceil(chain_length * percentage) 320 | start, end = span 321 | for _ in range(n): 322 | amounts.append(get_random_transaction_amount(start, end+1)) 323 | return amounts[:chain_length] 324 | 325 | 326 | def update_transactions(transactions: list, chains: list) -> list: 327 | """.""" 328 | for key, chain in chains.items(): 329 | transaction = transactions[key] 330 | timestamp = transaction['datetime'] 331 | cc_num = transaction['cc_num'] 332 | amount = transaction['amount'] 333 | transaction['fraud_label'] = 1 334 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 335 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 336 | random.shuffle(inject_amounts) 337 | for i, idx in enumerate(chain): 338 | original_transaction = transactions[idx] 339 | inject_timestamp = inject_timestamps[i] 340 | original_transaction['datetime'] = inject_timestamp 341 | original_transaction['fraud_label'] = 1 342 | original_transaction['cc_num'] = cc_num 343 | original_transaction['amount'] = inject_amounts[i] 344 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 345 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 346 | transactions[idx] = original_transaction 347 | 348 | def 
generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 349 | """.""" 350 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 351 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 352 | 353 | def update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 354 | cash_amounts: list): 355 | """.""" 356 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 357 | # interval in seconds between fraudulent attacks 358 | delta = random.randint(1, 5) 359 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 360 | pre_fraudulent_atm_tr = atm_withdrawal[0] 361 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 362 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 363 | fraudulent_atm_location = faker.location_on_land() 364 | while fraudulent_atm_location[3] == 'US': 365 | fraudulent_atm_location = faker.location_on_land() 366 | 367 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 368 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 369 | 370 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 371 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 372 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 373 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 374 | fraudulent_atm_tr['fraud_label'] = 1 375 | atm_withdrawal.append(fraudulent_atm_tr) 376 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 377 | 378 | 379 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 380 | """.""" 381 | for atm_withdrawal in normal_atm_withdrawals: 382 | for withdrawal in atm_withdrawal: 383 | transactions.append(withdrawal) 384 | return pd.DataFrame.from_records(transactions) 385 | 386 | 387 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 388 | """.""" 389 | df = pd.DataFrame.from_records(credit_cards) 390 | # Cast the columns to the correct Pandas DType 391 | df['cc_num']= pd.to_numeric(df['cc_num']) 392 | return df 393 | 394 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | profiles_df = generate_df_with_profiles(credit_cards) 397 | return profiles_df 398 | 399 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 400 | """.""" 401 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 402 | amounts = generate_amounts() 403 | categories = generate_categories(amounts) 404 | cc_df = create_credit_cards_as_df(credit_cards) 405 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 406 | cash_amounts = generate_cash_amounts() 407 | chains = generate_chains() 408 | susceptible_cards = generate_susceptible_cards(credit_cards) 409 | 410 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 411 | update_transactions(transactions, chains) 412 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 413 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 414 | 415 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 416 | transactions_df["datetime"] = transactions_df.datetime.map(lambda x: datetime.datetime.strptime(x, DATE_FORMAT)) 417 | transactions_df["month"] = transactions_df.datetime.map(lambda x: date_to_year_month(x)) 418 | 419 | # Cast the columns to the 
correct Pandas DType 420 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 421 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 422 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 423 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 424 | 425 | fraud_labels = transactions_df[["tid", "cc_num", "datetime", "month", "fraud_label"]] 426 | transactions_df = transactions_df.drop(columns=["fraud_label"]) 427 | return transactions_df, fraud_labels 428 | -------------------------------------------------------------------------------- /src/06-module/sml/pipelines/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from math import radians 3 | from sml import cc_features 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import plotly.express as px 8 | from matplotlib import pyplot 9 | import warnings 10 | 11 | import hopsworks 12 | from sml import synthetic_data 13 | 14 | import streamlit as st 15 | 16 | import folium 17 | from streamlit_folium import st_folium 18 | import json 19 | 20 | time_now = int(datetime.datetime.now().timestamp() * 1000) 21 | synthetic_data.set_random_seed(12345) 22 | credit_cards = [cc["cc_num"] for cc in synthetic_data.generate_list_credit_card_numbers()] 23 | lat = 0 24 | long = 0 25 | 26 | warnings.filterwarnings("ignore") 27 | 28 | 29 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 30 | def retrive_dataset(): 31 | st.write(36 * "-") 32 | print_fancy_header('\n💾 Dataset Retrieving...') 33 | feature_view = fs.get_feature_view("transactions_fraud_online_fv", 1) 34 | batch_data = feature_view.get_batch_data() 35 | return batch_data 36 | 37 | 38 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 39 | def get_feature_views(): 40 | fv = fs.get_feature_view("transactions_fraud_online_fv", 1) 41 | latest_record_fv = fs.get_feature_view("latest_recorded_transactions_fraud_online_fv", 1) 42 | return fv, latest_record_fv 43 | 44 | 45 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 46 | def get_deployment(project): 47 | mr = project.get_model_registry() 48 | ms = project.get_model_serving() 49 | deployment = ms.get_deployment("fraudonlinemodeldeployment") 50 | return deployment 51 | 52 | 53 | def explore_data(): 54 | st.write(36 * "-") 55 | print_fancy_header('\n👁 Data Exploration...') 56 | labels = ["Normal", "Fraudulent"] 57 | unique, counts = np.unique(test_mar_y.fraud_label.values, return_counts=True) 58 | values = counts.tolist() 59 | 60 | def plot_pie(values, labels): 61 | fig = px.pie(values=values, names=labels, title='Distribution of fraud transactions') 62 | return fig 63 | 64 | fig1 = plot_pie(values, labels) 65 | st.plotly_chart(fig1) 66 | 67 | 68 | def process_input_vector(cc_num, current_datetime, amount, long, lat): 69 | long = radians(long) 70 | lat = radians(lat) 71 | 72 | current_coordinates = pd.DataFrame({ 73 | "datetime": [int(current_datetime)], 74 | "cc_num": [cc_num], 75 | "latitude": [lat], 76 | "longitude": [long] 77 | 78 | }) 79 | 80 | # get fv for the latest recorded transactions 81 | latest_record_vector = latest_record_fv.get_feature_vector({"cc_num": cc_num}) 82 | # compute deltas between previous and current 83 | loc_delta_t_minus_1 = cc_features.haversine_distance(long=long, lat=lat, prev_long=latest_record_vector[3], 84 | prev_lat=latest_record_vector[2]) 85 | time_delta_t_minus_1 = 
cc_features.time_delta(cc_features.timestamp_to_date(latest_record_vector[0]), 86 | cc_features.timestamp_to_date(current_datetime)) 87 | time_delta_t_minus_1 = cc_features.time_delta_to_days(time_delta_t_minus_1) 88 | # get all features 89 | feature_vector = fv.get_feature_vector({"cc_num": cc_num}, 90 | passed_features={"amount": amount, 91 | "loc_delta_t_minus_1": loc_delta_t_minus_1, 92 | "time_delta_t_minus_1": time_delta_t_minus_1}) 93 | 94 | # drop extra features 95 | indexes_to_remove = [0, 1] 96 | return {"inputs": [i for j, i in enumerate(feature_vector) if j not in indexes_to_remove]}, current_coordinates 97 | 98 | 99 | def print_fancy_header(text, font_size=24): 100 | res = f'{text}' 101 | st.markdown(res, unsafe_allow_html=True) 102 | 103 | 104 | progress_bar = st.sidebar.header('⚙️ Working Progress') 105 | progress_bar = st.sidebar.progress(0) 106 | st.title('🆘 Fraud transactions detection 🆘') 107 | 108 | st.write(36 * "-") 109 | print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...') 110 | 111 | project = hopsworks.login() 112 | fs = project.get_feature_store() 113 | progress_bar.progress(15) 114 | 115 | st.write(36 * "-") 116 | print_fancy_header('\n🤖 Connecting to Model Registry on Hopsworks...') 117 | deployment = get_deployment(project) 118 | deployment.start() 119 | st.write("✅ Connected!") 120 | 121 | progress_bar.progress(40) 122 | 123 | st.write(36 * "-") 124 | print_fancy_header('\n✨ Feature view retrieving...') 125 | fv, latest_record_fv = get_feature_views() 126 | st.write("✅ Retrieved!") 127 | 128 | progress_bar.progress(55) 129 | 130 | st.write(36 * "-") 131 | print_fancy_header('\n🧠 On the map below, select the location of the ATM machine') 132 | with st.form(key="Selecting cc_num"): 133 | cc_num = st.selectbox( 134 | 'Select a credit card number.', 135 | (credit_cards) 136 | ) 137 | 138 | amount = st.slider( 139 | '💶 Select withdrawal amount', 140 | 5, 1000) 141 | 142 | # my_map = folium.Map(location=[41, -73.5], zoom_start=8) 143 | my_map = folium.Map(location=[52, 24], zoom_start=3) 144 | 145 | my_map.add_child(folium.LatLngPopup()) 146 | folium.TileLayer('Stamen Terrain').add_to(my_map) 147 | folium.TileLayer('Stamen Toner').add_to(my_map) 148 | folium.TileLayer('Stamen Water Color').add_to(my_map) 149 | folium.TileLayer('cartodbpositron').add_to(my_map) 150 | folium.TileLayer('cartodbdark_matter').add_to(my_map) 151 | folium.LayerControl().add_to(my_map) 152 | 153 | res_map = st_folium(my_map, height=300, width=600) 154 | 155 | try: 156 | lat, long = res_map["last_clicked"]["lat"], res_map["last_clicked"]["lng"] 157 | 158 | print_fancy_header("🏧 Withdrawal coordinates:") 159 | st.write(f"Latitude: {lat}") 160 | st.write(f"Longitude: {long}") 161 | except Exception as err: 162 | print(err) 163 | pass 164 | 165 | submit_button = st.form_submit_button(label='Withdraw') 166 | 167 | progress_bar.progress(70) 168 | 169 | st.write(36 * "-") 170 | 171 | # run the code below if the deployment doesn't work 172 | # print_fancy_header("Initialise serving...") 173 | # fv.init_serving(1) 174 | # time_now = int(datetime.datetime.now().timestamp()*1000) 175 | 176 | data, current_coordinates = process_input_vector(cc_num=int(cc_num), 177 | current_datetime=int(time_now), 178 | amount=amount, 179 | lat=lat, long=long) 180 | 181 | if st.button('📊 Make a prediction'): 182 | res = deployment.predict(data) 183 | progress_bar.progress(80) 184 | negative = "**👌 Not a suspicious**" 185 | positive = "**🆘 Fraudulent**" 186 | res = negative if res["predictions"][0] == 0 else positive 187 |
print_fancy_header(res + " transaction!") 188 | progress_bar.progress(100) 189 | deployment.stop() 190 | st.write(36 * "-") 191 | st.write("Stopping the deployment...") 192 | st.write("") 193 | st.write('\n🎉 📈 🤝 App Finished Successfully 🤝 📈 🎉') 194 | 195 | # update fg 196 | latest_recorded_transactions_fraud_online_fg = fs.get_or_create_feature_group( 197 | name="latest_recorded_transactions_fraud_online", 198 | version=1 199 | ) 200 | latest_recorded_transactions_fraud_online_fg.insert(current_coordinates) 201 | 202 | st.button("Re-run") 203 | -------------------------------------------------------------------------------- /src/06-module/sml/pipelines/streamlit_batch_app.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import joblib 3 | from math import radians 4 | from sml import cc_features 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import plotly.express as px 9 | from matplotlib import pyplot 10 | import warnings 11 | 12 | import hopsworks 13 | from sml import synthetic_data 14 | 15 | import streamlit as st 16 | 17 | import folium 18 | from streamlit_folium import st_folium 19 | import json 20 | 21 | time_now = int(datetime.datetime.now().timestamp() * 1000) 22 | start_date = (datetime.datetime.now() - datetime.timedelta(hours=24)).date() 23 | end_date = (datetime.datetime.now()).date() 24 | synthetic_data.set_random_seed(12345) 25 | credit_cards = [cc["cc_num"] for cc in synthetic_data.generate_list_credit_card_numbers()] 26 | lat = 0 27 | long = 0 28 | 29 | warnings.filterwarnings("ignore") 30 | 31 | project = hopsworks.login() 32 | fs = project.get_feature_store() 33 | 34 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 35 | def retrive_dataset(start_date, end_date): 36 | st.write(36 * "-") 37 | print_fancy_header('\n💾 Dataset Retrieving...') 38 | feature_view = fs.get_feature_view("transactions_fraud_online_fv", 1) 39 | batch_data = feature_view.get_batch_data(start_time = start_date, end_time = end_date) 40 | batch_data.drop(["cc_num", "datetime"], axis = 1, inplace=True) 41 | return batch_data 42 | 43 | 44 | @st.cache(suppress_st_warning=True, allow_output_mutation=True) 45 | def get_feature_views(): 46 | fv = fs.get_feature_view("transactions_fraud_online_fv", 1) 47 | latest_record_fv = fs.get_feature_view("latest_recorded_transactions_fraud_online_fv", 1) 48 | return fv, latest_record_fv 49 | 50 | 51 | @st.cache(allow_output_mutation=True,suppress_st_warning=True) 52 | def get_model(project = project): 53 | mr = project.get_model_registry() 54 | model = mr.get_model("transactions_fraud_online_xgboost", version = 1) 55 | model_dir = model.download() 56 | return joblib.load(model_dir + "/xgboost.pkl") 57 | 58 | def explore_data(batch_data): 59 | st.write(36 * "-") 60 | print_fancy_header('\n👁 Data Exploration...') 61 | labels = ["Normal", "Fraudulent"] 62 | unique, counts = np.unique(batch_data.fraud.values, return_counts=True) 63 | values = counts.tolist() 64 | 65 | def plot_pie(values, labels): 66 | fig = px.pie(values=values, names=labels, title='Distribution of predicted fraud transactions') 67 | return fig 68 | 69 | fig1 = plot_pie(values, labels) 70 | st.plotly_chart(fig1) 71 | 72 | 73 | def print_fancy_header(text, font_size=24): 74 | res = f'{text}' 75 | st.markdown(res, unsafe_allow_html=True) 76 | 77 | def transform_preds(predictions): 78 | return ['Fraud' if pred == 1 else 'Not Fraud' for pred in predictions] 79 | 80 | progress_bar = st.sidebar.header('⚙️ Working Progress') 81 | 
progress_bar = st.sidebar.progress(0) 82 | st.title('🆘 Fraud transactions detection 🆘') 83 | 84 | st.write(36 * "-") 85 | print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...') 86 | 87 | st.write(36 * "-") 88 | print_fancy_header('\n🤖 Connecting to Model Registry on Hopsworks...') 89 | model = get_model(project) 90 | st.write(model) 91 | st.write("✅ Connected!") 92 | 93 | progress_bar.progress(40) 94 | 95 | st.write(36 * "-") 96 | print_fancy_header('\n✨ Fetch batch data and predict') 97 | fv, latest_record_fv = get_feature_views() 98 | 99 | batch_data = retrive_dataset(start_date, end_date) 100 | st.write("✅ Retrieved!") 101 | progress_bar.progress(55) 102 | 103 | if st.button('📊 Make a prediction'): 104 | predictions = model.predict(batch_data) 105 | predictions = transform_preds(predictions) 106 | batch_data_to_explore = batch_data.copy() 107 | batch_data_to_explore['fraud'] = predictions 108 | explore_data(batch_data_to_explore) 109 | 110 | st.button("Re-run") 111 | --------------------------------------------------------------------------------
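The comments in src/06-module/sml/features/synthetic_data.py ask for pytest checks on the generators ("assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS", "assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS"). Below is a minimal sketch of such tests; it assumes the generators are importable as `from sml import synthetic_data` (as the Streamlit apps do), and it is illustrative only, not the contents of the test_sml/test_sml.py files already in the repository.

# test_synthetic_data.py -- illustrative sketch, not the repo's existing test_sml/test_sml.py
from sml import synthetic_data


def test_generate_unique_credit_card_numbers():
    synthetic_data.set_random_seed(12345)
    cc_nums = synthetic_data.generate_unique_credit_card_numbers(synthetic_data.TOTAL_UNIQUE_USERS)
    # The generator builds a set, so Faker collisions could in principle return fewer numbers
    # than requested; the course comment expects exact equality.
    assert len(cc_nums) == synthetic_data.TOTAL_UNIQUE_USERS
    # Validate that the generated Visa numbers are 16 digits.
    assert all(len(str(cc)) == 16 for cc in cc_nums)


def test_generate_timestamps():
    synthetic_data.set_random_seed(12345)
    timestamps = synthetic_data.generate_timestamps(synthetic_data.TOTAL_UNIQUE_TRANSACTIONS)
    assert len(timestamps) == synthetic_data.TOTAL_UNIQUE_TRANSACTIONS
    # generate_timestamps sorts the '%Y-%m-%d %H:%M:%S' strings before returning them.
    assert timestamps == sorted(timestamps)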